scrapy创建规则爬虫CrawlSpider,抓取读书网书籍信息,保存到数据库和本地表格,并用ImagePipeline下载封面图
- scrapy genspider -t crawl dubook dushu.com
- # -*- coding: utf-8 -*-
- import scrapy
- from scrapy.linkextractors import LinkExtractor
- from scrapy.spiders import CrawlSpider, Rule
-
-
class DubookSpider(CrawlSpider):
    """Rule-based crawler for dushu.com book listings.

    Follows category links from the book index, paginates through
    listing pages (logged by ``parse_book``), and scrapes each book
    detail page into a plain dict item in ``parse_item``.
    """

    name = 'dubook'
    allowed_domains = ['dushu.com']
    start_urls = ['https://www.dushu.com/book/']

    # Placeholder stored when an expected field is absent from the page.
    EMPTY = "该项为空"

    rules = (
        # First-level category links, matched with a CSS selector.
        Rule(LinkExtractor(restrict_css='.sub-catalog'), follow=True),
        # First-level category links via regex (alternative):
        # Rule(LinkExtractor(allow=r'/book/100\d+?\.html'), follow=True),
        # For testing:
        # Rule(LinkExtractor(allow=r'/book/100[1-2]\.html'), follow=True),

        # Pagination links within a category.
        Rule(LinkExtractor(allow=r'/book/100\d+?_\d+?\.html'),
             callback='parse_book', follow=True),
        # For testing: 1001_1 1001_2 1002_1 1002_2
        # Rule(LinkExtractor(allow=r'/book/100[1-2]_[1-2]\.html'), callback='parse_book', follow=True),

        # Book detail page links (numeric id of 5+ digits).
        Rule(LinkExtractor(allow=r'/book/\d{5,}/'),
             callback='parse_item', follow=False),
    )

    def parse_book(self, response):
        """Log which category listing page is about to be processed."""
        # The page number is encoded in the URL as .../100X_<page>.html
        page_no = response.url.split('_')[-1].split('.')[0]
        title = response.xpath(
            '//div/div[@class="row"]/div/div/dl[@class="active"]/dt/text()').get()
        print("准备处理[%s]第%s页" % (title, page_no))

    def parse_item(self, response):
        """Scrape one book detail page and yield a dict item.

        Yields:
            dict with category titles, book metadata (name, author, tag,
            ISBN, price, summary), cover image URL, detail URL and book id.
        """
        item = {}
        book_name = response.xpath(
            '//div/div/div[@class="book-title"]/h1/text()').get() or self.EMPTY

        # Breadcrumb trail; the first two anchors are skipped, the remaining
        # entries are category levels followed by the book itself.
        navbar = response.xpath(
            '//div[@class="crumbs"]/a[position()>2]/text()').extract()
        length = len(navbar)
        if 2 <= length <= 5:
            levels = ("一", "二", "三", "四")
            print("该书籍%s级分类:%s" % (levels[length - 2], book_name))
            # Last breadcrumb entry is the book itself; pad missing
            # category levels with "-".
            cats = navbar[:length - 1] + ["-"] * (5 - length)
        else:
            print("该书籍分类异常:%s" % book_name)
            cats = ["-"] * 4
        (item['firstTitle'], item['secondTitle'],
         item['threeTitle'], item['fourTitle']) = cats

        # Author
        book_author = response.xpath(
            '//div/div/div[@class="book-details"]/div/table//tr[1]/td[2]/text()').get() or self.EMPTY
        # Tag
        book_tag = response.xpath(
            '//div/div/div[@class="book-details"]/div/table//tr[4]/td[2]/text()').get() or self.EMPTY
        # ISBN
        book_isbn = response.xpath(
            '//div/div/div[@class="book-details"]/table//tr[1]/td[2]/text()').get() or self.EMPTY
        # Price (page text carries a leading currency symbol)
        book_price = response.xpath(
            '//div/div/div[@class="book-details"]/div/p/span/text()').get() or self.EMPTY
        # Summary
        book_info = response.xpath(
            '//div/div/div[@class="book-summary"][1]/div/div/text()').get() or self.EMPTY
        # Cover image URL
        cover_img_url = response.xpath(
            '//div/div/div[@class="book-pic"]/div/img/@src').get() or self.EMPTY
        # Detail page URL; the book id is its last path segment (/book/<id>/)
        book_url = response.url
        book_id = book_url.split("/")[-2]

        item['book_id'] = book_id
        item['book_name'] = book_name
        item['book_author'] = book_author
        item['book_tag'] = book_tag
        item['book_isbn'] = book_isbn
        # BUG FIX: only strip the leading currency symbol when a price was
        # actually found; the original sliced unconditionally and mangled
        # the "该项为空" placeholder into "项为空".
        item['book_price'] = book_price if book_price == self.EMPTY else book_price[1:]
        item['book_info'] = book_info.strip()
        # n200.png is the site's "no cover" placeholder image.
        item['cover_img_url'] = "暂无封面图" if "n200.png" in cover_img_url else cover_img_url
        item['book_url'] = book_url
        yield item
-
其他设置内容参考:https://www.cdsy.xyz/computer/programme/Python/241210/cd65000.html