一、Installing the Scrapy dependency
二、Creating the Scrapy project
三、items.py
四、pipelines.py
五、daomu.py
六、settings.py
七、run.py (created in the project root)
一、Installing the Scrapy dependency

```bash
pip install scrapy
```

二、Creating the Scrapy project

```bash
scrapy startproject ScrapyDaomu
cd ScrapyDaomu        # genspider must be run inside the project directory
scrapy genspider daomu "daomubiji.com"
```

Project structure:
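The two commands above produce the standard Scrapy skeleton; `run.py` is added by hand in step 七:

```
ScrapyDaomu/
├── scrapy.cfg
├── run.py              # created manually, see step 七
└── ScrapyDaomu/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── daomu.py
```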
三、items.py

```python
import scrapy


class ScrapydaomuItem(scrapy.Item):
    # Fields the pipeline needs to build each file: name, path, text.
    # son_title: chapter heading, e.g. "七星鲁王 第一章 血尸"
    son_title = scrapy.Field()
    # directory: output folder, e.g. './novel/盗墓笔记1:七星鲁王宫/'
    directory = scrapy.Field()
    # content: the full chapter text
    content = scrapy.Field()
```
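For reference (illustration only, not one of the project files): a Scrapy `Item` behaves like a dict, which is why the pipeline below can read fields with `item['...']`:

```python
# Illustration: Item objects support dict-style access.
from ScrapyDaomu.items import ScrapydaomuItem  # assumes the project package is importable

item = ScrapydaomuItem()
item['son_title'] = '七星鲁王 第一章 血尸'
item['directory'] = './novel/盗墓笔记1:七星鲁王宫/'
print(item['son_title'])   # dict-style read
print(dict(item))          # convert to a plain dict
```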
四、pipelines.py

```python
class ScrapydaomuPipeline:
    def process_item(self, item, spider):
        # filename: ./novel/盗墓笔记1:七星鲁王宫/七星鲁王_第一章_血尸.txt
        filename = '{}{}.txt'.format(item['directory'], item['son_title'].replace(' ', '_'))
        # Write as UTF-8 explicitly so Chinese text is saved correctly on any platform
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(item['content'])

        return item
```
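A slightly more defensive variant (a sketch, not from the original post) creates the directory in the pipeline itself, so the write cannot fail even if the spider's folder-creation step were skipped:

```python
import os


class ScrapydaomuPipeline:
    def process_item(self, item, spider):
        # exist_ok=True makes this safe to call for every item
        os.makedirs(item['directory'], exist_ok=True)
        filename = '{}{}.txt'.format(item['directory'], item['son_title'].replace(' ', '_'))
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(item['content'])
        return item
```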
五、daomu.py

```python
import os

import scrapy

from ..items import ScrapydaomuItem


class DaomuSpider(scrapy.Spider):
    name = 'daomu'
    allowed_domains = ['www.daomubiji.com']
    start_urls = ['http://www.daomubiji.com/']

    def parse(self, response):
        """
        First-level parser: extract each book's title and link,
        and hand the book links to the scheduler's queue.
        """
        a_list = response.xpath('//li[contains(@id,"menu-item-20")]/a')
        for a in a_list:
            item = ScrapydaomuItem()  # one item per book
            parent_title = a.xpath('./text()').get()
            parent_url = a.xpath('./@href').get()
            item['directory'] = './novel/{}/'.format(parent_title)
            # Create the matching output folder
            if not os.path.exists(item['directory']):
                os.makedirs(item['directory'])
            # Hand the request to the scheduler's queue
            yield scrapy.Request(url=parent_url, meta={'meta_1': item}, callback=self.detail_page)

    # 11 book responses come back; this callback runs once per response
    def detail_page(self, response):
        """
        Second-level parser: extract chapter titles and chapter links.
        """
        # Receive the item passed along via meta
        meta_1 = response.meta['meta_1']
        art_list = response.xpath('//article')
        for art in art_list:
            # A new item object must be created for every request handed
            # back to the scheduler; otherwise all callbacks would share
            # and overwrite the same item
            item = ScrapydaomuItem()
            item['son_title'] = art.xpath('./a/text()').get()
            son_url = art.xpath('./a/@href').get()
            item['directory'] = meta_1['directory']
            # Hand the chapter request to the scheduler's queue
            yield scrapy.Request(url=son_url, meta={'item': item}, callback=self.get_content)

        # 盗墓笔记1 sends back 75 responses, 盗墓笔记2 sends back n responses, ...

    def get_content(self, response):
        """Third-level parser: extract the actual chapter text."""
        item = response.meta['item']
        # content_list: ['paragraph 1', 'paragraph 2', 'paragraph 3', ...]
        content_list = response.xpath('//article[@class="article-content"]/p/text()').extract()
        item['content'] = '\n'.join(content_list)

        # The item is now complete; yield it to the pipeline
        yield item
```
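Passing items through `meta` works, but on Scrapy 1.7+ the cleaner way to hand data between callbacks is `cb_kwargs`. A minimal self-contained sketch (hypothetical spider name, same XPath as above) of the same hand-off:

```python
import scrapy


class CbKwargsSketchSpider(scrapy.Spider):
    """Sketch only (Scrapy >= 1.7): pass data via cb_kwargs instead of meta."""
    name = 'cb_kwargs_sketch'
    start_urls = ['http://www.daomubiji.com/']

    def parse(self, response):
        for a in response.xpath('//li[contains(@id,"menu-item-20")]/a'):
            yield scrapy.Request(
                url=a.xpath('./@href').get(),
                callback=self.detail_page,
                # cb_kwargs entries arrive as keyword arguments of the callback
                cb_kwargs={'directory': './novel/{}/'.format(a.xpath('./text()').get())},
            )

    def detail_page(self, response, directory):
        # 'directory' is a plain keyword argument, no response.meta lookup needed
        self.logger.info('chapters go to %s', directory)
```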
六、settings.py

```python
BOT_NAME = 'ScrapyDaomu'

SPIDER_MODULES = ['ScrapyDaomu.spiders']
NEWSPIDER_MODULE = 'ScrapyDaomu.spiders'

# Do not obey robots.txt for this crawl
ROBOTSTXT_OBEY = False

# Send browser-like headers so requests are not rejected as a bot
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}

# Enable the pipeline that writes chapters to disk
ITEM_PIPELINES = {
    'ScrapyDaomu.pipelines.ScrapydaomuPipeline': 300,
}
```
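Optional, and not part of the original settings: if you want to be gentler on the site, Scrapy's built-in throttling settings can be appended here as well:

```python
# Optional politeness settings (assumption: not in the original project)
DOWNLOAD_DELAY = 1                  # seconds to wait between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 4  # cap parallel requests per domain
AUTOTHROTTLE_ENABLED = True         # let Scrapy adapt the delay automatically
```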
七、run.py (created in the project root)

```python
from scrapy import cmdline

# Equivalent to running `scrapy crawl daomu` on the command line
cmdline.execute('scrapy crawl daomu'.split())
```
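Start the crawl with `python run.py` from the directory that contains `scrapy.cfg`; the downloaded chapters land under `./novel/<book>/<chapter>.txt`.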