The previous article covered pagination that follows a fixed URL pattern; when the URLs are irregular, the links have to be extracted from each page and followed one by one, which is the case handled here.
Preparation
- URL: https://www.biquge5200.cc/xuanhuanxiaoshuo/
- Goal: save each novel under a folder named after it, one .txt file per chapter
-
Create the Scrapy project
- scrapy startproject biquge
- cd biquge
- scrapy genspider mybiquge biquge5200.cc  # lowercase, so the spider name matches the 'scrapy crawl mybiquge' call below; the generated layout is sketched after this list
-
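After running the two commands above, the generated project should look roughly like this (a sketch; the exact files may differ slightly between Scrapy versions):

```text
biquge/
├── scrapy.cfg
└── biquge/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── mybiquge.py
```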
Create a debug launcher
- Create a start.py file in the project root (the directory that contains scrapy.cfg)
- import scrapy.cmdline
- scrapy.cmdline.execute('scrapy crawl mybiquge --nolog'.split())  # --nolog hides Scrapy's log output; drop it when debugging
-
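With this file in place, the crawl can be started by running start.py from the IDE, or from a terminal in the project root:
- python start.py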
Edit settings.py (enable the pipeline, set the crawl delay)
- BOT_NAME = 'biquge'
-
- SPIDER_MODULES = ['biquge.spiders']
- NEWSPIDER_MODULE = 'biquge.spiders'
-
- DOWNLOAD_DELAY = 0.5  # delay between requests, in seconds
-
- ITEM_PIPELINES = {
- 'biquge.pipelines.BiqugePipeline': 300,
- }
-
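Depending on the site, two more settings are often needed; they are not part of the original configuration above, so treat them as optional assumptions:

```python
# Optional additions to settings.py (not in the original config) -- only if requests are refused.
ROBOTSTXT_OBEY = False  # Scrapy obeys robots.txt by default; disable this if the crawl is blocked by it
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # a browser-like User-Agent string
```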
In items.py
- import scrapy
-
- class BiqugeItem(scrapy.Item):
-     name = scrapy.Field()        # novel name
-     zj_name = scrapy.Field()     # chapter title
-     zj_content = scrapy.Field()  # chapter text
-
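A scrapy.Item behaves like a dict that only accepts the fields declared above; assigning to an undeclared key raises a KeyError. A minimal sketch of how BiqugeItem is filled in later by the spider:

```python
from biquge.items import BiqugeItem

item = BiqugeItem()
item['name'] = 'some novel'      # declared field: fine
# item['author'] = 'someone'     # undeclared field: would raise KeyError
print(dict(item))                # {'name': 'some novel'}
```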
In the spider file (mybiquge.py)
- import scrapy
- from biquge.items import BiqugeItem
-
-
- class MybiqugeSpider(scrapy.Spider):
- name = 'mybiquge'
- allowed_domains = ['biquge5200.cc']
- start_urls = ['https://www.biquge5200.cc/xuanhuanxiaoshuo/']
-
-     # Parse the listing page (the start URL)
- def parse(self, response):
- print("*" * 100)
- # print(response.text)
- print("*" * 100)
-
- xs_list = response.xpath('//div[@class="l"]/ul/li')
- for xs in xs_list:
- name = xs.xpath('./span[@class="s2"]/a/text()').get()
- href = xs.xpath('./span[@class="s2"]/a/@href').get()
- # print(name, href)
-
-             # scrapy.Request: build a follow-up request
-             # continue to the novel's detail page (which lists all of its chapters)
- yield scrapy.Request(
- url=href,
- callback=self.parse_detail,
-                 # meta: passes data through to parse_detail via response.meta
- meta={'name': name}
- )
-             # yield here may return two kinds of objects:
-             # 1. an Item    => handed to the pipeline
-             # 2. a Request  => handed back to the Scrapy engine, which crawls it and delivers the response to the callback
-
-     # Parse a novel's detail page (all of its chapters)
- def parse_detail(self, response):
-         # novel name
- name = response.meta['name']
-
-         # all chapters
- zj_list = response.xpath('//div[@id="list"]/dl/dd')
- for zj in zj_list:
-             zj_name = zj.xpath('./a/text()').get()  # chapter title
-             zj_href = zj.xpath('./a/@href').get()   # chapter link
-
-             # continue to each chapter page to fetch its text
- yield scrapy.Request(
- url=zj_href,
- callback=self.parse_content,
- meta={'name': name, 'zj_name': zj_name}
- )
-
-     # Parse the content of one chapter
- def parse_content(self, response):
-
-         name = response.meta['name']       # novel name
-         zj_name = response.meta['zj_name'] # chapter title
-
-         # chapter text
-         # get(): returns the first match, like extract_first()
-         # getall(): returns all matches as a list, like extract()
- zj_content = response.xpath('//div[@id="content"]/p/text()').getall()
- zj_content = '\n'.join(zj_content)
- # print(zj_content)
-
-         # hand this chapter to the pipeline as an item
- item = BiqugeItem()
- item['name'] = name
- item['zj_name'] = zj_name
- item['zj_content'] = zj_content
- yield item
-
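The XPath expressions in the spider are tied to the site's current markup and may need adjusting if the page layout changes. They can be checked interactively with scrapy shell before running the full crawl, for example:

```python
# In a terminal:
#   scrapy shell https://www.biquge5200.cc/xuanhuanxiaoshuo/
# Then, at the interactive prompt:
xs_list = response.xpath('//div[@class="l"]/ul/li')
len(xs_list)                                             # how many novels the listing page yields
xs_list[0].xpath('./span[@class="s2"]/a/text()').get()   # first novel's name
xs_list[0].xpath('./span[@class="s2"]/a/@href').get()    # first novel's detail-page URL
```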
In the pipeline (pipelines.py)
- import os
- from itemadapter import ItemAdapter
-
-
- class BiqugePipeline:
- def __init__(self):
-         self.path = r'D:\biquge\biquge\biquge_xiaoshuo'  # base folder for the saved novels; change it to a path on your machine
-
-
- def process_item(self, item, spider):
- print(item)
- name = item['name']
- zj_name = item['zj_name']
- zj_content = item['zj_content']
-         xs_path = os.path.join(self.path, name)
-         # makedirs also creates the base folder if it does not exist yet
-         os.makedirs(xs_path, exist_ok=True)
-
- zj_path = os.path.join(xs_path,f'{zj_name}.txt')
- with open(zj_path,'w',encoding='utf-8') as f:
- f.write(zj_content)
- f.flush()
-         print(f'{name} - {zj_name} saved successfully')
- return item
-
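Chapter titles occasionally contain characters that are illegal in Windows file names (such as ? or :), which would make open() fail. A small sanitizing helper could be applied to zj_name before building zj_path; this is only a sketch, and safe_filename is a hypothetical name rather than part of the original code:

```python
import re

def safe_filename(name: str) -> str:
    # Replace characters that Windows does not allow in file names with '_'
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

# In process_item, before opening the file:
# zj_path = os.path.join(xs_path, f'{safe_filename(zj_name)}.txt')
```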