Pagination that follows a fixed URL pattern was covered in the previous article; when the URLs are irregular, the links have to be extracted from each page and followed explicitly, which is what this article demonstrates.
Preparation
URL: https://www.biquge5200.cc/xuanhuanxiaoshuo/
Goal: save each chapter as a txt file, organized into a directory named after its novel.
Create the Scrapy project
scrapy startproject biquge
cd biquge
scrapy genspider mybiquge biquge5200.cc
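After these two commands the generated layout should look roughly like this (standard Scrapy scaffolding, listed from memory, so minor files may differ):

biquge/
├── scrapy.cfg
└── biquge/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── mybiquge.py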
Create a debugging entry point
Create a new start.py file in the project root:
import scrapy.cmdline
scrapy.cmdline.execute('scrapy crawl mybiquge --nolog'.split())
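Note that --nolog suppresses error tracebacks as well as the normal log output; while debugging it is usually easier to run the same entry point without it:

import scrapy.cmdline

# Identical behavior, but keeps the log so failures are visible
scrapy.cmdline.execute('scrapy crawl mybiquge'.split())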
Modify the settings (enable the pipeline, set the crawl delay)
BOT_NAME = 'biquge'
SPIDER_MODULES = ['biquge.spiders']
NEWSPIDER_MODULE = 'biquge.spiders'
DOWNLOAD_DELAY = 0.5  # delay between consecutive requests, in seconds
ITEM_PIPELINES = {
    'biquge.pipelines.BiqugePipeline': 300,
}
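Depending on the site, two more settings are often needed; these are assumptions on my part, not part of the original config: disabling robots.txt compliance (Scrapy obeys it by default) and sending a browser-like User-Agent:

# Assumed additions -- adjust to your situation
ROBOTSTXT_OBEY = False  # the default True may block the crawl entirely
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # placeholder browser UA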
In items.py:
import scrapy

class BiqugeItem(scrapy.Item):
    name = scrapy.Field()        # novel name
    zj_name = scrapy.Field()     # chapter title
    zj_content = scrapy.Field()  # chapter text
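An Item behaves like a dict restricted to its declared fields; a quick sketch of that behavior, reusing the class above (the example values are placeholders):

from biquge.items import BiqugeItem

item = BiqugeItem()
item['name'] = '斗破苍穹'   # fine: 'name' is a declared field
# item['author'] = 'x'     # would raise KeyError: 'author' is not declared
print(dict(item))          # items convert cleanly to plain dicts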
In the spider file (mybiquge.py):
import scrapy
from biquge.items import BiqugeItem

class MybiqugeSpider(scrapy.Spider):
    name = 'mybiquge'
    allowed_domains = ['biquge5200.cc']
    start_urls = ['https://www.biquge5200.cc/xuanhuanxiaoshuo/']

    # Parse the category index page
    def parse(self, response):
        print("*" * 100)
        # print(response.text)
        print("*" * 100)
        xs_list = response.xpath('//div[@class="l"]/ul/li')
        for xs in xs_list:
            name = xs.xpath('./span[@class="s2"]/a/text()').get()
            href = xs.xpath('./span[@class="s2"]/a/@href').get()
            # print(name, href)
            # scrapy.Request: schedule another request
            # Follow the link to the novel's detail page (lists all chapters)
            yield scrapy.Request(
                url=href,
                callback=self.parse_detail,
                # meta: passes data along to parse_detail
                meta={'name': name}
            )
        # yield may return two kinds of objects:
        # 1. an item    => handed to the pipeline
        # 2. a Request  => handed back to the Scrapy engine, which fetches it
        #                  and delivers the response to the callback

    # Parse a novel's detail page (lists all chapters)
    def parse_detail(self, response):
        # novel name
        name = response.meta['name']
        # all chapters
        zj_list = response.xpath('//div[@id="list"]/dl/dd')
        for zj in zj_list:
            zj_name = zj.xpath('./a/text()').get()  # chapter title
            zj_href = zj.xpath('./a/@href').get()   # chapter link
            # follow the link to the chapter's text
            yield scrapy.Request(
                url=zj_href,
                callback=self.parse_content,
                meta={'name': name, 'zj_name': zj_name}
            )

    # Parse a chapter page
    def parse_content(self, response):
        name = response.meta['name']        # novel name
        zj_name = response.meta['zj_name']  # chapter title
        # content
        # get():    first match, like extract_first
        # getall(): all matches, like extract
        zj_content = response.xpath('//div[@id="content"]/p/text()').getall()
        zj_content = '\n'.join(zj_content)
        # print(zj_content)
        # hand each chapter over to the pipeline as an item
        item = BiqugeItem()
        item['name'] = name
        item['zj_name'] = zj_name
        item['zj_content'] = zj_content
        yield item
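As an aside, on Scrapy 1.7+ the same data hand-off can be done with cb_kwargs instead of meta, which delivers the values as plain callback arguments. A minimal hypothetical spider illustrating the first hop (not part of the original code; selectors reused from above):

import scrapy

class CbKwargsDemoSpider(scrapy.Spider):
    name = 'cbkwargs_demo'  # hypothetical demo spider
    start_urls = ['https://www.biquge5200.cc/xuanhuanxiaoshuo/']

    def parse(self, response):
        for a in response.xpath('//div[@class="l"]/ul/li/span[@class="s2"]/a'):
            yield scrapy.Request(
                url=a.xpath('./@href').get(),
                callback=self.parse_detail,
                cb_kwargs={'name': a.xpath('./text()').get()},  # passed as a kwarg
            )

    def parse_detail(self, response, name):  # 'name' arrives as an argument
        self.logger.info('detail page for %s', name)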
In the pipeline (pipelines.py):
import os
from itemadapter import ItemAdapter

class BiqugePipeline:
    def __init__(self):
        self.path = r'D:\biquge\biquge\biquge_xiaoshuo'

    def process_item(self, item, spider):
        print(item)
        name = item['name']
        zj_name = item['zj_name']
        zj_content = item['zj_content']
        # one directory per novel, one txt file per chapter
        xs_path = os.path.join(self.path, name)
        os.makedirs(xs_path, exist_ok=True)  # also creates missing parents, no race with exists()
        zj_path = os.path.join(xs_path, f'{zj_name}.txt')
        with open(zj_path, 'w', encoding='utf-8') as f:
            f.write(zj_content)
        print(f'{name}, {zj_name} -- saved!')
        return item
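One caveat: chapter titles scraped from the page may contain characters that are illegal in Windows file names (? * : and so on), which would make open() fail. A minimal sanitizer sketch, assuming the field names above; the replacement character set is my assumption, adjust as needed:

import re

def safe_filename(title: str) -> str:
    # Replace characters Windows forbids in file names; '_' as a stand-in is an assumption
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

# usage inside process_item:
# zj_path = os.path.join(xs_path, f'{safe_filename(zj_name)}.txt')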