1. Create a Scrapy project
- scrapy startproject qidian
2. Enter the project directory and use the genspider command to create a Spider
- scrapy genspider wanben qidian.com
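The genspider command drops a minimal skeleton into spiders/wanben.py; the exact boilerplate varies slightly by Scrapy version, but it looks roughly like this:
- # -*- coding: utf-8 -*-
- import scrapy
-
- class WanbenSpider(scrapy.Spider):
-     name = 'wanben'
-     allowed_domains = ['qidian.com']
-     start_urls = ['http://qidian.com/']
-
-     def parse(self, response):
-         pass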
3. Define the data to scrape (items.py)
- import scrapy
-
- class QidianItem(scrapy.Item):
-     # define the fields for your item here like:
-     # book title
-     book_name = scrapy.Field()
-     # listing page the book appears on
-     num = scrapy.Field()
-     # book id
-     book_id = scrapy.Field()
-     # author
-     author = scrapy.Field()
-     # synopsis
-     intro = scrapy.Field()
-     # book URL
-     book_url = scrapy.Field()
-     # cover image URL
-     cover_img_url = scrapy.Field()
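A QidianItem behaves like a dict with a fixed key set: only the fields declared above can be assigned, and an unknown key raises KeyError. A quick sketch:
- item = QidianItem(book_name='Example', num=1)
- print(item['book_name'])   # Example
- # item['price'] = 9.9      # KeyError: QidianItem does not support field: price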
4. Write the Spider that extracts the item data (spiders/wanben.py)
- # -*- coding: utf-8 -*-
- # Scrape finished-book listings from Qidian
- import scrapy
- from ..items import QidianItem
-
- class WanbenSpider(scrapy.Spider):
-     name = 'wanben'
-     allowed_domains = ['qidian.com']
-     start_urls = ['https://www.qidian.com/finish']
-
-     def parse(self, response):
-         next_page = response.xpath('//*[@id="page-container"]/div/ul/li[*]/a[contains(@class,"lbf-pagination-page")]/@href').extract()
-         for page in next_page:
-             # the page number is the query-string value, e.g. ...?page=12
-             print("Processing page %s" % page.split("=")[-1])
-             yield scrapy.Request(url="https:" + page, callback=self.parse_book)
-
-     def parse_book(self, response):
-         # which listing page this book appears on
-         num = response.url.split("=")[-1]
-         book_name = response.xpath('//div[@class="book-mid-info"]/h4/a/text()').extract()
-         book_id = response.xpath('//div[@class="book-mid-info"]/h4/a/@data-bid').extract()
-         author = response.xpath('//div[@class="book-mid-info"]/p[@class="author"]/a[1]/text()').extract()
-         intro = response.xpath('//div[@class="book-mid-info"]/p[@class="intro"]/text()').extract()
-         # book URLs are protocol-relative, so prepend "https:"
-         book_url = response.xpath('//div[@class="book-mid-info"]/h4/a/@href').extract()
-         # cover URLs also need "https:"; stripping the trailing "150" yields the full-size image
-         cover_img_url = response.xpath('//div[@class="book-img-box"]/a/img/@src').extract()
-         for i in range(len(book_name)):
-             item = QidianItem()
-             item['num'] = int(num)
-             item['book_id'] = book_id[i]
-             item['book_name'] = book_name[i]
-             item['author'] = author[i]
-             item['intro'] = intro[i].strip()
-             item['book_url'] = "https:" + book_url[i]
-             item['cover_img_url'] = "https:" + cover_img_url[i][:-3]
-             yield item
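Before running the full spider, the XPath expressions above can be verified interactively in the Scrapy shell (these selectors assume Qidian's listing markup at the time of writing):
- scrapy shell "https://www.qidian.com/finish"
- >>> response.xpath('//div[@class="book-mid-info"]/h4/a/text()').extract()[:3]
- >>> response.xpath('//div[@class="book-mid-info"]/h4/a/@data-bid').extract()[:3]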
5. Write the pipelines that save the data: download cover images and write the results to a spreadsheet and to MySQL (pipelines.py)
- import os
- import time
-
- import scrapy
- import pymysql
- from openpyxl import Workbook
- from scrapy.pipelines.images import ImagesPipeline
- from scrapy.utils.project import get_project_settings
-
- # Save cover images to the local images folder
- class ImagePipeline(ImagesPipeline):
-     # read the image directory set in settings (IMAGES_STORE: ./images)
-     IMAGES_STORE = get_project_settings().get('IMAGES_STORE')
-
-     def get_media_requests(self, item, info):
-         yield scrapy.Request(item['cover_img_url'])
-
-     def item_completed(self, results, item, info):
-         print("Processing cover image for [%s]" % item['book_name'])
-         # make sure the image directory exists
-         if not os.path.exists(self.IMAGES_STORE):
-             os.makedirs(self.IMAGES_STORE)
-         # e.g. ['full/7237c3717f9d3eef185e2d6bad9903e2c6eef810.jpg']
-         image_path = [x['path'] for ok, x in results if ok]
-         # rename the file after the book and move it out of the default full/ subfolder
-         if image_path:
-             os.rename(self.IMAGES_STORE + '/' + image_path[0],
-                       self.IMAGES_STORE + '/' + item['book_name'] + '.jpg')
-         return item
-
-     def close_spider(self, spider):
-         print("Image downloads finished")
-         # remove the full/ folder if it ended up empty
-         path = self.IMAGES_STORE + '/full'
-         if os.path.isdir(path) and not os.listdir(path):
-             os.rmdir(path)
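- # An alternative to renaming in item_completed: override file_path so images are
- # written to their final name directly and the full/ folder is never created.
- # This sketch assumes book_name is safe to use as a filename (hypothetical class,
- # not enabled in ITEM_PIPELINES below):
- class AltImagePipeline(ImagesPipeline):
-     def get_media_requests(self, item, info):
-         # carry the item along so file_path can read the book name
-         yield scrapy.Request(item['cover_img_url'], meta={'item': item})
-
-     def file_path(self, request, response=None, info=None, *args, **kwargs):
-         # save as <book_name>.jpg instead of full/<sha1>.jpg
-         return '%s.jpg' % request.meta['item']['book_name']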
-
- class MySqlPipeline:
-     @classmethod
-     def from_crawler(cls, crawler):
-         cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
-         cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
-         cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
-         cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
-         cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
-         cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
-         return cls()
-
-     def __init__(self):
-         self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT, user=self.MYSQL_USER,
-                                   passwd=self.MYSQL_PASSWD, db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
-         self.cursor = self.db.cursor()
-
-     def process_item(self, item, spider):
-         try:
-             sql = 'CREATE TABLE IF NOT EXISTS qidian(book_id BIGINT PRIMARY KEY NOT NULL,book_name VARCHAR(100) NOT NULL,' \
-                   'num INT(5) COMMENT "listing page",author VARCHAR(100),intro TEXT COMMENT "synopsis",' \
-                   'book_url VARCHAR(200),cover_img_url VARCHAR(200))ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;'
-             self.cursor.execute(sql)
-         except:
-             pass
-         try:
-             self.cursor.execute('SELECT book_id FROM qidian WHERE book_id=%s;', (item['book_id'],))
-             exists = self.cursor.fetchone()
-             keys, values = zip(*item.items())
-             if exists:
-                 # row already present: update it in place
-                 sql = 'INSERT INTO qidian({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};'.format(
-                     ','.join(keys),
-                     ','.join(['%s'] * len(values)),
-                     ','.join(['{}=%s'.format(k) for k in keys])
-                 )
-                 self.cursor.execute(sql, values * 2)
-             else:
-                 sql = 'INSERT INTO qidian({}) VALUES ({});'.format(
-                     ','.join(keys),
-                     ','.join(['%s'] * len(values))
-                 )
-                 self.cursor.execute(sql, values)
-             self.db.commit()
-             return item
-         except Exception as e:
-             print("Error:", e)
-             self.db.rollback()
-
-     def close_spider(self, spider):
-         print("Database writes finished!")
-         self.cursor.close()
-         self.db.close()
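- # Note: process_item only creates the table; the python5 database itself must
- # already exist. A one-off way to create it (same credentials as step 6):
- #   import pymysql
- #   conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456')
- #   conn.cursor().execute('CREATE DATABASE IF NOT EXISTS python5 DEFAULT CHARACTER SET utf8mb4;')
- #   conn.close()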
-
- class XlsxPipeline(object):
-     def __init__(self):
-         self.wb = Workbook()
-         self.ws = self.wb.active
-         self.ws.title = "qidian finished books"
-         # header row
-         self.ws.append(['book_id', 'page', 'book_name', 'author', 'intro', 'book_url', 'cover_img_url'])
-
-     def process_item(self, item, spider):
-         text = [item['book_id'], item['num'], item['book_name'], item['author'],
-                 item['intro'], item['book_url'], item['cover_img_url']]
-         self.ws.append(text)
-         return item
-
-     def close_spider(self, spider):
-         file_date = time.strftime("%Y-%m-%d", time.localtime())
-         self.wb.save(spider.name + file_date + '.xlsx')
-         print("Spreadsheet saved")
6. Configure the settings file (settings.py)
-
- # Write logs to a local file (optional)
- LOG_FILE = "qidian.log"
- LOG_LEVEL = "DEBUG"
- # also redirect print() output into the log
- LOG_STDOUT = True
-
- # directory for downloaded cover images
- IMAGES_STORE = './images'
-
- # MySQL configuration
- MYSQL_HOST = "localhost"
- MYSQL_PORT = 3306
- MYSQL_USER = "root"
- MYSQL_PASSWD = "123456"
- MYSQL_DBNAME = "python5"
- MYSQL_CHARSET = "utf8mb4"
-
- ROBOTSTXT_OBEY = False
- DOWNLOAD_DELAY = 3
-
- DEFAULT_REQUEST_HEADERS = {
-     'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
-     # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-     # 'Accept-Language': 'en',
- }
-
- # lower numbers run first: images download before the spreadsheet and database writes
- ITEM_PIPELINES = {
-     'qidian.pipelines.ImagePipeline': 3,
-     'qidian.pipelines.XlsxPipeline': 300,
-     'qidian.pipelines.MySqlPipeline': 301,
- }
-
7. With everything set up, run the crawl command to start the Spider:
- scrapy crawl wanben
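Once the crawl finishes, a quick sanity check of what landed in MySQL (a sketch, assuming the settings from step 6):
- import pymysql
-
- conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='python5')
- cur = conn.cursor()
- cur.execute('SELECT COUNT(*) FROM qidian;')
- print(cur.fetchone()[0], 'books saved')
- conn.close()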