
Scrapy: scraping Qidian's completed-book listings into a spreadsheet and a MySQL database, with cover images downloaded via an ImagesPipeline


1. Create the Scrapy project

  • scrapy startproject qidian
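
startproject generates Scrapy's standard layout; the files edited in the following steps are items.py, pipelines.py, settings.py, and the spiders/ folder:

    qidian/
        scrapy.cfg            # deploy configuration
        qidian/               # the project's Python module
            __init__.py
            items.py          # item definitions (step 3)
            middlewares.py
            pipelines.py      # item pipelines (step 5)
            settings.py       # project settings (step 6)
            spiders/          # spider code (steps 2 and 4)
                __init__.py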

2. Enter the project directory and use the genspider command to create the Spider

  • scrapy genspider wanben qidian.com
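
genspider writes a minimal skeleton into spiders/wanben.py, roughly the following (the exact template varies slightly between Scrapy versions); step 4 replaces it with the real parsing logic:

    # -*- coding: utf-8 -*-
    import scrapy

    class WanbenSpider(scrapy.Spider):
        name = 'wanben'
        allowed_domains = ['qidian.com']
        start_urls = ['http://qidian.com/']

        def parse(self, response):
            pass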

3. Define the data to scrape (items.py)

    import scrapy

    class QidianItem(scrapy.Item):
        # define the fields for your item here like:
        # book title
        book_name = scrapy.Field()
        # listing page the book appears on
        num = scrapy.Field()
        # book id
        book_id = scrapy.Field()
        # author
        author = scrapy.Field()
        # synopsis
        intro = scrapy.Field()
        # link to the book's page
        book_url = scrapy.Field()
        # cover image URL
        cover_img_url = scrapy.Field()
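
A Scrapy Item behaves like a dict with a fixed key set, which is exactly what the MySQL pipeline below relies on via zip(*item.items()); a quick illustration with hypothetical values:

    item = QidianItem()
    item['book_name'] = 'Example Book'
    item['num'] = 1
    print(dict(item))   # {'book_name': 'Example Book', 'num': 1}
    item['rating'] = 5  # raises KeyError: QidianItem does not support field: rating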

4. Write the Spider that extracts the item data (spiders/wanben.py)

    # -*- coding: utf-8 -*-
    # Scrape Qidian's completed-book listings
    import scrapy
    from ..items import QidianItem

    class WanbenSpider(scrapy.Spider):
        name = 'wanben'
        allowed_domains = ['qidian.com']
        start_urls = ['https://www.qidian.com/finish']

        def parse(self, response):
            # collect the pagination links, then crawl each listing page
            next_page = response.xpath('//*[@id="page-container"]/div/ul/li[*]/a[contains(@class,"lbf-pagination-page")]/@href').extract()
            for page in next_page:
                print("Processing page %s" % page[-1])
                yield scrapy.Request(url="https:" + page, callback=self.parse_book)

        def parse_book(self, response):
            # which listing page these books are on (taken from the ?page= query string)
            num = response.url.split("=")[-1]
            book_name = response.xpath('//div[@class="book-mid-info"]/h4/a/text()').extract()
            book_id = response.xpath('//div[@class="book-mid-info"]/h4/a/@data-bid').extract()
            author = response.xpath('//div[@class="book-mid-info"]/p[@class="author"]/a[1]/text()').extract()
            intro = response.xpath('//div[@class="book-mid-info"]/p[@class="intro"]/text()').extract()
            # book links are protocol-relative, so prepend "https:"
            book_url = response.xpath('//div[@class="book-mid-info"]/h4/a/@href').extract()
            # cover URLs also need "https:"; dropping the trailing "150" yields the full-size image
            cover_img_url = response.xpath('//div[@class="book-img-box"]/a/img/@src').extract()
            for i in range(len(book_name)):
                item = QidianItem()
                item['num'] = int(num)
                item['book_id'] = book_id[i]
                item['book_name'] = book_name[i]
                item['author'] = author[i]
                item['intro'] = intro[i].strip()
                item['book_url'] = "https:" + book_url[i]
                item['cover_img_url'] = "https:" + cover_img_url[i][:-3]
                yield item
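
Indexing seven parallel lists assumes every book node carries every field; if one book lacked, say, an intro, all later fields would shift by one. A more defensive variant (a sketch, not the original code; it assumes each listing <li> wraps one book-img-box div and one book-mid-info div, as the XPaths above imply) iterates one book at a time with relative XPaths so fields cannot misalign:

    def parse_book(self, response):
        num = int(response.url.split("=")[-1])
        for li in response.xpath('//div[@class="book-mid-info"]/..'):
            item = QidianItem()
            item['num'] = num
            info = li.xpath('./div[@class="book-mid-info"]')
            item['book_id'] = info.xpath('./h4/a/@data-bid').extract_first()
            item['book_name'] = info.xpath('./h4/a/text()').extract_first()
            item['author'] = info.xpath('./p[@class="author"]/a[1]/text()').extract_first()
            item['intro'] = (info.xpath('./p[@class="intro"]/text()').extract_first() or '').strip()
            item['book_url'] = "https:" + info.xpath('./h4/a/@href').extract_first()
            cover = li.xpath('./div[@class="book-img-box"]/a/img/@src').extract_first()
            item['cover_img_url'] = "https:" + cover[:-3]
            yield item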

5. Write the pipelines that persist the data: download the cover images and save the results to the spreadsheet and the database (pipelines.py)

    import os
    import time
    import scrapy
    import pymysql
    from openpyxl import Workbook
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.utils.project import get_project_settings

    # Save cover images into a local images folder
    class ImagePipeline(ImagesPipeline):
        # read the image directory configured in settings.py (IMAGES_STORE: ./images)
        IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

        def get_media_requests(self, item, info):
            yield scrapy.Request(item['cover_img_url'])

        def item_completed(self, results, item, info):
            print("Processing cover image for [%s]" % item['book_name'])
            # make sure the image directory exists
            if not os.path.exists(self.IMAGES_STORE):
                os.makedirs(self.IMAGES_STORE)
            # e.g. ['full/7237c3717f9d3eef185e2d6bad9903e2c6eef810.jpg']
            image_path = [x['path'] for ok, x in results if ok]
            # rename the image after the book and move it out of the default full/ subfolder
            if image_path:
                os.rename(self.IMAGES_STORE + '/' + image_path[0],
                          self.IMAGES_STORE + '/' + item['book_name'] + '.jpg')
            return item

        def close_spider(self, spider):
            print("Image downloads finished")
            # remove the full/ folder if it is left empty
            path = self.IMAGES_STORE + '/full'
            if os.path.exists(path) and not os.listdir(path):
                os.rmdir(path)
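
Renaming in item_completed works, but it collides when two books share a title and it bypasses the pipeline's own storage logic. An alternative sketch overrides file_path so each cover is stored under its final name directly; the item keyword in this signature requires Scrapy 2.4 or newer, so treat it as version-dependent:

    import scrapy
    from scrapy.pipelines.images import ImagesPipeline

    class CoverImagePipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            yield scrapy.Request(item['cover_img_url'])

        def file_path(self, request, response=None, info=None, *, item=None):
            # book_id is the primary key, so file names can never collide
            return '%s.jpg' % item['book_id']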
    class MySqlPipeline(object):
        @classmethod
        def from_crawler(cls, crawler):
            cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
            cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
            cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
            cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
            cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
            cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
            return cls()

        def __init__(self):
            self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT,
                                      user=self.MYSQL_USER, passwd=self.MYSQL_PASSWD,
                                      db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
            self.cursor = self.db.cursor()

        def process_item(self, item, spider):
            try:
                sql = 'CREATE TABLE IF NOT EXISTS qidian(' \
                      'book_id BIGINT PRIMARY KEY NOT NULL,' \
                      'book_name VARCHAR(100) NOT NULL,' \
                      'num INT(5) COMMENT "listing page",' \
                      'author VARCHAR(100),' \
                      'intro TEXT COMMENT "synopsis",' \
                      'book_url VARCHAR(200),' \
                      'cover_img_url VARCHAR(200)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;'
                self.cursor.execute(sql)
            except Exception:
                pass
            try:
                self.cursor.execute('SELECT book_id FROM qidian WHERE book_id=%s;', (item['book_id'],))
                exists = self.cursor.fetchone()
                keys, values = zip(*item.items())
                if exists:
                    # row already present: insert-or-update it
                    sql = """INSERT INTO qidian({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};""".format(
                        ','.join(keys),
                        ','.join(['%s'] * len(values)),
                        ','.join(['{}=%s'.format(k) for k in keys])
                    )
                    self.cursor.execute(sql, values * 2)
                else:
                    sql = 'INSERT INTO qidian({}) VALUES ({});'.format(
                        ','.join(keys),
                        ','.join(['%s'] * len(values))
                    )
                    self.cursor.execute(sql, values)
                self.db.commit()
            except Exception as e:
                print("Error:", e)
                self.db.rollback()
            return item

        def close_spider(self, spider):
            print("Database work finished!")
            self.cursor.close()
            self.db.close()
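
The SELECT-then-branch logic above is redundant: because book_id is the primary key, INSERT ... ON DUPLICATE KEY UPDATE alone covers both the new-row and the existing-row case. A minimal sketch of process_item reduced to a single upsert, under the same table layout:

    def process_item(self, item, spider):
        keys, values = zip(*item.items())
        sql = 'INSERT INTO qidian({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};'.format(
            ','.join(keys),
            ','.join(['%s'] * len(values)),
            ','.join('{}=%s'.format(k) for k in keys)
        )
        self.cursor.execute(sql, values * 2)
        self.db.commit()
        return item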
    class XlsxPipeline(object):
        def __init__(self):
            self.wb = Workbook()
            self.ws = self.wb.active
            self.ws.title = "qidian completed books"
            # header row
            self.ws.append(['book_id', 'page', 'title', 'author', 'synopsis', 'book URL', 'cover image URL'])

        def process_item(self, item, spider):
            text = [item['book_id'], item['num'], item['book_name'], item['author'],
                    item['intro'], item['book_url'], item['cover_img_url']]
            self.ws.append(text)
            return item

        def close_spider(self, spider):
            file_date = time.strftime("%Y-%m-%d", time.localtime())
            self.wb.save(spider.name + file_date + '.xlsx')
            print("Spreadsheet saved")

6. Configure the settings file (settings.py)

    # write the log to a local file (optional)
    LOG_FILE = "qidian.log"
    LOG_LEVEL = "DEBUG"
    # redirect print() output into the log as well
    LOG_STDOUT = True
    # directory for the downloaded cover images
    IMAGES_STORE = './images'
    # MySQL configuration
    MYSQL_HOST = "localhost"
    MYSQL_PORT = 3306
    MYSQL_USER = "root"
    MYSQL_PASSWD = "123456"
    MYSQL_DBNAME = "python5"
    MYSQL_CHARSET = "utf8mb4"

    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    DEFAULT_REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
        # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Language': 'en',
    }
    # lower numbers run first, so ImagePipeline handles each item before the two exporters
    ITEM_PIPELINES = {
        'qidian.pipelines.XlsxPipeline': 300,
        'qidian.pipelines.ImagePipeline': 3,
        'qidian.pipelines.MySqlPipeline': 301,
    }
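
One gap worth noting: MySqlPipeline creates the qidian table on first use, but not the database itself, so the python5 database must already exist before the crawl. A one-off setup snippet using the same credentials as above:

    import pymysql

    conn = pymysql.connect(host="localhost", port=3306, user="root",
                           passwd="123456", charset="utf8mb4")
    conn.cursor().execute("CREATE DATABASE IF NOT EXISTS python5 DEFAULT CHARSET utf8mb4;")
    conn.close()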

7. With everything configured, start the crawl by running the project's crawl command to launch the Spider:

  • scrapy crawl wanben
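
When the spider finishes, three outputs should exist: renamed cover JPGs under ./images/, a wanben<date>.xlsx workbook in the project root, and populated rows in the qidian table of the python5 database. Since LOG_STDOUT is enabled, the print() progress messages go to qidian.log rather than the console.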

 
