
Using Scrapy to crawl multi-level book-category data from a reading site, and saving it via pipelines to MongoDB, Redis, MySQL, and a local spreadsheet


1. Create the Scrapy project

  • scrapy startproject Dushu

2. Enter the project directory and create a Spider with the genspider command (the generated project layout is sketched below)

  • scrapy genspider dushu dushu.com
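
For reference, the two commands above produce the usual Scrapy project layout (a typical layout; the exact set of files can vary slightly between Scrapy versions). dushu.py is the spider file edited in step 4:

    Dushu/
        scrapy.cfg
        Dushu/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                dushu.py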

3. Define the data to scrape (edit items.py)

    import scrapy


    class DushuItem(scrapy.Item):
        # book ID
        book_id = scrapy.Field()
        # URL of the book's detail page
        book_url = scrapy.Field()
        # book title
        book_name = scrapy.Field()
        # author
        book_author = scrapy.Field()
        # summary
        book_info = scrapy.Field()
        # cover image
        cover_img_url = scrapy.Field()
        # price
        book_price = scrapy.Field()
        # tags
        book_tag = scrapy.Field()
        # ISBN (occasionally missing on the page)
        book_isbn = scrapy.Field()
        # first-level category name
        firstTitle = scrapy.Field()
        # second-level category name
        secondTitle = scrapy.Field()
        # third-level category name
        threeTitle = scrapy.Field()
        # fourth-level category name
        fourTitle = scrapy.Field()
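
Scrapy Items behave like dictionaries, which is exactly what the pipelines in step 5 rely on when they call item.items() and dict(item). A minimal sketch (run from the project root; the field values are made up for illustration):

    from Dushu.items import DushuItem

    item = DushuItem(book_id="123", book_name="Example Title")  # keyword init, like a dict
    item['book_author'] = "Someone"   # fields are set and read with dict-style access
    print(dict(item))                 # e.g. {'book_id': '123', 'book_name': 'Example Title', 'book_author': 'Someone'}
    # reading a declared but unset field raises KeyError; an undeclared field cannot be set at all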

4. Write the Spider that extracts the item data (in the spiders folder: dushu.py)

    # -*- coding: utf-8 -*-
    # Start from the first-level category pages, collect every book under each category,
    # then use the breadcrumb navigation on the book detail page to work out its (up to) four-level category path.
    import scrapy
    from ..items import DushuItem


    class DushuSpider(scrapy.Spider):
        name = 'dushu'
        allowed_domains = ['dushu.com']
        start_urls = ['https://www.dushu.com/book/']
        base_url = "https://www.dushu.com"

        def parse(self, response):
            # collect the links of all first-level categories to crawl
            firstUrls = response.xpath('//div/div[@class="row"]/div/div/dl/dt/a/@href').extract()
            for url in firstUrls:
                yield scrapy.Request(url=self.base_url + url, callback=self.parse_second)

        def parse_second(self, response):
            # pagination links within the first-level category
            next_pages = response.xpath('//div/div[@class="pages"]/a[not(@class="disabled")]/@href').extract()
            for page in next_pages:
                yield scrapy.Request(url=self.base_url + page, callback=self.parse_book)

        def parse_book(self, response):
            title = response.xpath('//div/div[@class="row"]/div/div/dl[@class="active"]/dt/text()').get()
            next_url = response.url
            next_url_page = next_url.split('_')[-1].split('.')[0]
            print("Processing [%s], page %s" % (title, next_url_page))
            # book links on the list page
            all_books_url = response.xpath('//div[@class="bookslist"]/ul/li/div/h3/a/@href').extract()
            for url in all_books_url:
                yield scrapy.Request(url=self.base_url + url, callback=self.book_detail)

        def book_detail(self, response):
            item = DushuItem()
            # book title
            book_name = response.xpath('//div/div/div[@class="book-title"]/h1/text()').get() or "N/A"
            # debugging helper: dump the raw page
            # with open(book_name + ".html", "w+", encoding="utf-8") as f:
            #     f.write(response.text)
            # breadcrumb navigation of the book
            navbar = response.xpath('//div[@class="crumbs"]/a[position()>2]/text()').extract()
            length = len(navbar)
            if length == 2:
                print("Book with a first-level category only: %s" % book_name)
                item['firstTitle'] = navbar[0]
                item['secondTitle'] = "-"
                item['threeTitle'] = "-"
                item['fourTitle'] = "-"
            elif length == 3:
                print("Book with a second-level category: %s" % book_name)
                item['firstTitle'] = navbar[0]
                item['secondTitle'] = navbar[1]
                item['threeTitle'] = "-"
                item['fourTitle'] = "-"
            elif length == 4:
                print("Book with a third-level category: %s" % book_name)
                item['firstTitle'] = navbar[0]
                item['secondTitle'] = navbar[1]
                item['threeTitle'] = navbar[2]
                item['fourTitle'] = "-"
            elif length == 5:
                print("Book with a fourth-level category: %s" % book_name)
                item['firstTitle'] = navbar[0]
                item['secondTitle'] = navbar[1]
                item['threeTitle'] = navbar[2]
                item['fourTitle'] = navbar[3]
            else:
                print("Unexpected breadcrumb for book: %s" % book_name)
                item['firstTitle'] = "-"
                item['secondTitle'] = "-"
                item['threeTitle'] = "-"
                item['fourTitle'] = "-"
            # author
            book_author = response.xpath('//div/div/div[@class="book-details"]/div/table//tr[1]/td[2]/text()').get() or "N/A"
            # tags
            book_tag = response.xpath('//div/div/div[@class="book-details"]/div/table//tr[4]/td[2]/text()').get() or "N/A"
            # ISBN
            book_isbn = response.xpath('//div/div/div[@class="book-details"]/table//tr[1]/td[2]/text()').get() or "N/A"
            # price
            book_price = response.xpath('//div/div/div[@class="book-details"]/div/p/span/text()').get() or "N/A"
            # summary
            book_info = response.xpath('//div/div/div[@class="book-summary"][1]/div/div/text()').get() or "N/A"
            # cover image
            cover_img_url = response.xpath('//div/div/div[@class="book-pic"]/div/img/@src').get() or "N/A"
            # URL of the book detail page
            book_url = response.url
            # book ID, taken from the URL
            book_id = book_url.split("/")[-2]
            item['book_id'] = book_id
            item['book_name'] = book_name
            item['book_author'] = book_author
            item['book_tag'] = book_tag
            item['book_isbn'] = book_isbn
            item['book_price'] = book_price[1:]  # drop the leading currency symbol
            item['book_info'] = book_info.strip()
            # n200.png is the site's default "no cover" placeholder image
            item['cover_img_url'] = "no cover image" if "n200.png" in cover_img_url else cover_img_url
            item['book_url'] = book_url
            yield item
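
The XPath expressions above are the fragile part of the spider: if the site's markup changes they silently return nothing. They can be checked without running a full crawl, either interactively with scrapy shell https://www.dushu.com/book/ or with Scrapy's standalone Selector class. A small offline sketch (book_list.html is a hypothetical saved copy of https://www.dushu.com/book/):

    from scrapy.selector import Selector

    with open("book_list.html", encoding="utf-8") as f:   # hypothetical saved page
        sel = Selector(text=f.read())

    # same expression as in parse(): links to the first-level category pages
    category_links = sel.xpath('//div/div[@class="row"]/div/div/dl/dt/a/@href').extract()
    print(len(category_links), category_links[:3])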

5. Implement the pipelines that save the data; results can also be written to local files (pipelines.py)

    # -*- coding: utf-8 -*-
    import os
    import time
    import json
    import scrapy
    import pymysql
    import pymongo
    import redis
    from openpyxl import Workbook
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.utils.project import get_project_settings
    from scrapy import Item
    from scrapy.exceptions import DropItem


    class MyEncoder(json.JSONEncoder):
        def default(self, o):
            if isinstance(o, bytes):
                return str(o, encoding='utf-8')
            return json.JSONEncoder.default(self, o)


    class ImagePipeline(ImagesPipeline):
        # directory where downloaded cover images are stored
        IMAGES_STORE = get_project_settings().get('IMAGES_STORE')
        # characters that are not allowed in file names
        char_list = ['*', '|', ':', '?', '/', '<', '>', '"', '\\']

        def get_media_requests(self, item, info):
            # the Referer header must point to the book's URL, otherwise the server returns 403
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0",
                       "Referer": item['book_url']}
            # skip books that have no cover image
            cover_img_url = item['cover_img_url']
            if cover_img_url.startswith('http'):
                yield scrapy.Request(cover_img_url, headers=headers)

        def item_completed(self, results, item, info):
            book_name = item['book_name']
            if not os.path.exists(self.IMAGES_STORE):
                os.makedirs(self.IMAGES_STORE)
            # results: [(True, {'url': 'https://a.dushu.com/img/n200.png', 'path': 'full/783a2.jpg', 'checksum': '2792e5'})]
            if results:
                print("Cover image downloaded: %s" % book_name)
                image_path = [x['path'] for ok, x in results if ok]
                for i in self.char_list:
                    if i in book_name:
                        print("'%s' contains the special character '%s'; it has been replaced!" % (book_name, i))
                        book_name = book_name.replace(i, "_")
                # rename the downloaded file to the (sanitised) book title
                os.rename(self.IMAGES_STORE + '/' + image_path[0], self.IMAGES_STORE + '/' + book_name + ".jpg")
            else:
                print("No cover image, nothing to download: %s" % book_name)
            return item

        def close_spider(self, spider):
            print("Image download finished!")
            # remove the 'full' directory (and the image directory itself) if they end up empty
            path = self.IMAGES_STORE + '/full'
            if os.path.exists(path) and not os.listdir(path):
                os.rmdir(path)
            if os.path.exists(self.IMAGES_STORE) and not os.listdir(self.IMAGES_STORE):
                os.rmdir(self.IMAGES_STORE)


    class XlsxPipeline(object):
        def __init__(self):
            self.wb = Workbook()
            self.ws = self.wb.active
            self.ws.title = "dushu book information"
            self.ws.append(['book_id', 'first-level category', 'second-level category', 'third-level category',
                            'fourth-level category', 'title', 'author', 'tags', 'ISBN', 'price (yuan)',
                            'summary', 'cover image', 'book URL'])

        def process_item(self, item, spider):
            text = [item['book_id'], item['firstTitle'], item['secondTitle'], item['threeTitle'], item['fourTitle'],
                    item['book_name'], item['book_author'], item['book_tag'], item['book_isbn'], item['book_price'],
                    item['book_info'], item['cover_img_url'], item['book_url']]
            self.ws.append(text)
            return item

        def close_spider(self, spider):
            time_file = time.strftime("%Y-%m-%d", time.localtime())
            self.wb.save(spider.name + time_file + ".xlsx")
            print("Spreadsheet written, done!")


    class MysqlPipeline():
        @classmethod
        def from_crawler(cls, crawler):
            cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
            cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
            cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
            cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
            cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
            cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
            return cls()

        def open_spider(self, spider):
            self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT, user=self.MYSQL_USER,
                                      passwd=self.MYSQL_PASSWD, db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
            self.cursor = self.db.cursor()

        def process_item(self, item, spider):
            try:
                # create the table on first use if it does not exist yet
                sql = 'CREATE TABLE IF NOT EXISTS dushu(book_id BIGINT PRIMARY KEY NOT NULL, firstTitle VARCHAR(15),' \
                      'secondTitle VARCHAR(20), threeTitle VARCHAR(20), fourTitle VARCHAR(20), book_name VARCHAR(200) NOT NULL,' \
                      'book_author VARCHAR(200), book_tag VARCHAR(100), book_isbn VARCHAR(50), book_price VARCHAR(20), book_info TEXT,' \
                      'cover_img_url VARCHAR(200), book_url VARCHAR(200)) ENGINE=InnoDB DEFAULT CHARSET="utf8mb4";'
                self.cursor.execute(sql)
            except:
                pass
            try:
                self.cursor.execute("SELECT book_id FROM dushu WHERE book_id = %s;", item['book_id'])
                switch = self.cursor.fetchone()
                keys, values = zip(*item.items())
                if switch:
                    # the book is already in the table: insert-or-update on the primary key
                    sql = """INSERT INTO dushu({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};""".format(
                        ','.join(keys),
                        ','.join(['%s'] * len(values)),
                        ','.join(['{}=%s'.format(k) for k in keys])
                    )
                    self.cursor.execute(sql, values * 2)
                else:
                    sql = """INSERT INTO dushu({}) VALUES ({});""".format(
                        ','.join(keys),
                        ','.join(['%s'] * len(values))
                    )
                    self.cursor.execute(sql, values)
                self.db.commit()
                return item
            except Exception as e:
                print("MySQL error:", e)
                self.db.rollback()

        def close_spider(self, spider):
            print("MySQL processing finished")
            self.cursor.close()
            self.db.close()


    class MongoPipeline():
        @classmethod
        def from_crawler(cls, crawler):
            cls.MONGO_HOST = crawler.settings.get('MONGO_HOST')
            cls.MONGO_PORT = crawler.settings.get('MONGO_PORT')
            cls.MONGO_DB = crawler.settings.get('MONGO_DB')
            return cls()

        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.MONGO_HOST, self.MONGO_PORT)
            # count how many documents were inserted in this run
            self.num = 0

        def process_item(self, item, spider):
            try:
                self.db = self.client[self.MONGO_DB]
                self.book = self.db[spider.name]
                # check whether the collection already holds data; if so, only insert books that are not there yet
                # (find().count() and insert() are legacy pymongo APIs, removed in pymongo 4.0)
                count = self.book.find().count()
                # the collection's insert method expects a dict, not an Item object
                data = dict(item) if isinstance(item, Item) else item
                if count == 0:
                    print("MongoDB collection is empty, inserting directly!")
                    self.book.insert(data)
                    self.num += 1
                else:
                    book_name = item['book_name']
                    count = self.book.find({'book_name': book_name}).count()
                    if count == 0:
                        print("%s: adding to the database..." % book_name)
                        self.book.insert(data)
                        self.num += 1
                    else:
                        print("%s: already stored, skipping!" % book_name)
                return item
            except Exception as e:
                print("MongoDB error:", e)

        def close_spider(self, spider):
            print("Done, %s documents saved to MongoDB in this run!" % self.num)
            self.client.close()


    class RedisPipeline():
        @classmethod
        def from_crawler(cls, crawler):
            # cls.REDIS_HOST = crawler.settings.get('REDIS_HOST')
            cls.REDIS_HOST = get_project_settings().get('REDIS_HOST')
            cls.REDIS_PORT = crawler.settings.get('REDIS_PORT')
            cls.REDIS_DBNAME = crawler.settings.get('REDIS_DBNAME')
            cls.REDIS_decode_responses = crawler.settings.get('REDIS_decode_responses')
            return cls()

        def open_spider(self, spider):
            try:
                self.redis_client = redis.StrictRedis(self.REDIS_HOST, self.REDIS_PORT, self.REDIS_DBNAME,
                                                      decode_responses=self.REDIS_decode_responses)
            except Exception as e:
                print("Redis error:", e)

        def process_item(self, item, spider):
            # de-duplicate on book name via a Redis set; drop items already seen
            if self.redis_client.sadd('dushu:items', item['book_name']):
                return item
            raise DropItem

        def close_spider(self, spider):
            print("Redis processing finished!")
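
After a run it is easy to spot-check what MongoPipeline wrote with a few lines of standalone pymongo. A sketch, assuming the MONGO_* settings from step 6 and pymongo 3.7+ (count_documents is the non-deprecated counterpart of the count() call used above):

    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    books = client['py4']['dushu']                  # database MONGO_DB, collection named after the spider
    print("books stored:", books.count_documents({}))
    for doc in books.find().limit(3):
        print(doc['book_name'], '|', doc['firstTitle'])
    client.close()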

6. Configure the settings file (settings.py)

    LOG_FILE = "dushu.log"
    LOG_LEVEL = "DEBUG"
    LOG_STDOUT = True
    IMAGES_STORE = './images'
    # MySQL settings
    MYSQL_HOST = "localhost"
    MYSQL_PORT = 3306
    MYSQL_USER = "root"
    MYSQL_PASSWD = "123456"
    MYSQL_DBNAME = "python5"
    MYSQL_CHARSET = "utf8mb4"
    # MongoDB settings
    MONGO_HOST = 'localhost'
    MONGO_PORT = 27017
    MONGO_DB = 'py4'
    # Redis settings
    REDIS_HOST = 'localhost'
    REDIS_PORT = 6379
    REDIS_DBNAME = 4
    REDIS_decode_responses = True
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    DEFAULT_REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);'
    }
    # enable only the pipelines you need
    ITEM_PIPELINES = {
        'Dushu.pipelines.ImagePipeline': 3,
        'Dushu.pipelines.RedisPipeline': 300,
        'Dushu.pipelines.XlsxPipeline': 301,
        'Dushu.pipelines.MysqlPipeline': 302,
        'Dushu.pipelines.MongoPipeline': 303,
    }

7. Remember to start the MySQL/Redis/MongoDB servers in advance, and create the MySQL table beforehand (it must contain every column that MysqlPipeline writes):

    CREATE TABLE IF NOT EXISTS dushu(
        book_id BIGINT PRIMARY KEY NOT NULL,
        firstTitle VARCHAR(15), secondTitle VARCHAR(20),
        threeTitle VARCHAR(20), fourTitle VARCHAR(20),
        book_name VARCHAR(200) NOT NULL, book_author VARCHAR(200),
        book_tag VARCHAR(100), book_isbn VARCHAR(50),
        book_price VARCHAR(20), book_info TEXT,
        cover_img_url VARCHAR(200), book_url VARCHAR(200)
    ) ENGINE=InnoDB DEFAULT CHARSET="utf8mb4";
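
Once the table exists and a crawl has run, a short pymysql script can confirm that rows are landing. A sketch reusing the example MYSQL_* credentials from step 6:

    import pymysql

    db = pymysql.connect(host="localhost", port=3306, user="root",
                         passwd="123456", db="python5", charset="utf8mb4")
    with db.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM dushu;")
        print("rows in dushu:", cursor.fetchone()[0])
    db.close()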

8. With everything configured, start crawling: run the project's crawl command to launch the Spider:

  • scrapy crawl dushu
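
Since all the data also passes through Scrapy's item system, a JSON or CSV dump can be produced without any extra pipeline by using Scrapy's built-in feed export option, for example:

  • scrapy crawl dushu -o dushu.json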

 
