1. Create the Scrapy project
scrapy startproject Dushu
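The command generates a project skeleton that typically looks like this (the spider file dushu.py is added in the next step):

Dushu/
├── scrapy.cfg
└── Dushu/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py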
2. Change into the project directory and use the genspider command to create the Spider
scrapy genspider dushu dushu.com
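genspider writes a minimal spider skeleton to Dushu/spiders/dushu.py, roughly like the following; step 4 below replaces it with the real crawling logic:

# -*- coding: utf-8 -*-
import scrapy


class DushuSpider(scrapy.Spider):
    name = 'dushu'
    allowed_domains = ['dushu.com']
    start_urls = ['http://dushu.com/']

    def parse(self, response):
        pass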
3. Define the data to scrape (edit the items.py file)
import scrapy


class DushuItem(scrapy.Item):
    # Book ID
    book_id = scrapy.Field()
    # URL of the book's detail page
    book_url = scrapy.Field()
    # Title
    book_name = scrapy.Field()
    # Author
    book_author = scrapy.Field()
    # Summary
    book_info = scrapy.Field()
    # Cover image
    cover_img_url = scrapy.Field()
    # Price
    book_price = scrapy.Field()
    # Tags
    book_tag = scrapy.Field()
    # ISBN (occasionally missing on the site)
    book_isbn = scrapy.Field()
    # First-level category name
    firstTitle = scrapy.Field()
    # Second-level category name
    secondTitle = scrapy.Field()
    # Third-level category name
    threeTitle = scrapy.Field()
    # Fourth-level category name
    fourTitle = scrapy.Field()
4. Write the Spider that extracts the item data (in the spiders folder: dushu.py)
# -*- coding: utf-8 -*-
# Start from the first-level category pages, collect every book listed under each
# category, then use the breadcrumb navigation on the book's detail page to work
# out its classification down to four levels.
import scrapy

from ..items import DushuItem


class DushuSpider(scrapy.Spider):
    name = 'dushu'
    allowed_domains = ['dushu.com']
    start_urls = ['https://www.dushu.com/book/']
    base_url = "https://www.dushu.com"

    def parse(self, response):
        # Collect the links of all first-level category pages to crawl
        firstUrls = response.xpath('//div/div[@class="row"]/div/div/dl/dt/a/@href').extract()
        for url in firstUrls:
            yield scrapy.Request(url=self.base_url + url, callback=self.parse_second)

    def parse_second(self, response):
        # Pagination links within the first-level category
        next_pages = response.xpath('//div/div[@class="pages"]/a[not(@class="disabled")]/@href').extract()
        for page in next_pages:
            yield scrapy.Request(url=self.base_url + page, callback=self.parse_book)

    def parse_book(self, response):
        title = response.xpath('//div/div[@class="row"]/div/div/dl[@class="active"]/dt/text()').get()
        next_url = response.url
        next_url_page = next_url.split('_')[-1].split('.')[0]
        print("准备处理[%s]第%s页" % (title, next_url_page))
        # Links to the detail pages of the books on this listing page
        all_books_url = response.xpath('//div[@class="bookslist"]/ul/li/div/h3/a/@href').extract()
        for url in all_books_url:
            yield scrapy.Request(url=self.base_url + url, callback=self.book_detail)

    def book_detail(self, response):
        item = DushuItem()
        # Title
        book_name = response.xpath('//div/div/div[@class="book-title"]/h1/text()').get() or "该项为空"
        # with open(book_name + ".html", "w+", encoding="utf-8") as f:
        #     f.write(response.text)
        # Breadcrumb navigation of the book's detail page
        navbar = response.xpath('//div[@class="crumbs"]/a[position()>2]/text()').extract()
        length = len(navbar)
        if length == 2:
            print("该书籍一级分类:%s" % book_name)
            item['firstTitle'] = navbar[0]
            item['secondTitle'] = "-"
            item['threeTitle'] = "-"
            item['fourTitle'] = "-"
        elif length == 3:
            print("该书籍二级分类:%s" % book_name)
            item['firstTitle'] = navbar[0]
            item['secondTitle'] = navbar[1]
            item['threeTitle'] = "-"
            item['fourTitle'] = "-"
        elif length == 4:
            print("该书籍三级分类:%s" % book_name)
            item['firstTitle'] = navbar[0]
            item['secondTitle'] = navbar[1]
            item['threeTitle'] = navbar[2]
            item['fourTitle'] = "-"
        elif length == 5:
            print("该书籍四级分类:%s" % book_name)
            item['firstTitle'] = navbar[0]
            item['secondTitle'] = navbar[1]
            item['threeTitle'] = navbar[2]
            item['fourTitle'] = navbar[3]
        else:
            print("该书籍分类异常:%s" % book_name)
            item['firstTitle'] = "-"
            item['secondTitle'] = "-"
            item['threeTitle'] = "-"
            item['fourTitle'] = "-"
        # Author
        book_author = response.xpath('//div/div/div[@class="book-details"]/div/table//tr[1]/td[2]/text()').get() or "该项为空"
        # Tags
        book_tag = response.xpath('//div/div/div[@class="book-details"]/div/table//tr[4]/td[2]/text()').get() or "该项为空"
        # ISBN
        book_isbn = response.xpath('//div/div/div[@class="book-details"]/table//tr[1]/td[2]/text()').get() or "该项为空"
        # Price
        book_price = response.xpath('//div/div/div[@class="book-details"]/div/p/span/text()').get() or "该项为空"
        # Summary
        book_info = response.xpath('//div/div/div[@class="book-summary"][1]/div/div/text()').get() or "该项为空"
        # Cover image
        cover_img_url = response.xpath('//div/div/div[@class="book-pic"]/div/img/@src').get() or "该项为空"
        # URL of the book's detail page
        book_url = response.url
        # Book ID
        book_id = book_url.split("/")[-2]
        item['book_id'] = book_id
        item['book_name'] = book_name
        item['book_author'] = book_author
        item['book_tag'] = book_tag
        item['book_isbn'] = book_isbn
        # Strip the leading currency symbol, but leave the placeholder value untouched
        item['book_price'] = book_price[1:] if book_price != "该项为空" else book_price
        item['book_info'] = book_info.strip()
        item['cover_img_url'] = "暂无封面图" if "n200.png" in cover_img_url else cover_img_url
        item['book_url'] = book_url
        yield item
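Before running the full crawl, the XPath expressions used above can be checked interactively with scrapy shell; the book URL below is only a placeholder, substitute any real detail page:

# In a terminal: scrapy shell "https://www.dushu.com/book/<book_id>/"
# Then, inside the shell session:
response.xpath('//div/div/div[@class="book-title"]/h1/text()').get()        # title
response.xpath('//div[@class="crumbs"]/a[position()>2]/text()').extract()   # breadcrumb categories
response.xpath('//div/div/div[@class="book-pic"]/div/img/@src').get()       # cover image URL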
5. Save the scraped data in the pipelines file (pipelines.py); the results can be written out to files and databases
# -*- coding: utf-8 -*-
import os
import time
import json

import scrapy
import pymysql
import pymongo
import redis
from openpyxl import Workbook
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings
from scrapy import Item
from scrapy.exceptions import DropItem


class MyEncoder(json.JSONEncoder):
    # JSON encoder that can serialize bytes values by decoding them as UTF-8
    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)
class ImagePipeline(ImagesPipeline):
    # Folder where the downloaded cover images are stored
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')
    # File names must not contain any of these characters
    char_list = ['*', '|', ':', '?', '/', '<', '>', '"', '\\']

    def get_media_requests(self, item, info):
        # The Referer header must be set to the book's URL when downloading the
        # cover image, otherwise the site answers with 403
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0",
                   "Referer": item['book_url']}
        # Books without a cover image are skipped
        cover_img_url = item['cover_img_url']
        if cover_img_url.startswith('http'):
            yield scrapy.Request(cover_img_url, headers=headers)

    def item_completed(self, results, item, info):
        book_name = item['book_name']
        if not os.path.exists(self.IMAGES_STORE):
            os.makedirs(self.IMAGES_STORE)
        # results: [(True, {'url': 'https://a.dushu.com/img/n200.png', 'path': 'full/783a2.jpg', 'checksum': '2792e5'})]
        if results:
            print("封面图下载:%s" % book_name)
            image_path = [x['path'] for ok, x in results if ok]
            for i in self.char_list:
                if i in book_name:
                    print("'%s'包含特殊符号'%s'已转义!" % (book_name, i))
                    book_name = book_name.replace(i, "_")
            # Rename the downloaded file after the book title
            os.rename(self.IMAGES_STORE + '/' + image_path[0], self.IMAGES_STORE + '/' + book_name + ".jpg")
        else:
            print("无封面图,无需下载:%s" % book_name)
        return item

    def close_spider(self, spider):
        print("图片下载完成!")
        # Remove the "full" folder, and the image folder itself, if they end up empty
        path = self.IMAGES_STORE + '/full'
        if os.path.exists(path) and not os.listdir(path):
            os.rmdir(path)
        if os.path.exists(self.IMAGES_STORE) and not os.listdir(self.IMAGES_STORE):
            os.rmdir(self.IMAGES_STORE)
class XlsxPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.title = "dushu网图书信息表"
        self.ws.append(['book_id', '一级分类', '二级分类', '三级分类', '四级分类', '书名',
                        '作者', '标签', 'ISBN', '价格(元)', '简介',
                        '封面图', '书的链接地址'])

    def process_item(self, item, spider):
        text = [item['book_id'], item['firstTitle'], item['secondTitle'], item['threeTitle'], item['fourTitle'], item['book_name'],
                item['book_author'], item['book_tag'], item['book_isbn'], item['book_price'], item['book_info'],
                item['cover_img_url'], item['book_url']]
        self.ws.append(text)
        return item

    def close_spider(self, spider):
        time_file = time.strftime("%Y-%m-%d", time.localtime())
        self.wb.save(spider.name + time_file + ".xlsx")
        print("表格数据处理完毕,谢谢使用!")
class MysqlPipeline():
    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection values from settings.py
        cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
        cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
        cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
        cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
        cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
        cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
        return cls()

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT, user=self.MYSQL_USER,
                                  passwd=self.MYSQL_PASSWD, db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        try:
            # Create the table on first use
            sql = 'CREATE TABLE IF NOT EXISTS dushu(book_id BIGINT PRIMARY KEY NOT NULL, firstTitle VARCHAR(15),' \
                  'secondTitle VARCHAR(20), threeTitle VARCHAR(20), fourTitle VARCHAR(20), book_name VARCHAR(200) NOT NULL,' \
                  'book_author VARCHAR(200), book_tag VARCHAR(100), book_isbn VARCHAR(50), book_price VARCHAR(20), book_info TEXT,' \
                  'cover_img_url VARCHAR(200), book_url VARCHAR(200)) ENGINE=InnoDB DEFAULT CHARSET="utf8mb4";'
            self.cursor.execute(sql)
        except Exception:
            pass
        try:
            self.cursor.execute("SELECT book_id FROM dushu WHERE book_id = %s;", (item['book_id'],))
            switch = self.cursor.fetchone()
            keys, values = zip(*item.items())
            if switch:
                # The book is already stored: overwrite it via ON DUPLICATE KEY UPDATE
                sql = """INSERT INTO dushu({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};""".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values)),
                    ','.join(['{}=%s'.format(k) for k in keys])
                )
                self.cursor.execute(sql, values * 2)
            else:
                sql = """INSERT INTO dushu({}) VALUES ({});""".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values))
                )
                self.cursor.execute(sql, values)
            self.db.commit()
            return item
        except Exception as e:
            print("出错了:", e)
            self.db.rollback()

    def close_spider(self, spider):
        print("mysql数据库处理完毕")
        self.cursor.close()
        self.db.close()
class MongoPipeline():
    @classmethod
    def from_crawler(cls, crawler):
        cls.MONGO_HOST = crawler.settings.get('MONGO_HOST')
        cls.MONGO_PORT = crawler.settings.get('MONGO_PORT')
        cls.MONGO_DB = crawler.settings.get('MONGO_DB')
        return cls()

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.MONGO_HOST, self.MONGO_PORT)
        # Counts how many documents get inserted during this run
        self.num = 0

    def process_item(self, item, spider):
        try:
            self.db = self.client[self.MONGO_DB]
            self.book = self.db[spider.name]
            # If the collection already holds data, only insert books that are not stored yet
            count = self.book.count_documents({})
            # insert_one expects a plain dict, not an Item object
            data = dict(item) if isinstance(item, Item) else item
            if count == 0:
                print("MongoDB数据库无数据,直接插入数据!")
                self.book.insert_one(data)
                self.num += 1
            else:
                book_name = item['book_name']
                count = self.book.count_documents({'book_name': book_name})
                if count == 0:
                    print("%s:添加数据库中..." % book_name)
                    self.book.insert_one(data)
                    self.num += 1
                else:
                    print("%s:该数据已存在无需添加!" % book_name)
            return item
        except Exception as e:
            print("MongoDB数据库出错:", e)

    def close_spider(self, spider):
        print("已完成,本次共保存到MongoDB数据库:%s条数据!" % self.num)
        self.client.close()
class RedisPipeline():
    @classmethod
    def from_crawler(cls, crawler):
        # cls.REDIS_HOST = crawler.settings.get('REDIS_HOST')
        cls.REDIS_HOST = get_project_settings().get('REDIS_HOST')
        cls.REDIS_PORT = crawler.settings.get('REDIS_PORT')
        cls.REDIS_DBNAME = crawler.settings.get('REDIS_DBNAME')
        cls.REDIS_decode_responses = crawler.settings.get('REDIS_decode_responses')
        return cls()

    def open_spider(self, spider):
        try:
            self.redis_client = redis.StrictRedis(self.REDIS_HOST, self.REDIS_PORT, self.REDIS_DBNAME,
                                                  decode_responses=self.REDIS_decode_responses)
        except Exception as e:
            print("redis数据库出错:", e)

    def process_item(self, item, spider):
        # sadd returns 1 for a new member and 0 for a duplicate, so repeated book
        # names are dropped before they reach the pipelines that follow
        if self.redis_client.sadd('dushu:items', item['book_name']):
            return item
        raise DropItem("duplicate book: %s" % item['book_name'])

    def close_spider(self, spider):
        print("redis处理完成!")
6. Configure the settings file (settings.py)
LOG_FILE = "dushu.log"
LOG_LEVEL = "DEBUG"
LOG_STDOUT = True
IMAGES_STORE = './images'
# MySQL settings
MYSQL_HOST = "localhost"
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWD = "123456"
MYSQL_DBNAME = "python5"
MYSQL_CHARSET = "utf8mb4"
# MongoDB settings
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'py4'
# Redis settings
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DBNAME = 4
REDIS_decode_responses = True
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);'
}
# Enable only the pipelines that are actually needed (see the trimmed example below)
ITEM_PIPELINES = {
    'Dushu.pipelines.ImagePipeline': 3,
    'Dushu.pipelines.RedisPipeline': 300,
    'Dushu.pipelines.XlsxPipeline': 301,
    'Dushu.pipelines.MysqlPipeline': 302,
    'Dushu.pipelines.MongoPipeline': 303,
}
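For instance, assuming only the cover-image download and the Excel export are wanted (an illustrative subset, not a required configuration), the dict can be trimmed to:

ITEM_PIPELINES = {
    'Dushu.pipelines.ImagePipeline': 3,
    'Dushu.pipelines.XlsxPipeline': 301,
}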
7. Remember to start the MySQL/Redis/MongoDB services beforehand and to create the corresponding database and table; the table must contain every column that MysqlPipeline writes:
CREATE TABLE IF NOT EXISTS dushu(book_id BIGINT PRIMARY KEY NOT NULL,
    firstTitle VARCHAR(15), secondTitle VARCHAR(20),
    threeTitle VARCHAR(20), fourTitle VARCHAR(20),
    book_name VARCHAR(200) NOT NULL, book_author VARCHAR(200),
    book_tag VARCHAR(100), book_isbn VARCHAR(50),
    book_price VARCHAR(20), book_info TEXT,
    cover_img_url VARCHAR(200),
    book_url VARCHAR(200)) ENGINE=InnoDB DEFAULT CHARSET="utf8mb4";
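Since the crawl fails early if any of the three services is unreachable, a quick stand-alone check can be run first. A minimal sketch, assuming the connection values from settings.py above (the script itself is not part of the project):

import pymysql
import pymongo
import redis

# MySQL: connect and create the target database if it does not exist yet
db = pymysql.connect(host="localhost", port=3306, user="root", password="123456", charset="utf8mb4")
db.cursor().execute("CREATE DATABASE IF NOT EXISTS python5 DEFAULT CHARACTER SET utf8mb4;")
db.close()

# MongoDB: a ping confirms the server is reachable (databases are created lazily)
pymongo.MongoClient("localhost", 27017).admin.command("ping")

# Redis: ping() raises an exception if the server is not running
redis.StrictRedis("localhost", 6379, 4).ping()

print("MySQL / MongoDB / Redis are all reachable")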
8. With all of the above in place, start crawling: run the project's crawl command to launch the Spider:
scrapy crawl dushu