1. Create the Scrapy project
scrapy startproject CbsProductRedis
2. Change into the project directory and create the spider with the genspider command
scrapy genspider cbsproductredis XXXX.com
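For reference, running the two commands above normally produces a layout like the following (file names come from Scrapy's default project template):

CbsProductRedis/
    scrapy.cfg
    CbsProductRedis/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            cbsproductredis.py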
3. Define the data to scrape (items.py)
# -*- coding: utf-8 -*-
# Scrape the online product information of CBS merchants and save it to Excel
import scrapy


class CbsproductredisItem(scrapy.Item):
    # Row number
    number_list = scrapy.Field()
    # Product ID
    id_list = scrapy.Field()
    # Merchant name
    company_name = scrapy.Field()
    # Platform category
    platform_class = scrapy.Field()
    # Merchant category
    company_class = scrapy.Field()
    # Product name
    product_name_list = scrapy.Field()
    # Sales status
    sale_list = scrapy.Field()
    # Sales title (contains whitespace)
    sales_title = scrapy.Field()
    # Applicable region
    product_area = scrapy.Field()
    # Brand
    brand_list = scrapy.Field()
    # Specification
    product_size = scrapy.Field()
    # Minimum order quantity
    product_quantity = scrapy.Field()
    # Retail price
    retail_price = scrapy.Field()
    # Retail promotion price
    promotion_price = scrapy.Field()
    # Cover image URLs
    cover_image_link = scrapy.Field()
    # Detail-page image URLs
    detail_image_link = scrapy.Field()
    # sku id, optional
    # skuid = scrapy.Field()
4. Write the spider that extracts the item data (spiders/cbsproductredis.py)
# -*- coding: utf-8 -*-
# Scrape the online product information of CBS merchants and save it to Excel
import scrapy
from CbsProductRedis.items import CbsproductredisItem
from scrapy_redis.spiders import RedisSpider
import re


class CbsproductredisSpider(RedisSpider):
    name = 'cbsproductredis'
    allowed_domains = ['XXXX.com']
    # start_urls = ['http://cbs.XXXX.com/']
    redis_key = "CbsproductredisSpider:start_urls"
    # All products, including delisted ones:
    # lpush CbsproductredisSpider:start_urls https://cbs.XXXX.com/item-p=1
    # Online products only:
    # lpush CbsproductredisSpider:start_urls https://cbs.XXXX.com/item/itemonline-p=1
    login_page = 'https://cbs.XXXXX.com/login.html'
    def start_requests(self):
        yield scrapy.Request(url=self.login_page, callback=self.login)

    def login(self, response):
        self.username = input("Please enter the account: ")
        self.password = input("Please enter the password: ")
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"j_username": self.username, "j_password": self.password},
            callback=self.parse_page
        )

    # Check whether the login succeeded, then visit pages that require a login
    def parse_page(self, response):
        if "errorMsg" in response.body.decode('utf-8'):
            print("Login failed: wrong phone number or password!")
        # A successful login returns a page containing "</span>首页" ("home page")
        if "</span>首页" in response.body.decode('utf-8'):
            print("Welcome '%s', you have logged in to the CBS management system!" % self.username)
            print("On the slave (the machine running the spider), enter: lpush %s <list-page URL>" % self.redis_key)
            # After a successful login, fetch the merchant page; the list pages
            # pushed to Redis are handled by parse()
            yield scrapy.Request('https://cbs.XXXX.com/system/company.html', callback=self.parse_company)

    # Extract the merchant name
    def parse_company(self, response):
        # The merchant name has leading/trailing spaces; passing it via meta did not
        # work for some reason, so it is stored on self instead
        company_name = response.xpath("//div[@id='tabs-1']/p[1]/span/text()").extract()
        self.filename = company_name[0].strip()

    def parse(self, response):
        items = []
        # Next-page links; they need to be joined with 'https://cbs.XXXX.com/item/itemon.html'
        next_url_list = response.xpath('//body//div//div/span/span[@class="paginate_button"]/a/@href').extract()
        for each in response.xpath('//div[@class="dataTables_wrapper"]'):
            # Row number
            number_list = each.xpath('.//td[1]/text()').extract()
            # Brand
            brand_list = each.xpath('.//td[2]/text()').extract()
            # Product name
            product_name_list = each.xpath('.//td[3]/a/text()').extract()
            # Merchant category
            company_class = each.xpath('.//td[4]/text()').extract()
            # Platform category
            platform_class = each.xpath('.//td[5]/text()').extract()
            # Product ID; needs post-processing, e.g. javascript:show('105047')
            id_list = each.xpath('.//td[3]/a/@href').extract()
            pattern_id = re.compile(r"javascript:show\('(\d+)'\)")
            for i in range(len(id_list)):
                item = CbsproductredisItem()
                item['number_list'] = number_list[i].strip()
                item['brand_list'] = brand_list[i].strip()
                item['company_class'] = company_class[i].strip()
                item['platform_class'] = platform_class[i].strip()
                item['product_name_list'] = product_name_list[i].strip()
                # Extract the numeric ID
                item['id_list'] = pattern_id.search(id_list[i]).group(1)
                item['company_name'] = self.filename
                items.append(item)
        # print(items)
        for item in items:
            id_url = 'https://cbs.XXXX.com/item/show.html?item.id=' + item['id_list']
            yield scrapy.Request(url=id_url, meta={'meta_2': item}, callback=self.parse_id)
        # Extract the page number from the link, e.g. ?d-49-p=1
        pattern_next_url = re.compile(r"\?d-49-p=(\d+)")
        if len(next_url_list) == 0:
            print("Processing page 1 ....")
        else:
            for url in next_url_list:
                i = pattern_next_url.search(url).group(1)
                print("Processing page %s ...." % i)
                fullurl = 'https://cbs.XXXX.com/item/itemon.html' + str(url)
                yield scrapy.Request(url=fullurl, callback=self.parse)

    # Follow the ID link to get price, specification, minimum order quantity, etc.
    def parse_id(self, response):
        meta_2 = response.meta['meta_2']
        item = CbsproductredisItem()
        # Sales title
        sales_title = response.xpath('//div[@id="tabs-1"]/p[8]/span[@class="field"]/text()').extract()
        # Applicable region
        product_area = response.xpath('//div[@id="tabs-1"]/p[2]/span/text()').extract()
        # Sales status (contains whitespace)
        sale_list = response.xpath('//div[@id="tabs-1"]/p[6]/span/text()').extract()
        # Cover image URLs
        cover_image_link = response.xpath('//div[@id="tabs"]/div[@id="tabs-3"]/div/span/img/@src').extract()
        # Detail-page image URLs
        detail_image_link = response.xpath('//div[@id="tabs"]/div[@id="tabs-4"]//@src').extract()
        # Specifications
        product_size = response.xpath('//div[@id="tabs"]/div[@id="tabs-5"]/table/tbody/tr/td[1]/text()').extract()
        # Minimum order quantities
        product_quantity = response.xpath('//div[@id="tabs"]/div[@id="tabs-5"]/table/tbody/tr/td[8]/text()').extract()
        # The sku id link for each specification, e.g. javascript:show('688')
        skuid_list = response.xpath('//div[@id="tabs"]/div[@id="tabs-5"]/table/tbody/tr/td[9]/a/@href').extract()
        # Regex to pull the number out of each skuid_list entry
        pattern = re.compile(r"\d+")
        # The number of cover images varies; keep them as a list so the image
        # pipeline can simply concatenate them
        item['cover_image_link'] = cover_image_link
        # There may also be several detail images; keep them as a list for the same reason
        item['detail_image_link'] = detail_image_link
        # Every specification has a matching minimum order quantity; both are required
        # fields, so they are never empty for an online product
        for i in range(len(product_size)):
            item['sales_title'] = sales_title[0].strip()
            item['sale_list'] = sale_list[0].strip()
            item['product_area'] = product_area[0].strip()
            item['product_size'] = product_size[i]
            item['product_quantity'] = product_quantity[i]
            item['number_list'] = meta_2['number_list']
            item['id_list'] = meta_2['id_list']
            item['company_class'] = meta_2['company_class']
            item['platform_class'] = meta_2['platform_class']
            item['product_name_list'] = meta_2['product_name_list']
            item['brand_list'] = meta_2['brand_list']
            item['company_name'] = meta_2['company_name']
            # Extract the sku id number from javascript:show('688')
            skuid_number = pattern.search(skuid_list[i]).group()
            # The sku id could also be stored on the item, but it is not needed here
            # item['skuid'] = skuid_number
            skuid_url = "https://cbs.XXXX.com/item/showi.html?sku.id=" + skuid_number
            yield scrapy.Request(url=skuid_url, meta={'meta_3': item}, callback=self.parse_skuid)

    def parse_skuid(self, response):
        # Retrieve the item passed along in the request meta
        meta_3 = response.meta['meta_3']
        item = CbsproductredisItem()
        # Retail prices: drop duplicates with set() and convert back to a list
        retail_price_list = response.xpath('//div[@id="tabs-1"]/table[@id="item"]/tbody/tr/td[2]/text()').extract()
        retail_price = list(set(retail_price_list))
        for i in range(len(retail_price)):
            if retail_price[i] == "0.0":
                retail_price[i] = 'Retail price to be determined'
            elif retail_price[i] == "0.00":
                retail_price[i] = 'Invalid retail price 0.00'
        # If there are several prices, join them with semicolons
        if len(retail_price) > 1:
            item['retail_price'] = ";".join(retail_price)
        elif len(retail_price) == 1:
            item['retail_price'] = retail_price[0]
        # Retail promotion prices: drop duplicates with set() and convert back to a list
        promotion_price_list = response.xpath('//div[@id="tabs-1"]/table[@id="item"]/tbody/tr/td[3]/text()').extract()
        promotion_price = list(set(promotion_price_list))
        for i in range(len(promotion_price)):
            if promotion_price[i] == "0.0":
                promotion_price[i] = 'No promotion price'
            elif promotion_price[i] == "0.00":
                promotion_price[i] = 'Invalid promotion price 0.00'
        # If there are several promotion prices, join them with semicolons
        if len(promotion_price) > 1:
            item['promotion_price'] = ";".join(promotion_price)
        elif len(promotion_price) == 1:
            item['promotion_price'] = promotion_price[0]
        item['number_list'] = meta_3['number_list']
        # item['skuid'] = meta_3['skuid']
        item['brand_list'] = meta_3['brand_list']
        item['product_area'] = meta_3['product_area']
        item['product_name_list'] = meta_3['product_name_list']
        item['company_class'] = meta_3['company_class']
        item['platform_class'] = meta_3['platform_class']
        item['sale_list'] = meta_3['sale_list']
        item['sales_title'] = meta_3['sales_title']
        item['product_size'] = meta_3['product_size']
        item['product_quantity'] = meta_3['product_quantity']
        item['cover_image_link'] = meta_3['cover_image_link']
        item['detail_image_link'] = meta_3['detail_image_link']
        item['id_list'] = meta_3['id_list']
        item['company_name'] = meta_3['company_name']
        yield item
5. Write the pipelines that save the data; results can be written to a file (pipelines.py)
# -*- coding: utf-8 -*-
import time
import json
import os
import shutil
import scrapy
from openpyxl import Workbook
from scrapy.utils.project import get_project_settings
from scrapy.pipelines.images import ImagesPipeline


# Subclass ImagesPipeline to download and rename the images.
# Reference: https://blog.csdn.net/z564359805/article/details/80693578
class ImagePipeline(ImagesPipeline):
    # Holds the last path segment of every cover/detail image URL already handled,
    # e.g. ['isi15172790875423.png']
    image_url_list = []
    # Image root directory configured in settings (IMAGES_STORE = ./images)
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        # Request cover and detail images together; set() removes duplicates so
        # images shared by several specifications are only downloaded once
        image_url_request = list(set(item['cover_image_link'] + item['detail_image_link']))
        for image_url in image_url_request:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # Cover and detail image links are both lists; get the total count
        length = len(item['cover_image_link']) + len(item['detail_image_link'])
        for i in range(length):
            # Per-product directory, e.g. ./images/<company name>/<product name>
            self.IMAGES_filename = self.IMAGES_STORE + "/" + item['company_name'] + "/" + item['product_name_list']
            # Create the directory if it does not exist
            if not os.path.exists(self.IMAGES_filename):
                os.makedirs(self.IMAGES_filename)
            # image_url is taken from the combined list of cover and detail image links
            image_url = (item['cover_image_link'] + item['detail_image_link'])[i]
            print('\n' + "Processing image: %s" % image_url)
            # Keep the part after the last "/" of .../isi/isi15172790875423.png,
            # i.e. ['isi15172790875423.png'] (kept as a list)
            end_image_url = image_url.split("/")[-1:]
            # Only save the image if it has not been handled yet; this avoids downloading
            # the same file again for multiple specifications and then failing with a
            # file-not-found error. The item ID cannot be used for this check: it would be
            # recorded after the first cover image, so only one of several cover images
            # would ever be saved.
            if end_image_url not in self.image_url_list:
                # self.image_url_list ends up like [['isi15172178775323.png'], ['img15172798785790.jpg']]
                self.image_url_list.append(end_image_url)
                image_path = [x['path'] for ok, x in results if ok]
                # e.g. ['full/9b43463a5f9009a92c21cc8580e692fb90653fdc.jpg', ...]
                # Cover images are always jpg/png/gif; only detail images may be jpeg
                if image_url[-4:].lower() in ['.jpg', '.gif', '.png']:
                    # ./images/full/9b43...fdc.jpg -> ./images/<company name>/<product name>/img-1.jpg
                    os.rename(self.IMAGES_STORE + '/' + image_path[i],
                              self.IMAGES_filename + '/img-' + str(i + 1) + image_url[-4:])
                else:
                    # Any other extension is renamed to .jpg
                    os.rename(self.IMAGES_STORE + '/' + image_path[i],
                              self.IMAGES_filename + '/img-' + str(i + 1) + ".jpg")
        return item


# Encoder that converts bytes to str for JSON serialization
class MyEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)


# Save the data to an Excel workbook
class CbsproductredisPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row
        self.ws.append(['No.', 'ID', 'Brand', 'Product name',
                        'Merchant category', 'Platform category', 'Sales status', 'Sales title',
                        'Price', 'Promotion price', 'Specification', 'Min. order quantity',
                        'Applicable region'
                        ])

    def process_item(self, item, spider):
        try:
            text = [item['number_list'], item['id_list'], item['brand_list'], item['product_name_list'],
                    item['company_class'], item['platform_class'], item['sale_list'], item['sales_title'],
                    item['retail_price'], item['promotion_price'], item['product_size'], item['product_quantity'],
                    item['product_area']
                    ]
            self.ws.append(text)
            # Build the output file name from the merchant name plus the current date
            self.file_end_name = item['company_name'] + time.strftime("%Y-%m-%d", time.localtime())
        except Exception as e:
            print("Error:", e)
        return item

    def close_spider(self, spider):
        # The file name already carries the merchant name and the current date
        # file_end_name = time.strftime("%Y-%m-%d", time.localtime())
        self.wb.save(self.file_end_name + '.xlsx')
        print("Data processing finished, thanks for using!")
        # Finally remove the "full" folder that ImagesPipeline creates automatically
        shutil.rmtree("./images/full")
6. Configure the settings file (settings.py)
# Use the scrapy-redis dedupe component instead of Scrapy's default dupefilter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy-redis scheduler instead of Scrapy's default scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Allow pausing; the request records kept in Redis are not lost
SCHEDULER_PERSIST = True
# Root directory for saved images
IMAGES_STORE = "./images"
DOWNLOAD_DELAY = 3
# Redis connection; if these are not set, the local Redis instance is used
# REDIS_HOST = "192.168.0.109"
# REDIS_PORT = 6379
# Default scrapy-redis request queue (priority queue)
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# FIFO queue; choosing this one raised "Unhandled error in Deferred" here
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# LIFO (stack) queue
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
# Configure item pipelines: uncomment the block and add RedisPipeline
ITEM_PIPELINES = {
    'CbsProductRedis.pipelines.CbsproductredisPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
    'CbsProductRedis.pipelines.ImagePipeline': 3,
}
# Obey robots.txt rules; see https://blog.csdn.net/z564359805/article/details/80691677
ROBOTSTXT_OBEY = False
# Override the default request headers: add a User-Agent
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
# Optionally write the log to a local file
LOG_FILE = "cbsproductredis.log"
LOG_LEVEL = "DEBUG"
# Redirect stdout (including print output) into the log as well
LOG_STDOUT = True
7. Start the Redis database by following this guide:
https://www.cdsy.xyz/computer/soft/database/redis/230308/cd41219.html
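If Redis is already installed, bringing it up locally for a quick test usually only takes the two commands below (assuming a default installation; the linked article covers the detailed setup):

redis-server
redis-cli ping    (should reply PONG once the server is running)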
8. With everything configured, start crawling: run the following command from the spiders folder to launch the spider:
scrapy runspider cbsproductredis.py
9. On the master (core server), issue the push command in redis-cli, in this format:
lpush CbsproductredisSpider:start_urls https://cbs.XXXX.com/item-p=1
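Because scrapy_redis.pipelines.RedisPipeline is enabled (priority 400 in ITEM_PIPELINES), every scraped item is also serialized as JSON into a Redis list, named "<spider name>:items" by default. Below is a minimal sketch of reading those items back on the master with redis-py; the key name, host, and port are assumptions based on the scrapy-redis defaults:

# read_items.py - sketch only, assumes scrapy-redis defaults and a local Redis
import json
import redis

r = redis.StrictRedis(host="localhost", port=6379)
# RedisPipeline writes items to "<spider name>:items" by default
for raw in r.lrange("cbsproductredis:items", 0, -1):
    item = json.loads(raw)
    print(item.get("product_name_list"), item.get("retail_price"))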