爬虫--Scrapy(实战多item多表存储)
# settings.py
BOT_NAME = 'xpc'
SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'
# Default Scrapy template user agent — identifies the bot to the site.
USER_AGENT = 'xpc (+http://www.yourdomain.com)'
# robots.txt is deliberately not honored for this crawl.
ROBOTSTXT_OBEY = False
# 1 second between requests to stay polite and avoid throttling/bans.
DOWNLOAD_DELAY = 1
# Route every scraped item through the MySQL storage pipeline below.
ITEM_PIPELINES = {
    'xpc.pipelines.XpcPipeline': 300,
}
# spiders/myxpc.py (the spider module)
import os
import re
import scrapy
from xpc.items import *
class MyxpcSpider(scrapy.Spider):
    """Crawl xinpianchang.com and yield one Item type per target table:

    PostItem (videos), ComposersItem (creators), CopyrightsItem
    (post<->creator role rows) and CommentsItem (comments, page 1 only).
    """
    name = 'myxpc'
    allowed_domains = ['xinpianchang.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']

    def parse(self, response, **kwargs):
        """Channel index page: build a partial PostItem per video in the
        list, then request its detail page to fill in the remaining fields."""
        # One <li> per video on the channel page.
        li_list = response.xpath('//div[@class="channel-con"]/ul[@class="video-list"]/li')
        for li in li_list:
            # post id
            pid = li.xpath('./@data-articleid').get()
            # post title
            title = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/a/p/text()').get()
            # thumbnail (the site lazy-loads images; real URL sits in @_src)
            thumbnail = li.xpath('./a/img/@_src').get()
            # post category
            category = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').get()
            # publish time
            created_at = li.xpath('./a/div[@class="video-hover-con"]/p/text()').get()
            # post description
            description = li.xpath('./a/div[@class="video-hover-con"]/div/text()').get()
            # number of likes
            like_counts = li.xpath('./div[@class="video-con"]//span[@class="fw_300 c_b_9 icon-like"]/text()').get()

            post_item = PostItem()
            post_item['pid'] = pid
            post_item['title'] = title
            post_item['thumbnail'] = thumbnail
            post_item['category'] = category
            post_item['created_at'] = created_at
            post_item['description'] = description
            post_item['like_counts'] = like_counts

            # The detail page supplies play_counts plus the video fields;
            # carry the half-built item along in request.meta.
            post_url = 'https://www.xinpianchang.com/a%s?from=ArticleList' % pid
            request = scrapy.Request(url=post_url, callback=self.post_detail)
            request.meta['post_item'] = post_item
            yield request

    def post_detail(self, response):
        """Video detail page: play count, video-API request, creators,
        copyright rows, and the first page of comments."""
        post_item = response.meta.get('post_item')
        pid = post_item['pid']
        # play count
        play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
        post_item['play_counts'] = play_counts

        # The video id is embedded in inline JS as: vid = "...";
        # guard instead of indexing findall()[0], which raised IndexError
        # whenever the page layout changed.
        vid_match = re.search(r'vid = "(.*?)"', response.text)
        if vid_match:
            video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/%s?appKey=61a2f329348b3bf77' % vid_match.group(1)
            request = scrapy.Request(url=video_url, callback=self.vido_detail)
            request.meta['post_item'] = post_item
            yield request

        # Creators credited on this post.
        li_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li')
        for li in li_list:
            # creator id
            cid = li.xpath('./a/@data-userid').get()
            composer_item = ComposersItem()
            composer_item['cid'] = cid
            # The creator's profile page carries the remaining fields.
            href = li.xpath('./a/@href').get()
            href = "https://www.xinpianchang.com/" + href
            request2 = scrapy.Request(url=href, callback=self.composer_detail)
            request2.meta['composer_item'] = composer_item
            yield request2

            # Copyright: link-table row joining post and creator.
            copyrights_item = CopyrightsItem()
            copyrights_item['pcid'] = f'{pid}_{cid}'
            copyrights_item['pid'] = pid
            copyrights_item['cid'] = cid
            copyrights_item['roles'] = li.xpath('.//span[@class="roles fs_12 fw_300 c_b_9"]/text()').get()
            yield copyrights_item

        # Comments: requested once per post (not once per creator).
        # Only page 1 is fetched; pagination is not followed here.
        comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24' % pid
        yield scrapy.Request(url=comment_url, callback=self.comment_detail)

    # NOTE(review): "vido" is a typo for "video"; the name is kept so any
    # external references to this callback keep working.
    def vido_detail(self, response):
        """Video API (JSON): duration, preview image and the first
        progressive rendition's URL/format; yields the finished PostItem."""
        post_item = response.meta.get('post_item')
        content = response.json()
        data = content['data']
        post_item['duration'] = data['duration']
        post_item['preview'] = data['cover']
        # First entry of the progressive stream list.
        progressive = data['resource']['progressive'][0]
        post_item['video'] = progressive['url']
        post_item['video_format'] = progressive['mime']
        yield post_item

    def composer_detail(self, response):
        """Creator profile page: fill in the remaining ComposersItem fields."""
        composer_item = response.meta['composer_item']
        # The banner URL is buried in an inline style, e.g.
        # background-image:url(https://cs.xinpianchang.com/.../bg-big.jpg)
        # Guard against a missing style attribute / missing "https" marker.
        banner_style = response.xpath('//div[@class="banner-wrap"]/@style').get() or ''
        start = banner_style.find("https")
        banner = banner_style[start:-1] if start != -1 else None
        # avatar
        avatar = response.xpath('//span[@class="avator-wrap-s"]/img/@src').get()
        # verified badge present?
        verified = response.xpath('//span[@class="avator-wrap-s"]/span[contains(@class, "author-v")]').get()
        verified = 'yes' if verified else 'no'
        # display name
        name = response.xpath('//p[@class="creator-name fs_26 fw_600 c_b_26"]/text()').get()
        # self introduction
        intro = response.xpath('//p[@class="creator-desc fs_14 fw_300 c_b_3 line-hide-1"]/text()').get()
        # Counters arrive with thousands separators; the nodes can be
        # missing (None), so strip commas None-safely instead of crashing.
        like_counts = self._strip_commas(response.xpath('//span[@class="like-counts fw_600 v-center"]/text()').get())
        fans_counts = self._strip_commas(response.xpath('//span[@class="fans-counts fw_600 v-center"]/text()').get())
        follow_counts = self._strip_commas(response.xpath('//span[@class="fw_600 v-center"]/text()').get())
        # location
        location = response.xpath('//p[@class="creator-detail fs_14 fw_300 c_b_9"]/span[@class="v-center"][1]/text()').get()
        # occupation
        career = response.xpath('//p[@class="creator-detail fs_14 fw_300 c_b_9"]/span[@class="v-center"][2]/text()').get()
        composer_item['banner'] = banner
        composer_item['avatar'] = avatar
        composer_item['verified'] = verified
        composer_item['name'] = name
        composer_item['intro'] = intro
        composer_item['like_counts'] = like_counts
        composer_item['fans_counts'] = fans_counts
        composer_item['follow_counts'] = follow_counts
        composer_item['location'] = location
        composer_item['career'] = career
        yield composer_item

    @staticmethod
    def _strip_commas(text):
        """Remove thousands separators from a counter string; None-safe."""
        return text.replace(',', '') if text else text

    def comment_detail(self, response):
        """Comment API (JSON): one CommentsItem per comment on page 1."""
        content = response.json()
        for comment in content['data']['list']:
            item = CommentsItem()
            item['commentid'] = comment['id']
            item['pid'] = comment['resource_id']
            item['cid'] = comment['userid']
            item['avatar'] = comment['userInfo']['avatar']
            item['uname'] = comment['userInfo']['username']
            item['created_at'] = comment['addtime']
            item['content'] = comment['content']
            item['like_counts'] = comment['count_approve']
            # 'referer' marks a reply: store the parent comment id, else '0'.
            if comment.get('referer'):
                item['reply'] = comment['referer']['id']
            else:
                item['reply'] = '0'
            yield item
# items.py
from scrapy import Item, Field
# Post (short video) record
class PostItem(Item):
    # Target MySQL table, read by XpcPipeline.
    table_name = 'posts'
    pid = Field()           # primary key of the posts table
    title = Field()         # post title
    thumbnail = Field()     # video thumbnail
    category = Field()      # post category
    created_at = Field()    # publish time
    description = Field()   # post description
    like_counts = Field()   # number of likes
    duration = Field()      # playback duration
    preview = Field()       # video preview image
    video = Field()         # video URL
    video_format = Field()  # video format (MIME)
    play_counts = Field()   # number of plays
# Creator (author) record
class ComposersItem(Item):
    # Target MySQL table, read by XpcPipeline.
    table_name = 'composers'
    cid = Field()           # primary key of the composers table
    banner = Field()        # profile-page banner image
    avatar = Field()        # avatar image
    verified = Field()      # verified badge ('yes'/'no')
    name = Field()          # display name
    intro = Field()         # self introduction
    like_counts = Field()   # times liked
    fans_counts = Field()   # follower count
    follow_counts = Field() # following count
    location = Field()      # location
    career = Field()        # occupation
# Comment record
class CommentsItem(Item):
    # Target MySQL table, read by XpcPipeline.
    table_name = 'comments'
    commentid = Field()     # primary key of the comments table
    pid = Field()           # id of the commented post
    cid = Field()           # commenter's user id
    avatar = Field()        # commenter's avatar
    uname = Field()         # commenter's name
    created_at = Field()    # publish time
    content = Field()       # comment text
    like_counts = Field()   # number of likes
    reply = Field()         # id of the comment this replies to; '0' if not a reply
# Copyright: link record joining a post with one of its creators
class CopyrightsItem(Item):
    # Target MySQL table, read by XpcPipeline.
    table_name = 'copyrights'
    pcid = Field()          # primary key, formatted '<pid>_<cid>'
    pid = Field()           # posts table primary key
    cid = Field()           # composers table primary key
    roles = Field()         # role(s) the creator played on the post
# pipelines.py (the core of this example)
import os
from itemadapter import ItemAdapter
import pymysql
from urllib import request
class XpcPipeline:
    """Persist every scraped Item into MySQL.

    Each Item class declares its target table via a ``table_name`` class
    attribute. Rows are written with INSERT ... ON DUPLICATE KEY UPDATE
    (an upsert): if the primary/unique key already exists the row is
    updated, otherwise it is inserted — so re-crawling refreshes data
    instead of failing on duplicate keys.
    """

    def open_spider(self, spider):
        """Open one MySQL connection for the whole crawl."""
        # utf8mb4 so 4-byte characters (e.g. emoji in comments) are storable.
        self.db = pymysql.connect(
            host='localhost', port=3306,
            user='root', password='root',
            database='xpc_2020', charset='utf8mb4'
        )
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        """Release the cursor and connection when the crawl ends."""
        self.cursor.close()
        self.db.close()

    @staticmethod
    def _build_upsert_sql(table_name, keys):
        """Build the parameterized upsert statement for ``table_name``.

        For ('copyrights', ['pcid', 'pid', 'cid', 'roles']) this returns:
        insert into `copyrights`(`pcid`,`pid`,`cid`,`roles`)
        values(%s,%s,%s,%s)
        on duplicate key update `pcid`=%s,`pid`=%s,`cid`=%s,`roles`=%s
        The %s placeholders appear twice (insert part + update part), so
        callers must pass the value list doubled.
        """
        # Backtick-quoted column list: "`pcid`,`pid`,..."
        key_str = ','.join('`%s`' % k for k in keys)
        # One %s placeholder per column: "%s,%s,..."
        values_str = ','.join(['%s'] * len(keys))
        # Update clause: "`pcid`=%s,`pid`=%s,..."
        update_str = ','.join('`{}`=%s'.format(k) for k in keys)
        return 'insert into `{}`({}) values({}) on duplicate key update {}'.format(
            table_name, key_str, values_str, update_str
        )

    def process_item(self, item, spider):
        """Upsert one item; on DB error, roll back and keep crawling."""
        table_name = item.table_name
        keys = list(item.keys())      # e.g. ['pcid', 'pid', 'cid', 'roles']
        values = list(item.values())  # e.g. ['333', '222', '111', 'producer']
        sql = self._build_upsert_sql(table_name, keys)
        try:
            # values doubled: once for VALUES(...), once for the UPDATE part.
            self.cursor.execute(sql, values * 2)
            self.db.commit()
        except pymysql.MySQLError:
            # Roll back so the connection stays usable; one bad row must
            # not abort the whole crawl.
            self.db.rollback()
            spider.logger.exception('failed to upsert into %s', table_name)
        else:
            spider.logger.debug('upserted one row into %s', table_name)
        return item