Web scraping with Scrapy (hands-on: multiple Items stored into multiple tables)
- #settings.py
- BOT_NAME = 'xpc'
-
- SPIDER_MODULES = ['xpc.spiders']
- NEWSPIDER_MODULE = 'xpc.spiders'
-
-
- USER_AGENT = 'xpc (+http://www.yourdomain.com)'
-
-
- ROBOTSTXT_OBEY = False
-
-
- DOWNLOAD_DELAY = 1
-
-
- ITEM_PIPELINES = {
- 'xpc.pipelines.XpcPipeline': 300,
- }
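
The pipeline shown at the end of this post hard-codes its MySQL credentials in open_spider. As an optional sketch (the MYSQL_* keys are names assumed here, not built-in Scrapy settings), the connection parameters could live in settings.py instead and be read by the pipeline through a from_crawler classmethod with crawler.settings.get('MYSQL_HOST') and so on:

- # assumed custom keys for the MySQL pipeline (not built-in Scrapy settings)
- MYSQL_HOST = 'localhost'
- MYSQL_PORT = 3306
- MYSQL_USER = 'root'
- MYSQL_PASSWORD = 'root'
- MYSQL_DATABASE = 'xpc_2020'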
-
-
-
-
-
- # in the spider file (.py)
- import re
-
- import scrapy
-
- from xpc.items import *
-
-
- class MyxpcSpider(scrapy.Spider):
- name = 'myxpc'
- allowed_domains = ['xinpianchang.com']
- start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']
-
- # home page data
- def parse(self, response, **kwargs):
- print('*' * 60)
- print(len(response.text))
-
- # video list
- li_list = response.xpath('//div[@class="channel-con"]/ul[@class="video-list"]/li')
-
- for li in li_list:
- # post id
- pid = li.xpath('./@data-articleid').get()
- # post title
- title = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/a/p/text()').get()
- # video thumbnail
- thumbnail = li.xpath('./a/img/@_src').get()
- # post category
- category = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').get()
- # publish time
- created_at = li.xpath('./a/div[@class="video-hover-con"]/p/text()').get()
- # post description
- description = li.xpath('./a/div[@class="video-hover-con"]/div/text()').get()
- # like count
- like_counts = li.xpath('./div[@class="video-con"]//span[@class="fw_300 c_b_9 icon-like"]/text()').get()
-
- # print(pid, title, thumbnail, category, created_at, description, like_counts)
-
- # item
- post_item = PostItem()
- post_item['pid'] = pid
- post_item['title'] = title
- post_item['thumbnail'] = thumbnail
- post_item['category'] = category
-
- post_item['created_at'] = created_at
- post_item['description'] = description
- post_item['like_counts'] = like_counts
-
- # detail page of the post
- post_url = 'https://www.xinpianchang.com/a%s?from=ArticleList' % pid
- # request the detail page
- request = scrapy.Request(url=post_url, callback=self.post_detail)
- request.meta['post_item'] = post_item
- yield request
-
- # post detail page
- def post_detail(self, response):
- post_item = response.meta.get('post_item')
- pid = post_item['pid']
-
- # play count
- play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
- post_item['play_counts'] = play_counts
-
- # video
- # vid
- vid = re.findall('vid = "(.*?)"', response.text)[0]
- # print(vid)
- # video url
- video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/%s?appKey=61a2f329348b3bf77' % vid
- # print(video_url)
-
- # request the video data
- request = scrapy.Request(url=video_url, callback=self.video_detail)
- request.meta['post_item'] = post_item
- yield request
-
- # collect the creators' data
- li_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li')
- for li in li_list:
- cid = li.xpath('./a/@data-userid').get() # creator id
-
- # item
- composer_item = ComposersItem()
- composer_item['cid'] = cid
-
- # go to the creator's profile page for the remaining fields
- href = li.xpath('./a/@href').get()
- href = "https://www.xinpianchang.com/" + href
- request2 = scrapy.Request(url=href, callback=self.composer_detail)
- request2.meta['composer_item'] = composer_item
- yield request2
-
-
- # copyright
- copyrights_item = CopyrightsItem()
- copyrights_item['pcid'] = f'{pid}_{cid}'
- copyrights_item['pid'] = pid
- copyrights_item['cid'] = cid
- roles = li.xpath('.//span[@class="roles fs_12 fw_300 c_b_9"]/text()').get()
- copyrights_item['roles'] = roles
- yield copyrights_item
-
-
- # comment data
- comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24' % pid
- request3 = scrapy.Request(url=comment_url, callback=self.comment_detail)
- yield request3
-
- # video data
- def video_detail(self, response):
- post_item = response.meta.get('post_item')
-
- # parse the JSON response
- content = response.json()
-
- # duration
- duration = content['data']['duration']
- # video preview image
- preview = content['data']['cover']
- # video url
- video = content['data']['resource']['progressive'][0]['url']
- # video format
- video_format = content['data']['resource']['progressive'][0]['mime']
-
-
- # item
- post_item['duration'] = duration
- post_item['preview'] = preview
- post_item['video'] = video
- post_item['video_format'] = video_format
- yield post_item
-
- # creator data
- def composer_detail(self, response):
- composer_item = response.meta['composer_item']
-
- # banner image
- banner_style = response.xpath('//div[@class="banner-wrap"]/@style').get()
- # print(banner_style)
- # background-image:url(https://cs.xinpianchang.com/app/user/html/stars/css/skin46/bg-big.jpg)
- banner = banner_style[banner_style.find("https"): -1]
- # print(banner)
-
- # avatar
- avatar = response.xpath('//span[@class="avator-wrap-s"]/img/@src').get()
-
- # verified (has a V badge) or not
- verified = response.xpath('//span[@class="avator-wrap-s"]/span[contains(@class, "author-v")]').get()
- # print('verified: ', verified)
- verified = 'yes' if verified else 'no'
-
- # name
- name = response.xpath('//p[@class="creator-name fs_26 fw_600 c_b_26"]/text()').get()
- # print(name)
-
- # self introduction
- intro = response.xpath('//p[@class="creator-desc fs_14 fw_300 c_b_3 line-hide-1"]/text()').get()
-
- # like count
- like_counts = response.xpath('//span[@class="like-counts fw_600 v-center"]/text()').get()
- like_counts = like_counts.replace(',', "")
-
- # follower count
- fans_counts = response.xpath('//span[@class="fans-counts fw_600 v-center"]/text()').get()
- fans_counts = fans_counts.replace(',', "")
-
- # following count
- follow_counts = response.xpath('//span[@class="fw_600 v-center"]/text()').get()
- follow_counts = follow_counts.replace(',', "")
-
- # location
- location = response.xpath('//p[@class="creator-detail fs_14 fw_300 c_b_9"]/span[@class="v-center"][1]/text()').get()
-
- # occupation
- career = response.xpath('//p[@class="creator-detail fs_14 fw_300 c_b_9"]/span[@class="v-center"][2]/text()').get()
-
- composer_item['banner'] = banner
- composer_item['avatar'] = avatar
- composer_item['verified'] = verified
- composer_item['name'] = name
- composer_item['intro'] = intro
- composer_item['like_counts'] = like_counts
- composer_item['fans_counts'] = fans_counts
- composer_item['follow_counts'] = follow_counts
- composer_item['location'] = location
- composer_item['career'] = career
- yield composer_item
-
- # print(composer_item)
-
- # comment data
- def comment_detail(self, response):
- # parse the comment data
- content = response.json()
- comment_list = content['data']['list']
- for comment in comment_list:
- item = CommentsItem()
- item['commentid'] = comment['id']
- item['pid'] = comment['resource_id']
- item['cid'] = comment['userid']
- item['avatar'] = comment['userInfo']['avatar']
- item['uname'] = comment['userInfo']['username']
- item['created_at'] = comment['addtime']
- item['content'] = comment['content']
- item['like_counts'] = comment['count_approve']
- if comment.get('referer'):
- item['reply'] = comment['referer']['id']
- else:
- item['reply'] = '0'
-
- yield item
- # print(item)
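
The spider above hands the partially filled PostItem from parse to post_detail through request.meta. Scrapy (1.7+) also offers cb_kwargs for this hand-off, which delivers the object to the callback as a named argument; a minimal sketch of the same step, not part of the original code:

- # alternative to request.meta: pass the item via cb_kwargs (Scrapy >= 1.7)
- request = scrapy.Request(url=post_url, callback=self.post_detail,
-                          cb_kwargs={'post_item': post_item})
- yield request
- # the callback then receives it directly:
- # def post_detail(self, response, post_item): ...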
-
-
-
-
- # in items.py
- from scrapy import Item, Field
-
-
- # post (short video)
- class PostItem(Item):
- table_name = 'posts'
- pid = Field() # primary key of the posts table
- title = Field() # post title
- thumbnail = Field() # video thumbnail
- category = Field() # post category
-
- created_at = Field() # publish time
- description = Field() # post description
- like_counts = Field() # like count
-
- duration = Field() # duration of the video
- preview = Field() # video preview image
- video = Field() # video url
- video_format = Field() # video format
- play_counts = Field() # play count
-
-
- # creator
- class ComposersItem(Item):
- table_name = 'composers'
- cid = Field() # primary key of the composers table
- banner = Field() # banner image on the profile page
- avatar = Field() # user avatar
- verified = Field() # whether the user is verified (has a V badge)
-
- name = Field() # name
- intro = Field() # self introduction
- like_counts = Field() # like count
- fans_counts = Field() # follower count
-
- follow_counts = Field() # following count
- location = Field() # location
- career = Field() # occupation
-
-
- # comment
- class CommentsItem(Item):
- table_name = 'comments'
- commentid = Field() # primary key of the comments table
- pid = Field() # id of the post being commented on
-
- cid = Field() # commenter id
- avatar = Field() # commenter avatar
- uname = Field() # commenter name
-
- created_at = Field() # publish time
- content = Field() # comment content
- like_counts = Field() # like count
- reply = Field() # id of the comment this one replies to, 0 if it is not a reply
-
-
- # copyright
- class CopyrightsItem(Item):
- table_name = 'copyrights'
- pcid = Field() # primary key, built as pid_cid
- pid = Field() # primary key of the related post
- cid = Field() # primary key of the related creator
- roles = Field() # role(s) the creator played on this post
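
Every Item class above carries a table_name attribute, which the pipeline below uses to route each item into its own MySQL table, so the four tables must already exist in the xpc_2020 database. A minimal one-off creation sketch; the column types and lengths are assumptions, only the primary keys (pid, cid, commentid, pcid) matter for the pipeline's upsert logic:

- # one-off helper: create the four tables the pipeline expects
- # (column types/lengths are assumptions; primary keys must match the item keys)
- import pymysql
-
- DDL = [
-     """create table if not exists posts(
-         pid varchar(32) primary key, title varchar(255), thumbnail varchar(512),
-         category varchar(128), created_at varchar(64), description text,
-         like_counts varchar(32), duration int, preview varchar(512),
-         video varchar(512), video_format varchar(64), play_counts int)
-         default charset=utf8mb4""",
-     """create table if not exists composers(
-         cid varchar(32) primary key, banner varchar(512), avatar varchar(512),
-         verified varchar(8), name varchar(128), intro text, like_counts varchar(32),
-         fans_counts varchar(32), follow_counts varchar(32), location varchar(128),
-         career varchar(128)) default charset=utf8mb4""",
-     """create table if not exists comments(
-         commentid varchar(32) primary key, pid varchar(32), cid varchar(32),
-         avatar varchar(512), uname varchar(128), created_at varchar(64),
-         content text, like_counts varchar(32), reply varchar(32))
-         default charset=utf8mb4""",
-     """create table if not exists copyrights(
-         pcid varchar(64) primary key, pid varchar(32), cid varchar(32),
-         roles varchar(128)) default charset=utf8mb4""",
- ]
-
- db = pymysql.connect(host='localhost', port=3306, user='root',
-                      password='root', database='xpc_2020', charset='utf8mb4')
- with db.cursor() as cursor:
-     for stmt in DDL:
-         cursor.execute(stmt)
- db.commit()
- db.close()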
-
-
-
-
-
-
-
- # in pipelines.py (the core part)
- import pymysql
-
- class XpcPipeline:
-
- # when the spider opens: connect to MySQL
- def open_spider(self, spider):
- self.db = pymysql.connect(
- host='localhost', port=3306,
- user='root', password='root',
- database='xpc_2020', charset='utf8mb4'
- )
- self.cursor = self.db.cursor()
-
- # when the spider closes: close the MySQL connection
- def close_spider(self, spider):
- self.cursor.close()
- self.db.close()
-
- # process each item
- def process_item(self, item, spider):
-
- table_name = item.table_name
- keys = list(item.keys()) # ['pcid', 'pid', 'cid', 'roles']
- values = list(item.values()) # ['333', '222', '111', '制作方']
-
- # comma-separated string of all column names
- key_str = ','.join(['`%s`'%k for k in keys])
- # print(key_str) # "`pcid`,`pid`,`cid`,`roles`"
-
- # comma-separated string of value placeholders
- values_str = ','.join(["%s"] * len(values))
- print(values_str) # "%s,%s,%s,%s"
-
- # the update clause string
- update_str = ','.join(["`{}`=%s".format(k) for k in keys])
- print(update_str) # "`pcid`=%s,`pid`=%s,`cid`=%s,`roles`=%s"
-
- # SQL
- sql = 'insert into `{}`({}) values({}) on duplicate key update {}'.format(
- table_name,
- key_str,
- values_str,
- update_str
- )
- # print(sql)
- # insert into `copyrights`(`pcid`,`pid`,`cid`,`roles`)
- # values(%s,%s,%s,%s)
- # on duplicate key update `pcid`=%s,`pid`=%s,`cid`=%s,`roles`=%s
-
- # execute the SQL
- self.cursor.execute(sql, values*2)
- # values * 2 =
- # ['333', '222', '111', '制作方', '333', '222', '111', '制作方']
- self.db.commit()
- print(f'----- insert ok: {table_name} -----')
-
- # on duplicate key update
- # what it does when a new record is inserted:
- # 1. if a row with the same primary key / unique constraint value already exists, that row is updated
- # 2. otherwise a new row is inserted and nothing is updated
-
- return item
-
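
To see insert ... on duplicate key update in isolation, here is a small standalone sketch (assuming the copyrights table exists with pcid as its primary key, as above); the second execute reuses the same pcid, so the existing row is updated rather than duplicated:

- # standalone demo of the upsert used by the pipeline (assumes the copyrights table)
- import pymysql
-
- db = pymysql.connect(host='localhost', port=3306, user='root',
-                      password='root', database='xpc_2020', charset='utf8mb4')
- sql = ('insert into `copyrights`(`pcid`,`pid`,`cid`,`roles`) values(%s,%s,%s,%s) '
-        'on duplicate key update `pcid`=%s,`pid`=%s,`cid`=%s,`roles`=%s')
- with db.cursor() as cursor:
-     # first call: no row with this pcid yet, so a new row is inserted
-     cursor.execute(sql, ['333_111', '333', '111', 'director'] * 2)
-     # second call: same primary key, so the existing row is updated in place
-     cursor.execute(sql, ['333_111', '333', '111', 'editor'] * 2)
-     cursor.execute('select `roles` from `copyrights` where `pcid`=%s', ['333_111'])
-     print(cursor.fetchone())  # ('editor',)
- db.commit()
- db.close()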