
Crawler -- Scrapy (multi-item, multi-table storage in practice)



# settings.py
BOT_NAME = 'xpc'
SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'
USER_AGENT = 'xpc (+http://www.yourdomain.com)'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
ITEM_PIPELINES = {
    'xpc.pipelines.XpcPipeline': 300,
}
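Two of these settings do most of the work: ROBOTSTXT_OBEY = False stops Scrapy from consulting robots.txt, and DOWNLOAD_DELAY = 1 spaces requests to the site out by roughly one second. The 300 in ITEM_PIPELINES is a priority from 0 to 1000, with lower values running first when several pipelines are enabled. If you are rebuilding the project from scratch, the usual scaffolding is scrapy startproject xpc followed by scrapy genspider myxpc xinpianchang.com.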
# the spider file (spiders/myxpc.py)
import re

import scrapy

from xpc.items import PostItem, ComposersItem, CommentsItem, CopyrightsItem


class MyxpcSpider(scrapy.Spider):
    name = 'myxpc'
    allowed_domains = ['xinpianchang.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']

    # home page: the list of posts
    def parse(self, response, **kwargs):
        print('*' * 60)
        print(len(response.text))
        # video list
        li_list = response.xpath('//div[@class="channel-con"]/ul[@class="video-list"]/li')
        for li in li_list:
            # post id
            pid = li.xpath('./@data-articleid').get()
            # post title
            title = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/a/p/text()').get()
            # video thumbnail (lazy-loaded, hence the _src attribute)
            thumbnail = li.xpath('./a/img/@_src').get()
            # post category
            category = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').get()
            # publish time
            created_at = li.xpath('./a/div[@class="video-hover-con"]/p/text()').get()
            # post description
            description = li.xpath('./a/div[@class="video-hover-con"]/div/text()').get()
            # like count
            like_counts = li.xpath('./div[@class="video-con"]//span[@class="fw_300 c_b_9 icon-like"]/text()').get()
            # print(pid, title, thumbnail, category, created_at, description, like_counts)

            # fill the item
            post_item = PostItem()
            post_item['pid'] = pid
            post_item['title'] = title
            post_item['thumbnail'] = thumbnail
            post_item['category'] = category
            post_item['created_at'] = created_at
            post_item['description'] = description
            post_item['like_counts'] = like_counts

            # detail page of the post
            post_url = 'https://www.xinpianchang.com/a%s?from=ArticleList' % pid
            # request the detail page, carrying the half-filled item along
            request = scrapy.Request(url=post_url, callback=self.post_detail)
            request.meta['post_item'] = post_item
            yield request
    # post detail page
    def post_detail(self, response):
        post_item = response.meta.get('post_item')
        pid = post_item['pid']
        # play count
        play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
        post_item['play_counts'] = play_counts

        # video id, embedded in an inline script
        vid = re.findall('vid = "(.*?)"', response.text)[0]
        # print(vid)
        # video metadata API
        video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/%s?appKey=61a2f329348b3bf77' % vid
        # print(video_url)
        # request the video metadata
        request = scrapy.Request(url=video_url, callback=self.video_detail)
        request.meta['post_item'] = post_item
        yield request

        # the creators of this post
        li_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li')
        for li in li_list:
            cid = li.xpath('./a/@data-userid').get()  # creator id
            # item
            composer_item = ComposersItem()
            composer_item['cid'] = cid
            # visit the creator's profile page for the remaining fields
            href = li.xpath('./a/@href').get()
            href = response.urljoin(href)  # build an absolute URL on xinpianchang.com
            request2 = scrapy.Request(url=href, callback=self.composer_detail)
            request2.meta['composer_item'] = composer_item
            yield request2

            # copyright: the role each creator played on this post
            copyrights_item = CopyrightsItem()
            copyrights_item['pcid'] = f'{pid}_{cid}'
            copyrights_item['pid'] = pid
            copyrights_item['cid'] = cid
            roles = li.xpath('.//span[@class="roles fs_12 fw_300 c_b_9"]/text()').get()
            copyrights_item['roles'] = roles
            yield copyrights_item

        # comments
        comment_url = 'https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24' % pid
        request3 = scrapy.Request(url=comment_url, callback=self.comment_detail)
        yield request3
    # video metadata (returned as JSON by the media API)
    def video_detail(self, response):
        post_item = response.meta.get('post_item')
        # parse the JSON body
        content = response.json()
        # duration
        duration = content['data']['duration']
        # preview image
        preview = content['data']['cover']
        # video file URL
        video = content['data']['resource']['progressive'][0]['url']
        # video format
        video_format = content['data']['resource']['progressive'][0]['mime']
        # complete the item and hand it to the pipeline
        post_item['duration'] = duration
        post_item['preview'] = preview
        post_item['video'] = video
        post_item['video_format'] = video_format
        yield post_item
    # creator profile data
    def composer_detail(self, response):
        composer_item = response.meta['composer_item']
        # banner image, cut out of the inline style, e.g.
        # background-image:url(https://cs.xinpianchang.com/app/user/html/stars/css/skin46/bg-big.jpg)
        banner_style = response.xpath('//div[@class="banner-wrap"]/@style').get()
        banner = banner_style[banner_style.find("https"): -1]
        # print(banner)
        # avatar
        avatar = response.xpath('//span[@class="avator-wrap-s"]/img/@src').get()
        # verified (V badge) or not
        verified = response.xpath('//span[@class="avator-wrap-s"]/span[contains(@class, "author-v")]').get()
        verified = 'yes' if verified else 'no'
        # name
        name = response.xpath('//p[@class="creator-name fs_26 fw_600 c_b_26"]/text()').get()
        # self introduction
        intro = response.xpath('//p[@class="creator-desc fs_14 fw_300 c_b_3 line-hide-1"]/text()').get()
        # like count, thousands separators stripped
        like_counts = response.xpath('//span[@class="like-counts fw_600 v-center"]/text()').get()
        like_counts = like_counts.replace(',', "")
        # follower count
        fans_counts = response.xpath('//span[@class="fans-counts fw_600 v-center"]/text()').get()
        fans_counts = fans_counts.replace(',', "")
        # following count
        follow_counts = response.xpath('//span[@class="fw_600 v-center"]/text()').get()
        follow_counts = follow_counts.replace(',', "")
        # location
        location = response.xpath('//p[@class="creator-detail fs_14 fw_300 c_b_9"]/span[@class="v-center"][1]/text()').get()
        # career
        career = response.xpath('//p[@class="creator-detail fs_14 fw_300 c_b_9"]/span[@class="v-center"][2]/text()').get()

        composer_item['banner'] = banner
        composer_item['avatar'] = avatar
        composer_item['verified'] = verified
        composer_item['name'] = name
        composer_item['intro'] = intro
        composer_item['like_counts'] = like_counts
        composer_item['fans_counts'] = fans_counts
        composer_item['follow_counts'] = follow_counts
        composer_item['location'] = location
        composer_item['career'] = career
        yield composer_item
        # print(composer_item)
    # comment data (JSON API)
    def comment_detail(self, response):
        content = response.json()
        comment_list = content['data']['list']
        for comment in comment_list:
            item = CommentsItem()
            item['commentid'] = comment['id']
            item['pid'] = comment['resource_id']
            item['cid'] = comment['userid']
            item['avatar'] = comment['userInfo']['avatar']
            item['uname'] = comment['userInfo']['username']
            item['created_at'] = comment['addtime']
            item['content'] = comment['content']
            item['like_counts'] = comment['count_approve']
            # id of the comment being replied to, '0' for top-level comments
            if comment.get('referer'):
                item['reply'] = comment['referer']['id']
            else:
                item['reply'] = '0'
            yield item
            # print(item)
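Throughout the spider a half-filled item rides from one callback to the next via request.meta. On Scrapy 1.7 and later the same hand-off can be written with cb_kwargs, which delivers the value as a named callback argument instead of through a shared dict. A minimal sketch of the first hop, reusing the names from above:

# equivalent hand-off via cb_kwargs (Scrapy 1.7+)
request = scrapy.Request(url=post_url, callback=self.post_detail,
                         cb_kwargs={'post_item': post_item})
yield request

def post_detail(self, response, post_item):
    # post_item arrives as a keyword argument; no response.meta lookup needed
    ...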
# items.py
from scrapy import Item, Field

# post (short video)
class PostItem(Item):
    table_name = 'posts'
    pid = Field()           # primary key of the posts table
    title = Field()         # post title
    thumbnail = Field()     # video thumbnail
    category = Field()      # post category
    created_at = Field()    # publish time
    description = Field()   # post description
    like_counts = Field()   # like count
    duration = Field()      # video duration
    preview = Field()       # video preview image
    video = Field()         # video URL
    video_format = Field()  # video format
    play_counts = Field()   # play count

# creator
class ComposersItem(Item):
    table_name = 'composers'
    cid = Field()            # primary key of the composers table
    banner = Field()         # banner image on the profile page
    avatar = Field()         # avatar
    verified = Field()       # verified (V badge) or not
    name = Field()           # name
    intro = Field()          # self introduction
    like_counts = Field()    # like count
    fans_counts = Field()    # follower count
    follow_counts = Field()  # following count
    location = Field()       # location
    career = Field()         # career

# comment
class CommentsItem(Item):
    table_name = 'comments'
    commentid = Field()    # primary key of the comments table
    pid = Field()          # id of the post being commented on
    cid = Field()          # id of the commenter
    avatar = Field()       # commenter avatar
    uname = Field()        # commenter name
    created_at = Field()   # publish time
    content = Field()      # comment content
    like_counts = Field()  # like count
    reply = Field()        # id of the comment replied to, '0' if not a reply

# copyright
class CopyrightsItem(Item):
    table_name = 'copyrights'
    pcid = Field()   # primary key, formed as pid_cid
    pid = Field()    # references the posts table
    cid = Field()    # references the composers table
    roles = Field()  # role(s) taken on the post
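The pipeline below turns every item into an upsert (insert ... on duplicate key update), so each of the four tables needs a primary key for the duplicate branch to ever fire. The original article does not show the schema; what follows is a minimal, assumed DDL sketch for the copyrights table (the column types are guesses), run through the same pymysql connection settings the pipeline uses:

# assumed schema sketch for one table; run once before crawling
import pymysql

ddl = """
create table if not exists `copyrights` (
    `pcid`  varchar(100) not null,  -- primary key, formed as pid_cid
    `pid`   varchar(50),
    `cid`   varchar(50),
    `roles` varchar(100),
    primary key (`pcid`)
) default charset=utf8mb4
"""

db = pymysql.connect(host='localhost', port=3306, user='root',
                     password='root', database='xpc_2020', charset='utf8mb4')
with db.cursor() as cursor:
    cursor.execute(ddl)
db.commit()
db.close()

The posts, composers and comments tables follow the same pattern, with pid, cid and commentid as their primary keys.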
# pipelines.py (the core of the multi-table storage)
import pymysql

class XpcPipeline:
    # when the spider opens: connect to MySQL
    def open_spider(self, spider):
        self.db = pymysql.connect(
            host='localhost', port=3306,
            user='root', password='root',
            database='xpc_2020', charset='utf8mb4'
        )
        self.cursor = self.db.cursor()

    # when the spider closes: close the MySQL connection
    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()

    # handle every item, whatever its type
    def process_item(self, item, spider):
        table_name = item.table_name
        keys = list(item.keys())      # e.g. ['pcid', 'pid', 'cid', 'roles']
        values = list(item.values())  # e.g. ['333', '222', '111', 'producer']
        # the column list as a string
        key_str = ','.join(['`%s`' % k for k in keys])
        # print(key_str)    # "`pcid`,`pid`,`cid`,`roles`"
        # the placeholder list as a string
        values_str = ','.join(["%s"] * len(values))
        # print(values_str)  # "%s,%s,%s,%s"
        # the update clause as a string
        update_str = ','.join(["`{}`=%s".format(k) for k in keys])
        # print(update_str)  # "`pcid`=%s,`pid`=%s,`cid`=%s,`roles`=%s"
        # assemble the SQL
        sql = 'insert into `{}`({}) values({}) on duplicate key update {}'.format(
            table_name,
            key_str,
            values_str,
            update_str
        )
        # print(sql)
        # insert into `copyrights`(`pcid`,`pid`,`cid`,`roles`)
        # values(%s,%s,%s,%s)
        # on duplicate key update `pcid`=%s,`pid`=%s,`cid`=%s,`roles`=%s
        # execute the SQL; the parameters are doubled because every column
        # placeholder appears twice, once in values(...) and once in the update clause
        self.cursor.execute(sql, values * 2)
        # values * 2 =
        # ['333', '222', '111', 'producer', '333', '222', '111', 'producer']
        self.db.commit()
        print(f'----- inserted into {table_name} -----')
        # on duplicate key update
        # behavior when inserting a new row:
        # 1. if the primary key or a unique constraint already matches an
        #    existing row, the update clause runs instead
        # 2. otherwise the new row is inserted and nothing is updated
        return item
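Because every item class carries its own table_name, this single process_item serves all four item types; supporting a fifth table is just a new item class plus its table, with no pipeline changes. With the database and tables in place, the crawl is started from the project root with scrapy crawl myxpc.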