2025年3月30日 星期日 甲辰(龙)年 月廿九 设为首页 加入收藏
rss
您当前的位置:首页 > 计算机 > 编程开发 > Python

爬虫--Scrapy实战爬取视频网站资源

时间:08-16来源:作者:点击数:31
CDSY,CDSY.XYZ

创建爬虫环境

  • scrapy startproject xpc
  • cd xpc
  • scrapy genspider xinpianchang.com

创建调试文件

  • 主项目下新建start.py文件
  • from scrapy.cmdline import execute
  • execute('scrapy crawl myxpc --nolog'.split())
  • #execute('scrapy crawl myxpc'.split())

创建数据库

  • CREATE DATABASE IF NOT EXISTS `xpc_2020` charset='utf8';
  • USE `xpc_2020`;
  • CREATE TABLE IF NOT EXISTS `posts` (
  • `pid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '作品表主键',
  • `title` VARCHAR(256) NOT NULL COMMENT '作品标题',
  • `thumbnail` VARCHAR(512) COMMENT '视频缩略图',
  • `preview` VARCHAR(512) COMMENT '视频预览图',
  • `video` VARCHAR(512) COMMENT '视频链接',
  • `video_format` VARCHAR(32) COMMENT '视频格式:4K 等',
  • `category` VARCHAR(512) NOT NULL DEFAULT '' COMMENT '作品分类',
  • `duration` INT(11) NOT NULL DEFAULT 0 COMMENT '播放时长',
  • `created_at` VARCHAR(128) NOT NULL DEFAULT '' COMMENT '发表时间',
  • `description` text COMMENT '作品描述',
  • `play_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '播放次数',
  • `like_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '被点赞次数',
  • PRIMARY KEY (`pid`)
  • ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '作品表';
  • CREATE TABLE IF NOT EXISTS `composers` (
  • `cid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '创作者表主键',
  • `banner` VARCHAR(512) NOT NULL COMMENT '用户主页banner图片',
  • `avatar` VARCHAR(512) NOT NULL DEFAULT '' COMMENT '用户头像',
  • `verified` VARCHAR(128) COMMENT '是否加V',
  • `name` VARCHAR(128) NOT NULL COMMENT '名字',
  • `intro` TEXT COMMENT '自我介绍',
  • `like_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '被点赞次数',
  • `fans_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '被关注数量',
  • `follow_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '关注数量',
  • `location` VARCHAR(512) NOT NULL DEFAULT '' COMMENT '所在位置',
  • `career` VARCHAR(512) NOT NULL DEFAULT '' COMMENT '职业',
  • PRIMARY KEY (`cid`)
  • ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '用户表';
  • CREATE TABLE IF NOT EXISTS `comments` (
  • `commentid` int(11) NOT NULL COMMENT '评论表主键',
  • `pid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '评论的作品ID',
  • `cid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '评论人ID',
  • `avatar` VARCHAR(512) COMMENT '评论人头像',
  • `uname` VARCHAR(512) COMMENT '评论人名称',
  • `created_at` VARCHAR(128) NOT NULL DEFAULT '' COMMENT '发表时间',
  • `content` TEXT COMMENT '评论内容',
  • `like_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '被点赞次数',
  • `reply` INT(8) NOT NULL DEFAULT 0 COMMENT '回复其他评论的ID,如果不是则为0',
  • PRIMARY KEY (`commentid`)
  • ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '评论表';
  • CREATE TABLE IF NOT EXISTS `copyrights` (
  • `pcid` VARCHAR(32) NOT NULL COMMENT '主键,由pid_cid组成',
  • `pid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '对应作品表主键',
  • `cid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '对应作者表主键',
  • `roles` VARCHAR(32) COMMENT '担任角色',
  • PRIMARY KEY (`pcid`)
  • ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '著作权关系表';
  • CREATE TABLE IF NOT EXISTS `codes` (
  • `code_id` BIGINT UNSIGNED AUTO_INCREMENT COMMENT '主键',
  • `phone` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '手机号',
  • `code` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '验证码',
  • `created_at` datetime NOT NULL COMMENT '发送时间',
  • `ip` VARCHAR(32) NOT NULL DEFAULT '' COMMENT '请求发送验证码的IP',
  • PRIMARY KEY (`code_id`)
  • ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '手机验证码表';

开始爬虫

  • #在myxpc.py中
  • import re
  • import scrapy
  • from xpc.items import PostItem
  • class MyxpcSpider(scrapy.Spider):
  • name = 'myxpc'
  • allowed_domains = ['xinpianchang.com']
  • start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']
  • # 主页数据
  • def parse(self, response, **kwargs):
  • print('*' * 60)
  • print(len(response.text))
  • # 视频列表
  • li_list = response.xpath('//div[@class="channel-con"]/ul[@class="video-list"]/li')
  • for li in li_list:
  • # 作品id
  • pid = li.xpath('./@data-articleid').get()
  • # 作品标题
  • title = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/a/p/text()').get()
  • # 视频缩略图
  • thumbnail = li.xpath('./a/img/@_src').get()
  • # 作品类型
  • category = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').get()
  • # 发表时间
  • created_at = li.xpath('./a/div[@class="video-hover-con"]/p/text()').get()
  • # 作品描述
  • description = li.xpath('./a/div[@class="video-hover-con"]/div/text()').get()
  • # 被点赞次数
  • like_counts = li.xpath('./div[@class="video-con"]//span[@class="fw_300 c_b_9 icon-like"]/text()').get()
  • # print(pid, title, thumbnail, category, created_at, description, like_counts)
  • # item
  • post_item = PostItem()
  • post_item['pid'] = pid
  • post_item['title'] = title
  • post_item['thumbnail'] = thumbnail
  • post_item['category'] = category
  • post_item['created_at'] = created_at
  • post_item['description'] = description
  • post_item['like_counts'] = like_counts
  • # 视频详情页
  • post_url = 'https://www.xinpianchang.com/a%s?from=ArticleList' % pid
  • # 请求详情页数据
  • request = scrapy.Request(url=post_url, callback=self.post_detail)
  • request.meta['post_item'] = post_item
  • yield request
  • # 视频详情页
  • def post_detail(self, response):
  • post_item = response.meta.get('post_item')
  • # 播放次数
  • play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
  • post_item['play_counts'] = play_counts
  • # 视频
  • # vid
  • vid = re.findall('vid = "(.*?)"', response.text)[0]
  • # print(vid)
  • # 视频url
  • video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/%s?appKey=61a2f329348b3bf77' % vid
  • # print(video_url)
  • # 请求视频数据
  • yield scrapy.Request(
  • url=video_url,callback=self.vido_detail,meta={'post_item':post_item}
  • )
  • # 视频数据
  • def vido_detail(self, response):
  • post_item = response.meta.get('post_item')
  • # json解析
  • content = response.json()
  • # 播放时长
  • duration = content['data']['duration']
  • # 视频预览图
  • preview = content['data']['cover']
  • # 视频链接
  • video = content['data']['resource']['progressive'][0]['url']
  • # 视频格式
  • video_format = content['data']['resource']['progressive'][0]['mime']
  • # 爬视频
  • # request = scrapy.Request(url=video, callback=self.mp4_detail)
  • # request.meta['pid'] = post_item['pid']
  • # yield request
  • # item
  • post_item['duration'] = duration
  • post_item['preview'] = preview
  • post_item['video'] = video
  • post_item['video_format'] = video_format
  • yield post_item
  • # 视频mp4下载
  • # def mp4_detail(self, response):
  • # pid = response.meta.get('pid')
  • # print("mp4_detail: ", response.body)
  • #
  • # # 存视频
  • # filename = os.path.join(r'C:\Users\ijeff\Desktop\扣丁八期\爬虫_Day12_xpc爬虫项目上\code\xpc\xpc_video', pid+'.mp4')
  • # with open(filename, 'wb') as fp:
  • # fp.write(response.body)
  • # fp.flush()
  • #
  • # print(f'存入视频{pid+".mp4"}成功')
  • #items.py中
  • from scrapy import Item, Field
  • class PostItem(Item):
  • pid = Field() # 作品表主键
  • title = Field() # 作品标题
  • thumbnail = Field() # 视频缩略图
  • category = Field() # 作品分类
  • created_at = Field() # 发表时间
  • description = Field() # 作品描述
  • like_counts = Field() # 被点赞次数
  • duration = Field() # 播放时长
  • preview = Field() # 视频预览图
  • video = Field() # 视频链接
  • video_format = Field() # 视频格式
  • play_counts = Field() # 播放次数
  • #pipelines.py中
  • import time
  • import pymysql
  • from itemadapter import ItemAdapter
  • class XpcPipeline:
  • # 开启爬虫时: 连接MySQL
  • def open_spider(self, spider):
  • self.db = pymysql.connect(
  • host='IP地址', port=3306,
  • user='root', password='数据库密码',
  • database='xpc_2020', charset='utf8'
  • )
  • self.cursor = self.db.cursor()
  • # 关闭爬虫时: 关闭连接MySQL
  • def close_spider(self, spider):
  • self.cursor.close()
  • self.db.close()
  • # 处理数据
  • def process_item(self, item, spider):
  • # 将视频数据(post_item)存入到数据库
  • print(item)
  • sql = f'insert into posts values (' \
  • f'"{item["pid"]}", ' \
  • f'"{item["title"]}", ' \
  • f'"{item["thumbnail"]}", ' \
  • f'"{item["preview"]}", ' \
  • f'"{item["video"]}", ' \
  • f'"{item["video_format"]}", ' \
  • f'"{item["category"]}", ' \
  • f'"{item["duration"]}", ' \
  • f'"{item["created_at"]}", ' \
  • f'"{item["description"]}", ' \
  • f'"{item["play_counts"]}", ' \
  • f'"{item["like_counts"]}")'
  • try:
  • self.cursor.execute(sql)
  • self.db.commit()
  • except Exception as e:
  • print(f'----- 插入失败: {item["title"]}, {e} -----')
  • else:
  • print('----- 插入成功: insert success! -----')
  • time.sleep(1)
  • return item

总结

爬虫重在找网站的规律,只要能找到固定规律,代码就那么几行。

CDSY,CDSY.XYZ
方便获取更多学习、工作、生活信息请关注本站微信公众号城东书院 微信服务号城东书院 微信订阅号
推荐内容
相关内容
栏目更新
栏目热门
本栏推荐