Creating the crawler environment
scrapy startproject xpc
cd xpc
scrapy genspider myxpc xinpianchang.com
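The genspider command drops a spider skeleton into xpc/spiders/myxpc.py. Roughly what the generated file looks like before editing (the exact template varies a little across Scrapy versions); the sections below replace this stub with the real crawling logic:

import scrapy

class MyxpcSpider(scrapy.Spider):
    name = 'myxpc'
    allowed_domains = ['xinpianchang.com']
    start_urls = ['http://xinpianchang.com/']

    def parse(self, response):
        pass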
Creating a debug entry point
Create a start.py file in the project root:
from scrapy.cmdline import execute
execute('scrapy crawl myxpc --nolog'.split())  # run the spider without Scrapy's log output
# execute('scrapy crawl myxpc'.split())  # alternative: keep logging enabled for debugging
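If you prefer not to go through scrapy.cmdline, Scrapy's CrawlerProcess API does the same job. A minimal sketch (the file name start_alt.py is just an example):

# start_alt.py -- alternative runner using Scrapy's CrawlerProcess API
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl('myxpc')  # spider name, as set in MyxpcSpider.name
process.start()         # blocks until the crawl finishes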
Creating the database
CREATE DATABASE IF NOT EXISTS `xpc_2020` CHARSET='utf8mb4';
USE `xpc_2020`;
CREATE TABLE IF NOT EXISTS `posts` (
`pid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'primary key of the posts table',
`title` VARCHAR(256) NOT NULL COMMENT 'post title',
`thumbnail` VARCHAR(512) COMMENT 'video thumbnail',
`preview` VARCHAR(512) COMMENT 'video preview image',
`video` VARCHAR(512) COMMENT 'video URL',
`video_format` VARCHAR(32) COMMENT 'video format, e.g. 4K',
`category` VARCHAR(512) NOT NULL DEFAULT '' COMMENT 'post category',
`duration` INT(11) NOT NULL DEFAULT 0 COMMENT 'play duration',
`created_at` VARCHAR(128) NOT NULL DEFAULT '' COMMENT 'publish time',
`description` TEXT COMMENT 'post description',
`play_counts` INT(8) NOT NULL DEFAULT 0 COMMENT 'play count',
`like_counts` INT(8) NOT NULL DEFAULT 0 COMMENT 'like count',
PRIMARY KEY (`pid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'posts table';
CREATE TABLE IF NOT EXISTS `composers` (
`cid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'primary key of the composers table',
`banner` VARCHAR(512) NOT NULL COMMENT 'banner image on the user home page',
`avatar` VARCHAR(512) NOT NULL DEFAULT '' COMMENT 'user avatar',
`verified` VARCHAR(128) COMMENT 'verified (V) status',
`name` VARCHAR(128) NOT NULL COMMENT 'name',
`intro` TEXT COMMENT 'self introduction',
`like_counts` INT(8) NOT NULL DEFAULT 0 COMMENT 'like count',
`fans_counts` INT(8) NOT NULL DEFAULT 0 COMMENT 'follower count',
`follow_counts` INT(8) NOT NULL DEFAULT 0 COMMENT 'following count',
`location` VARCHAR(512) NOT NULL DEFAULT '' COMMENT 'location',
`career` VARCHAR(512) NOT NULL DEFAULT '' COMMENT 'occupation',
PRIMARY KEY (`cid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'composers (users) table';
CREATE TABLE IF NOT EXISTS `comments` (
`commentid` INT(11) NOT NULL COMMENT 'primary key of the comments table',
`pid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'ID of the post being commented on',
`cid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'commenter ID',
`avatar` VARCHAR(512) COMMENT 'commenter avatar',
`uname` VARCHAR(512) COMMENT 'commenter name',
`created_at` VARCHAR(128) NOT NULL DEFAULT '' COMMENT 'publish time',
`content` TEXT COMMENT 'comment content',
`like_counts` INT(8) NOT NULL DEFAULT 0 COMMENT 'like count',
`reply` INT(8) NOT NULL DEFAULT 0 COMMENT 'ID of the comment this one replies to; 0 if not a reply',
PRIMARY KEY (`commentid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'comments table';
CREATE TABLE IF NOT EXISTS `copyrights` (
`pcid` VARCHAR(32) NOT NULL COMMENT 'primary key, composed as pid_cid',
`pid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'primary key of the corresponding post',
`cid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'primary key of the corresponding composer',
`roles` VARCHAR(32) COMMENT 'role(s) on the work',
PRIMARY KEY (`pcid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'copyright relation table';
CREATE TABLE IF NOT EXISTS `codes` (
`code_id` BIGINT UNSIGNED AUTO_INCREMENT COMMENT 'primary key',
`phone` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'phone number',
`code` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'verification code',
`created_at` DATETIME NOT NULL COMMENT 'send time',
`ip` VARCHAR(32) NOT NULL DEFAULT '' COMMENT 'IP that requested the verification code',
PRIMARY KEY (`code_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT 'SMS verification code table';
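Before running the spider it is worth checking that this schema is reachable from Python. A minimal connectivity sketch with pymysql (host and password are placeholders, not values from this project):

# check_db.py -- hypothetical one-off connectivity check
import pymysql

db = pymysql.connect(host='your-mysql-host', port=3306,
                     user='root', password='your-password',
                     database='xpc_2020', charset='utf8mb4')
with db.cursor() as cursor:
    cursor.execute('SHOW TABLES')
    print(cursor.fetchall())  # expect: codes, comments, composers, copyrights, posts
db.close()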
Writing the spider
# in myxpc.py
import re
import scrapy
from xpc.items import PostItem
class MyxpcSpider(scrapy.Spider):
    name = 'myxpc'
    allowed_domains = ['xinpianchang.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']

    # list page
    def parse(self, response, **kwargs):
        print('*' * 60)
        print(len(response.text))
        # video list
        li_list = response.xpath('//div[@class="channel-con"]/ul[@class="video-list"]/li')
        for li in li_list:
            # post id
            pid = li.xpath('./@data-articleid').get()
            # post title
            title = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/a/p/text()').get()
            # video thumbnail
            thumbnail = li.xpath('./a/img/@_src').get()
            # post category
            category = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').get()
            # publish time
            created_at = li.xpath('./a/div[@class="video-hover-con"]/p/text()').get()
            # post description
            description = li.xpath('./a/div[@class="video-hover-con"]/div/text()').get()
            # like count
            like_counts = li.xpath('./div[@class="video-con"]//span[@class="fw_300 c_b_9 icon-like"]/text()').get()
            # print(pid, title, thumbnail, category, created_at, description, like_counts)
            # fill the item
            post_item = PostItem()
            post_item['pid'] = pid
            post_item['title'] = title
            post_item['thumbnail'] = thumbnail
            post_item['category'] = category
            post_item['created_at'] = created_at
            post_item['description'] = description
            post_item['like_counts'] = like_counts
            # post detail page URL
            post_url = 'https://www.xinpianchang.com/a%s?from=ArticleList' % pid
            # request the detail page, passing the partial item along via meta
            request = scrapy.Request(url=post_url, callback=self.post_detail)
            request.meta['post_item'] = post_item
            yield request
    # post detail page
    def post_detail(self, response):
        post_item = response.meta.get('post_item')
        # play count
        play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
        post_item['play_counts'] = play_counts
        # extract the vid embedded in the page's JavaScript
        vid = re.findall('vid = "(.*?)"', response.text)[0]
        # print(vid)
        # video metadata API URL
        video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/%s?appKey=61a2f329348b3bf77' % vid
        # print(video_url)
        # request the video metadata
        yield scrapy.Request(
            url=video_url, callback=self.video_detail, meta={'post_item': post_item}
        )
    # video metadata (JSON)
    def video_detail(self, response):
        post_item = response.meta.get('post_item')
        # parse the JSON response
        content = response.json()
        # play duration
        duration = content['data']['duration']
        # video preview image
        preview = content['data']['cover']
        # video URL
        video = content['data']['resource']['progressive'][0]['url']
        # video format
        video_format = content['data']['resource']['progressive'][0]['mime']
        # download the video file (optional)
        # request = scrapy.Request(url=video, callback=self.mp4_detail)
        # request.meta['pid'] = post_item['pid']
        # yield request
        # complete the item
        post_item['duration'] = duration
        post_item['preview'] = preview
        post_item['video'] = video
        post_item['video_format'] = video_format
        yield post_item
    # mp4 download (optional; requires `import os` at the top of the file)
    # def mp4_detail(self, response):
    #     pid = response.meta.get('pid')
    #     print("mp4_detail: ", response.body)
    #
    #     # save the video to disk
    #     filename = os.path.join(r'C:\Users\ijeff\Desktop\扣丁八期\爬虫_Day12_xpc爬虫项目上\code\xpc\xpc_video', pid + '.mp4')
    #     with open(filename, 'wb') as fp:
    #         fp.write(response.body)
    #         fp.flush()
    #
    #     print(f'saved video {pid}.mp4')
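None of the items reach MySQL unless the pipeline is enabled in settings.py, and the site will usually want a browser-like User-Agent. A sketch of the settings this project assumes (the UA string is an example; adjust values to taste):

# settings.py (excerpt) -- assumed configuration for this project
BOT_NAME = 'xpc'

ROBOTSTXT_OBEY = False  # skip robots.txt; check the site's policy before doing this

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',  # example UA
}

ITEM_PIPELINES = {
    'xpc.pipelines.XpcPipeline': 300,  # enable the MySQL pipeline
}

DOWNLOAD_DELAY = 1  # optional politeness delay between requests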
# in items.py
from scrapy import Item, Field

class PostItem(Item):
    pid = Field()           # primary key of the posts table
    title = Field()         # post title
    thumbnail = Field()     # video thumbnail
    category = Field()      # post category
    created_at = Field()    # publish time
    description = Field()   # post description
    like_counts = Field()   # like count
    duration = Field()      # play duration
    preview = Field()       # video preview image
    video = Field()         # video URL
    video_format = Field()  # video format
    play_counts = Field()   # play count
# in pipelines.py
import time

import pymysql

class XpcPipeline:
    # when the spider opens: connect to MySQL
    def open_spider(self, spider):
        self.db = pymysql.connect(
            host='your-mysql-host', port=3306,      # placeholders: fill in your own host/password
            user='root', password='your-password',
            database='xpc_2020', charset='utf8mb4'  # match the utf8mb4 charset of the tables
        )
        self.cursor = self.db.cursor()

    # when the spider closes: close the MySQL connection
    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()

    # store each post item in the database
    def process_item(self, item, spider):
        print(item)
        # note: interpolating values directly into SQL breaks on embedded quotes;
        # see the parameterized variant after this listing
        sql = f'insert into posts values (' \
              f'"{item["pid"]}", ' \
              f'"{item["title"]}", ' \
              f'"{item["thumbnail"]}", ' \
              f'"{item["preview"]}", ' \
              f'"{item["video"]}", ' \
              f'"{item["video_format"]}", ' \
              f'"{item["category"]}", ' \
              f'"{item["duration"]}", ' \
              f'"{item["created_at"]}", ' \
              f'"{item["description"]}", ' \
              f'"{item["play_counts"]}", ' \
              f'"{item["like_counts"]}")'
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except Exception as e:
            print(f'----- insert failed: {item["title"]}, {e} -----')
        else:
            print('----- insert success! -----')
        time.sleep(1)  # crude write throttle
        return item
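The f-string insert above breaks as soon as a title or description contains a double quote, and it is open to SQL injection. A safer sketch of the same write using pymysql's parameterized queries (a drop-in replacement for process_item, same 12-column order as the posts table):

    def process_item(self, item, spider):
        sql = ('INSERT INTO posts VALUES '
               '(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
        values = (item['pid'], item['title'], item['thumbnail'],
                  item['preview'], item['video'], item['video_format'],
                  item['category'], item['duration'], item['created_at'],
                  item['description'], item['play_counts'], item['like_counts'])
        try:
            self.cursor.execute(sql, values)  # the driver quotes/escapes each value
            self.db.commit()
        except Exception as e:
            self.db.rollback()
            print(f'----- insert failed: {item["title"]}, {e} -----')
        return item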
Summary
Crawling is mostly about finding the site's patterns; once you can find a stable pattern, the code comes down to just a few lines.