创建爬虫环境
- scrapy startproject xpc
- cd xpc
- scrapy genspider myxpc xinpianchang.com
-
创建调试文件
- 主项目下新建start.py文件
# start.py — lets you launch the spider from an IDE/debugger instead of the shell.
from scrapy.cmdline import execute

# Equivalent to running: scrapy crawl myxpc --nolog   (--nolog hides Scrapy's log output)
execute('scrapy crawl myxpc --nolog'.split())
# execute('scrapy crawl myxpc'.split())
-
创建数据库
-- Create the crawl database.
-- Use utf8mb4, not the legacy 'utf8' (a 3-byte utf8mb3 alias): every table
-- below declares CHARSET=utf8mb4, and scraped titles/comments can contain
-- 4-byte characters (emoji) that plain 'utf8' cannot store.
CREATE DATABASE IF NOT EXISTS `xpc_2020` charset='utf8mb4';
USE `xpc_2020`;
-
-- Works ("posts") table: one row per video work. Filled from three sources:
-- the channel listing page, the work detail page (play_counts), and the
-- media API (video / duration / format / preview).
CREATE TABLE IF NOT EXISTS `posts` (
`pid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '作品表主键',
`title` VARCHAR(256) NOT NULL COMMENT '作品标题',
`thumbnail` VARCHAR(512) COMMENT '视频缩略图',
`preview` VARCHAR(512) COMMENT '视频预览图',
`video` VARCHAR(512) COMMENT '视频链接',
`video_format` VARCHAR(32) COMMENT '视频格式:4K 等',
`category` VARCHAR(512) NOT NULL DEFAULT '' COMMENT '作品分类',
`duration` INT(11) NOT NULL DEFAULT 0 COMMENT '播放时长',
`created_at` VARCHAR(128) NOT NULL DEFAULT '' COMMENT '发表时间',
`description` text COMMENT '作品描述',
`play_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '播放次数',
`like_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '被点赞次数',
PRIMARY KEY (`pid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '作品表';
-
-- Creators ("composers") table: one row per user/creator profile.
-- `cid` is the site's creator id (not auto-generated).
CREATE TABLE IF NOT EXISTS `composers` (
`cid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '创作者表主键',
`banner` VARCHAR(512) NOT NULL COMMENT '用户主页banner图片',
`avatar` VARCHAR(512) NOT NULL DEFAULT '' COMMENT '用户头像',
`verified` VARCHAR(128) COMMENT '是否加V',
`name` VARCHAR(128) NOT NULL COMMENT '名字',
`intro` TEXT COMMENT '自我介绍',
`like_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '被点赞次数',
`fans_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '被关注数量',
`follow_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '关注数量',
`location` VARCHAR(512) NOT NULL DEFAULT '' COMMENT '所在位置',
`career` VARCHAR(512) NOT NULL DEFAULT '' COMMENT '职业',
PRIMARY KEY (`cid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '用户表';
-
-
-- Comments table: `pid` links to posts, `cid` to composers.
-- `reply` holds the id of the comment being replied to (0 = top-level comment).
CREATE TABLE IF NOT EXISTS `comments` (
`commentid` int(11) NOT NULL COMMENT '评论表主键',
`pid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '评论的作品ID',
`cid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '评论人ID',
`avatar` VARCHAR(512) COMMENT '评论人头像',
`uname` VARCHAR(512) COMMENT '评论人名称',
`created_at` VARCHAR(128) NOT NULL DEFAULT '' COMMENT '发表时间',
`content` TEXT COMMENT '评论内容',
`like_counts` INT(8) NOT NULL DEFAULT 0 COMMENT '被点赞次数',
`reply` INT(8) NOT NULL DEFAULT 0 COMMENT '回复其他评论的ID,如果不是则为0',
PRIMARY KEY (`commentid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '评论表';
-
-
-- Join table linking works to creators (many-to-many), with the role the
-- creator played. Synthetic key `pcid` is the string "<pid>_<cid>".
CREATE TABLE IF NOT EXISTS `copyrights` (
`pcid` VARCHAR(32) NOT NULL COMMENT '主键,由pid_cid组成',
`pid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '对应作品表主键',
`cid` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '对应作者表主键',
`roles` VARCHAR(32) COMMENT '担任角色',
PRIMARY KEY (`pcid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '著作权关系表';
-
-
-- SMS verification codes: one row per code sent, with the requesting IP
-- (presumably for the login flow — not used by the spider shown here).
CREATE TABLE IF NOT EXISTS `codes` (
`code_id` BIGINT UNSIGNED AUTO_INCREMENT COMMENT '主键',
`phone` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '手机号',
`code` BIGINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '验证码',
`created_at` datetime NOT NULL COMMENT '发送时间',
`ip` VARCHAR(32) NOT NULL DEFAULT '' COMMENT '请求发送验证码的IP',
PRIMARY KEY (`code_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '手机验证码表';
-
开始爬虫
- #在myxpc.py中
- import re
- import scrapy
- from xpc.items import PostItem
-
class MyxpcSpider(scrapy.Spider):
    """Crawl xinpianchang.com's "sorted by likes" channel.

    Flow: parse (listing page) -> post_detail (work detail page)
    -> video_detail (media API JSON) -> yield a completed PostItem.
    """

    name = 'myxpc'
    allowed_domains = ['xinpianchang.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']

    def parse(self, response, **kwargs):
        """Parse the channel listing page: one <li> per work."""
        print('*' * 60)
        print(len(response.text))

        # One <li> per work in the channel's video list.
        li_list = response.xpath('//div[@class="channel-con"]/ul[@class="video-list"]/li')

        for li in li_list:
            # Work id — also used to build the detail-page URL below.
            pid = li.xpath('./@data-articleid').get()
            # Work title.
            title = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/a/p/text()').get()
            # Thumbnail: the site lazy-loads images, hence the non-standard `_src` attribute.
            thumbnail = li.xpath('./a/img/@_src').get()
            # Category label.
            category = li.xpath('./div[@class="video-con"]/div[@class="video-con-top"]/div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').get()
            # Publication time (as shown in the hover panel).
            created_at = li.xpath('./a/div[@class="video-hover-con"]/p/text()').get()
            # Work description (hover panel).
            description = li.xpath('./a/div[@class="video-hover-con"]/div/text()').get()
            # Like count.
            like_counts = li.xpath('./div[@class="video-con"]//span[@class="fw_300 c_b_9 icon-like"]/text()').get()

            # Partially-filled item; completed by the two follow-up callbacks.
            post_item = PostItem()
            post_item['pid'] = pid
            post_item['title'] = title
            post_item['thumbnail'] = thumbnail
            post_item['category'] = category
            post_item['created_at'] = created_at
            post_item['description'] = description
            post_item['like_counts'] = like_counts

            # Follow to the work's detail page, carrying the item in meta.
            post_url = 'https://www.xinpianchang.com/a%s?from=ArticleList' % pid
            request = scrapy.Request(url=post_url, callback=self.post_detail)
            request.meta['post_item'] = post_item
            yield request

    def post_detail(self, response):
        """Parse the work detail page: play count + the media id for the video API."""
        post_item = response.meta.get('post_item')

        # Play count is exposed as a data attribute on the counter element.
        play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
        post_item['play_counts'] = play_counts

        # The media id ("vid") is embedded in an inline <script>. Guard against
        # pages without it (the original `re.findall(...)[0]` raised IndexError).
        match = re.search(r'vid = "(.*?)"', response.text)
        if match is None:
            return
        vid = match.group(1)

        # Media API endpoint returning the playable variants as JSON.
        video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/%s?appKey=61a2f329348b3bf77' % vid

        yield scrapy.Request(
            url=video_url, callback=self.video_detail, meta={'post_item': post_item}
        )

    def video_detail(self, response):
        """Parse the media API JSON and yield the completed item.

        (Renamed from the original typo `vido_detail`; internal callback only.)
        """
        post_item = response.meta.get('post_item')

        content = response.json()

        # Playback duration.
        duration = content['data']['duration']
        # Preview/cover image.
        preview = content['data']['cover']
        # First progressive-download variant: URL and MIME type.
        video = content['data']['resource']['progressive'][0]['url']
        video_format = content['data']['resource']['progressive'][0]['mime']

        # Optional: download the mp4 itself (see mp4_detail below).
        # request = scrapy.Request(url=video, callback=self.mp4_detail)
        # request.meta['pid'] = post_item['pid']
        # yield request

        post_item['duration'] = duration
        post_item['preview'] = preview
        post_item['video'] = video
        post_item['video_format'] = video_format
        yield post_item

    # Optional mp4 download callback (disabled).
    # NOTE(review): re-enabling requires `import os` and a valid local directory.
    # def mp4_detail(self, response):
    #     pid = response.meta.get('pid')
    #     # Save the raw video bytes under <pid>.mp4.
    #     filename = os.path.join(r'C:\Users\ijeff\Desktop\扣丁八期\爬虫_Day12_xpc爬虫项目上\code\xpc\xpc_video', pid + '.mp4')
    #     with open(filename, 'wb') as fp:
    #         fp.write(response.body)
    #         fp.flush()
    #     print(f'存入视频{pid + ".mp4"}成功')
-
-
-
-
-
-
- #items.py中
- from scrapy import Item, Field
-
class PostItem(Item):
    """One scraped work ("post"), matching the columns of the `posts` table."""

    # --- filled from the channel listing page ---
    pid = Field()           # work id / posts-table primary key
    title = Field()         # work title
    thumbnail = Field()     # thumbnail image URL
    category = Field()      # category label
    created_at = Field()    # publication time
    description = Field()   # work description
    like_counts = Field()   # like count

    # --- filled from the work detail page ---
    play_counts = Field()   # play count

    # --- filled from the media API ---
    duration = Field()      # playback duration
    preview = Field()       # preview/cover image URL
    video = Field()         # video download URL
    video_format = Field()  # video MIME/format
-
-
-
-
-
- #pipelines.py中
- import time
- import pymysql
- from itemadapter import ItemAdapter
-
class XpcPipeline:
    """Persist scraped PostItems into the MySQL `posts` table.

    One pymysql connection is held for the lifetime of the spider; each item
    becomes one row, inserted with a parameterized query.
    """

    # Column order — must match the `posts` table definition.
    POST_FIELDS = ('pid', 'title', 'thumbnail', 'preview', 'video',
                   'video_format', 'category', 'duration', 'created_at',
                   'description', 'play_counts', 'like_counts')

    def open_spider(self, spider):
        """Open the MySQL connection when the spider starts."""
        # utf8mb4 so 4-byte characters round-trip; the tables declare
        # CHARSET=utf8mb4 (plain 'utf8' is the 3-byte utf8mb3 alias).
        self.db = pymysql.connect(
            host='IP地址', port=3306,
            user='root', password='数据库密码',
            database='xpc_2020', charset='utf8mb4'
        )
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        """Close the cursor and connection when the spider stops."""
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        """Insert one item into `posts`; always returns the item unchanged.

        SECURITY: the original built the statement with an f-string, so any
        quote in scraped text broke the SQL (and allowed injection). Values
        are now passed as parameters and escaped by the driver.
        """
        print(item)
        sql = ('insert into posts values '
               '(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
        params = tuple(item[field] for field in self.POST_FIELDS)

        try:
            self.cursor.execute(sql, params)
            self.db.commit()
        except Exception as e:
            # Roll back the failed statement so the connection stays usable.
            self.db.rollback()
            print(f'----- 插入失败: {item["title"]}, {e} -----')
        else:
            print('----- 插入成功: insert success! -----')
            # NOTE(review): the original slept 1s per item here; per-request
            # throttling belongs in Scrapy's DOWNLOAD_DELAY, so it was removed.

        return item
-
总结
爬虫重在找网站的规律,只要能找到固定规律,代码就那么几行。