
Using Scrapy to crawl the first 20 pages of video data from Xinpianchang and store them in a MySQL database


1. Create the Scrapy project

scrapy startproject XPC

2. Enter the project directory and create the Spider with the genspider command (note that the allowed domains will need to be extended later)

scrapy genspider xpc xinpianchang.com
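The generated spiders/xpc.py is a bare skeleton roughly like the sketch below (the exact template depends on the Scrapy version); it is filled in and its allowed_domains list is extended in step 4, because the playback addresses are fetched from a second domain.

# spiders/xpc.py as generated by genspider (approximate; template varies by Scrapy version)
import scrapy

class XpcSpider(scrapy.Spider):
    name = 'xpc'
    allowed_domains = ['xinpianchang.com']
    start_urls = ['http://xinpianchang.com/']

    def parse(self, response):
        pass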

3. Define the data to scrape (items.py)

# -*- coding: utf-8 -*-
import scrapy

class XpcItem(scrapy.Item):
    # video id
    v_id = scrapy.Field()
    # video title
    video_name = scrapy.Field()
    # video category
    category = scrapy.Field()
    # upload time
    up_time = scrapy.Field()
    # play count
    play_counts = scrapy.Field()
    # like count
    like_counts = scrapy.Field()
    # video playback URL
    video_url = scrapy.Field()
    # video description
    video_info = scrapy.Field()
    # JSON API URL; this page exposes the playback address video_url
    json_url = scrapy.Field()
    # video detail page URL
    video_detail_url = scrapy.Field()
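A scrapy.Item behaves like a dict, so the spider in the next step simply fills the declared fields by key; a minimal sketch (the values are placeholders, for illustration only):

# Minimal sketch of how XpcItem is used; the values are made up
item = XpcItem()
item['v_id'] = '123456'        # hypothetical id
item['video_name'] = 'demo'
print(dict(item))              # prints the two fields that were set
# Assigning a key that was not declared as a Field() raises KeyError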

4. Write the Spider that extracts the item data (spiders/xpc.py)

# -*- coding: utf-8 -*-
# Scrape works from Xinpianchang
import re
import datetime
import json

import scrapy

from ..items import XpcItem


class XpcSpider(scrapy.Spider):
    name = 'xpc'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-1']

    def parse(self, response):
        # Collect the video ids, 40 per page
        video_id = response.xpath('//div[@class="channel-con"]/ul[@class="video-list"]/li/@data-articleid').extract()
        for id in video_id:
            # Video detail page URL
            video_detail_url = 'https://www.xinpianchang.com/a{}'.format(id)
            yield scrapy.Request(url=video_detail_url, meta={'meta_1': video_detail_url}, callback=self.video_detail)
        # Only the first 20 pages are available without logging in
        total_page = 20
        for page in range(2, total_page + 1):
            print("Processing page %s..." % page)
            url = 'https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-'
            yield scrapy.Request(url=url + str(page), callback=self.parse)

    # Video detail page
    def video_detail(self, response):
        # Pause the spider at this point to inspect the response being processed
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        meta_1 = response.meta['meta_1']
        # with open(meta_1.split('a')[-1] + ".html", 'w', encoding='utf-8') as f:
        #     f.write(response.text)
        item = XpcItem()
        # Video detail page URL
        item['video_detail_url'] = meta_1
        item['v_id'] = meta_1.split('a')[-1]
        # Video title
        video_name = response.xpath('//div[@class="title-wrap"]/h3/text()').extract_first()
        item['video_name'] = video_name.strip()
        # Video category
        # category = response.xpath('//span/span[contains(@class,"cate")]//text()').extract()
        # item['category'] = "".join([s.strip() for s in category])
        # A video may have several categories: count them first, then take the odd-numbered
        # spans, because the even-numbered spans are just the "|" separators
        category_count = len(response.xpath("//span[contains(@class,'cate-box')]/span/a[1]"))
        if category_count > 1:
            category_list = []
            for i in range(1, category_count + 1):
                c = response.xpath("//span[contains(@class,'cate-box')]/span[" + str(2 * i - 1) + "]/a/text()").extract()
                category_list.append("-".join([s.strip() for s in c]))
            item['category'] = ",".join(category_list)
        else:
            category = response.xpath('//span/span[contains(@class,"cate")]//text()').extract()
            item['category'] = "".join([s.strip() for s in category])
        # Upload time; the page may show "昨天" (yesterday) or "今天" (today), which
        # have to be converted into a real date
        up_time = response.xpath('//div/span[contains(@class,"update-time")]/i/text()').get()
        today = datetime.datetime.today()
        if '昨天' in up_time:
            yes = today - datetime.timedelta(days=1)
            up_time = up_time.replace('昨天', yes.strftime("%Y-%m-%d"))
        elif '今天' in up_time:
            up_time = up_time.replace('今天', today.strftime("%Y-%m-%d"))
        item['up_time'] = up_time
        # Play count
        play_counts = response.xpath('//div/i[contains(@class,"play-counts")]/@data-curplaycounts').get()
        item['play_counts'] = play_counts
        # Like count
        like_counts = response.xpath('//span/span[contains(@class,"like-counts")]/@data-counts').get()
        item['like_counts'] = like_counts
        # Video playback URL
        # video_url = response.xpath('//*[@id="xpc_video"]/source/@src').extract_first()
        # item['video_url'] = video_url.strip()
        # Video description
        video_info = response.xpath('//div[@class="filmplay-info"]/div/p[1]/text()').extract()
        video_info = [s.strip() for s in video_info]
        item['video_info'] = ','.join(video_info)
        # data-vid is part of the JSON API URL, e.g. 960VAm7OGE7DRnW8
        # https://openapi-vtom.vmovier.com/v3/video/960VAm7OGE7DRnW8?expand=resource&usage=xpc_web&appKey=61a2f329348b3bf77
        # Option 1: get data_vid via XPath
        # data_vid = response.xpath('//div[@class="filmplay-data"]/div/span/a/@data-vid').extract_first()
        # Option 2: get data_vid via a regular expression
        patt_vid = re.compile(r'vid = "(\w+)";')
        data_vid = patt_vid.findall(response.text)[0]
        # modeServerAppKey=61a2f329348b3bf77; it is unclear whether this value ever changes
        patt_modeServerAppKey = re.compile(r'modeServerAppKey = "(\w+)";')
        data_modeServerAppKey = patt_modeServerAppKey.findall(response.text)[0]
        # JSON API URL; this page exposes the playback address video_url
        json_url = 'https://openapi-vtom.vmovier.com/v3/video/{}?expand=resource&usage=xpc_web&appKey={}'.format(data_vid, data_modeServerAppKey)
        item['json_url'] = json_url
        yield scrapy.Request(url=json_url, meta={'meta_2': item}, callback=self.video_address)

    # Video playback address
    def video_address(self, response):
        item = XpcItem()
        meta_2 = response.meta['meta_2']
        item['v_id'] = meta_2['v_id']
        item['video_name'] = meta_2['video_name']
        item['video_detail_url'] = meta_2['video_detail_url']
        item['video_info'] = meta_2['video_info']
        item['json_url'] = meta_2['json_url']
        item['category'] = meta_2['category']
        item['up_time'] = meta_2['up_time']
        item['play_counts'] = meta_2['play_counts']
        item['like_counts'] = meta_2['like_counts']
        json_html = json.loads(response.text)
        # resource contains several qualities ('default', 'progressive', 'lowest'),
        # so check which key is present before reading the URL
        resource = json_html['data']['resource']
        if 'default' in resource.keys():
            item['video_url'] = json_html['data']['resource']['default']['url']
        elif 'progressive' in resource.keys():
            item['video_url'] = json_html['data']['resource']['progressive'][0]['url']
        else:
            item['video_url'] = json_html['data']['resource']['lowest']['url']
        yield item
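Before running the full crawl, the XPath selectors and the two regular expressions can be checked interactively with scrapy shell; a sketch (the detail-page URL is a placeholder, and the selectors only match if the page layout has not changed):

# In a terminal: scrapy shell "https://www.xinpianchang.com/a<some-video-id>"
# Then, inside the shell:
response.xpath('//div[@class="title-wrap"]/h3/text()').extract_first()
response.xpath('//div/i[contains(@class,"play-counts")]/@data-curplaycounts').get()
import re
re.findall(r'vid = "(\w+)";', response.text)               # should return one vid
re.findall(r'modeServerAppKey = "(\w+)";', response.text)  # should return one app key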

5. Write the pipeline that saves the results to the database (pipelines.py)

# -*- coding: utf-8 -*-
import pymysql


class MySqlPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Read the database configuration from settings.py
        cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
        cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
        cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
        cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
        cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
        cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
        return cls()

    def __init__(self):
        self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT, user=self.MYSQL_USER,
                                  passwd=self.MYSQL_PASSWD, db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        try:
            # Create the xpc table if it does not exist yet
            # self.cursor.execute('DROP TABLE IF EXISTS xpc')
            sql = 'CREATE TABLE IF NOT EXISTS xpc(v_id BIGINT primary key not null COMMENT "video page id",' \
                  'video_name varchar(200),category varchar(100),up_time VARCHAR(50),play_counts INT(13),like_counts INT(13),' \
                  'video_detail_url varchar(100),video_url varchar(200),video_info LONGTEXT,' \
                  'json_url varchar(300))ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;'
            self.cursor.execute(sql)
        except Exception as e:
            print("Table xpc already exists, no need to create it!")
        try:
            # Deduplication: check whether this v_id is already stored
            self.cursor.execute("SELECT v_id FROM xpc WHERE v_id=%s;", (item['v_id'],))
            repetition = self.cursor.fetchone()
            keys, values = zip(*item.items())
            # If the row already exists, update it instead of inserting a duplicate
            if repetition:
                # ON DUPLICATE KEY UPDATE: when the key already exists only the listed fields
                # are updated; a plain INSERT of a duplicate key would raise an error
                sql = """
                INSERT INTO xpc({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};""".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values)),
                    ','.join(['{}=%s'.format(k) for k in keys]))
                self.cursor.execute(sql, values * 2)
            else:
                sql = """
                INSERT INTO xpc({}) VALUES ({});""".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values)))
                self.cursor.execute(sql, values)
            self.db.commit()
            # print(self.cursor._last_executed)
            return item
        except Exception as e:
            print("ERROR:", e)
            self.db.rollback()

    def close_spider(self, spider):
        print("MySQL processing finished")
        self.cursor.close()
        self.db.close()
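To see what the ON DUPLICATE KEY UPDATE branch actually sends to MySQL, the string formatting can be reproduced in isolation; a sketch with two made-up fields:

# Reproduces the SQL built in process_item for two illustrative keys (not real item data)
keys = ('v_id', 'video_name')
values = ('123456', 'demo')
sql = "INSERT INTO xpc({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};".format(
    ','.join(keys),
    ','.join(['%s'] * len(values)),
    ','.join(['{}=%s'.format(k) for k in keys]))
print(sql)
# INSERT INTO xpc(v_id,video_name) VALUES (%s,%s) ON DUPLICATE KEY UPDATE v_id=%s,video_name=%s;
# cursor.execute(sql, values * 2) then fills all four %s placeholders in order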

6. Configure the settings file (settings.py)

ROBOTSTXT_OBEY = False
# Database configuration
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
MYSQL_DBNAME = 'python5'
MYSQL_CHARSET = 'utf8mb4'
DOWNLOAD_DELAY = 3
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
ITEM_PIPELINES = {
    'XPC.pipelines.MySqlPipeline': 300,
}
# Optionally write the log to a local file
LOG_FILE = "xpc.log"
LOG_LEVEL = "DEBUG"
# Also redirect print output into the log
LOG_STDOUT = True

7. With everything set up, run the crawl command to start the Spider:

scrapy crawl xpc
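Once the crawl finishes, the stored rows can be checked from Python; a small sketch using the same connection values as settings.py (adjust the credentials if yours differ):

# Count the stored videos and show a few rows (connection values assumed from settings.py)
import pymysql

db = pymysql.connect(host='localhost', port=3306, user='root',
                     passwd='123456', db='python5', charset='utf8mb4')
cursor = db.cursor()
cursor.execute('SELECT COUNT(*) FROM xpc;')
print('rows:', cursor.fetchone()[0])
cursor.execute('SELECT v_id, video_name, play_counts FROM xpc ORDER BY play_counts DESC LIMIT 5;')
for row in cursor.fetchall():
    print(row)
cursor.close()
db.close()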

 
