2025年4月5日 星期六 乙巳(蛇)年 正月初六 夜 设为首页 加入收藏
rss
您当前的位置:首页 > 计算机 > 编程开发 > Python

Scrapy爬取视频

时间:07-03来源:作者:点击数:29

利用FilesPipeline 下载视频

1.settings.py

  • # 保存log信息的文件名
  • LOG_LEVEL = "INFO"
  • # LOG_STDOUT = True
  • # LOG_ENCODING = 'utf-8'
  • # # 路径 os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
  • # LOG_FILE = "info.log"
  • # 下载延迟
  • import random
  • DOWNLOAD_DELAY = random.random() + random.random()
  • RANDOMIZE_DOWNLOAD_DELAY = True
  • # Crawl responsibly by identifying yourself (and your website) on the user-agent
  • USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
  • # 视频下载路径
  • import os
  • BASE_DIR = os.path.dirname((os.path.abspath(__file__)))
  • MEDIA_ALLOW_REDIRECTS = True
  • FILES_STORE = os.path.join(BASE_DIR, "videos") # 文件保存路径
  • FILES_URLS_FIELD = 'file_urls' # 这里对应着item.py文件中的字段
  • FILES_RESULT_FIELD = 'files' # 同样对应item.py文件中的字段
  • # 120 days of delay for files expiration
  • # FILES_EXPIRES = 120 # 设置文件失效时间
  • ITEM_PIPELINES = {
  • 'steam_video.pipelines.SteamVideoPipeline': 300,
  • 'steam_video.pipelines.SteamDownLoadPipeline': 100, # 视频下载的管道
  • }

2.spider.py

  • # -*- coding: utf-8 -*-
  • import os
  • import pymysql
  • import re
  • import scrapy
  • from steam_video.settings import FILES_STORE as FILS
  • from steam_video.settings import MYSQL_HOST, MYSQL_POST, MYSQL_DATABASE, MYSQL_PASSWORD, MYSQL_USER
  • class VideosSpider(scrapy.Spider):
  • name = 'videos'
  • allowed_domains = ['.com']
  • start_urls = ['https://www.baidu.com/']
  • custom_settings = {
  • # 'DOWNLOAD_TIMEOUT': 60000,
  • 'DOWNLOAD_MAXSIZE': 12406585060, # 视频最大大小
  • 'DOWNLOAD_WARNSIZE': 12406585060 # 视频下载最大提醒尺寸
  • }
  • def parse(self, response):
  • db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_POST, database=MYSQL_DATABASE, user=MYSQL_USER,
  • password=MYSQL_PASSWORD,
  • charset='utf8', autocommit=True)
  • cursor = db.cursor()
  • cursor.execute(
  • 'SELECT id, appid, data_webm_source, data_webm_hd_source, data_mp4_source, data_mp4_hd_source, data_poster from yu_spider_app_video WHERE appid=730 and id=53') # 获取图片url
  • for appid in cursor.fetchall():
  • for i in range(2, 7):
  • item = {}
  • item['id'] = appid[0]
  • item['appid'] = appid[1]
  • item['file_url'] = appid[i] # 下载视频的url 前面要自己获取到 视频的url 可以自己爬
  • item['video_name'] = str(int(item['appid']) % 100) + '/' + str(item['appid']) + '/' + str(
  • item['appid']) + '_' + re.findall(r'/(\d+)/', appid[i])[0] + '_' + \
  • appid[i].split('/')[-1].split('?')[0] # 后面图片要命名的名称
  • print(item)
  • print('*' * 100)
  • file_path = FILS + '/' + item['video_name']
  • if not os.path.exists(file_path): # 判断视频是否存在
  • yield item

3.pipelines.py

  • # -*- coding: utf-8 -*-
  • from scrapy.pipelines.images import FilesPipeline # 导入文件下载类
  • import scrapy, os, hashlib
  • from scrapy.http import Request
  • from scrapy.utils.python import to_bytes
  • from steam_video.settings import FILES_STORE as FILS
  • class SteamVideoPipeline(object):
  • def process_item(self, item, spider):
  • return item
  • # 下载图片管道
  • class SteamDownLoadPipeline(FilesPipeline):
  • def get_media_requests(self, item, info):
  • return scrapy.Request(item['file_url'],
  • meta={'video_name': item.get('video_name', None)}) # 下载视频 video_name为视频名称
  • def file_path(self, request, response=None, info=None):
  • def _warn():
  • from scrapy.exceptions import ScrapyDeprecationWarning
  • import warnings
  • warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
  • 'file_path(request, response=None, info=None) instead',
  • category=ScrapyDeprecationWarning, stacklevel=1)
  • # check if called from file_key with url as first argument
  • if not isinstance(request, Request):
  • _warn()
  • url = request
  • else:
  • url = request.url
  • # detect if file_key() method has been overridden
  • if not hasattr(self.file_key, '_base'):
  • _warn()
  • return self.file_key(url)
  • media_guid = hashlib.sha1(to_bytes(url)).hexdigest() # change to request.url after deprecation
  • media_ext = os.path.splitext(url)[1] # change to request.url after deprecation
  • # 这里我们使用自定义的文件名,如果meta中没有video_name,就使用url的hash值作为文件名
  • file_path = FILS + '\\' + request.meta.get('video_name')
  • file = file_path.replace('\\{}'.format(file_path.split('\\')[-1]), '')
  • if not os.path.exists(file):
  • os.makedirs(file)
  • return '{}'.format(request.meta.get('video_name'))
  • # return 'full/%s.mp4' % (''.join(request.meta.get('video_name', media_guid).split(' ')))
  • def file_key(self, url): # 服务器部署需要家里这个代码
  • return self.file_path(url)
  • file_key._base = True
  • def item_completed(self, results, item, info):
  • print('******the results is********:',
  • results) # resulte [(True, {'url': 'https://media.st.dl.bscstorage.net/steam/apps/904/movie480.mp4?t=1569623096', 'path': '220_904_movie480.mp4', 'checksum': '0f22435cdfe2d605480fc5396160d3a5'})]
  • if results[0][0]: # 判断视频是否下载成功
  • print('下载成功:{}'.format(item))
  • else:
  • print('下载失败:{}'.format(item))
方便获取更多学习、工作、生活信息请关注本站微信公众号城东书院 微信服务号城东书院 微信订阅号
推荐内容
相关内容
栏目更新
栏目热门