Using Scrapy's FilesPipeline to Download Videos

Three files do the work: settings.py configures logging, download delays, and the file store; spider.py reads video URLs out of MySQL and yields one item per file; pipelines.py subclasses FilesPipeline to control where each video is saved.
1. settings.py
```python
# Logging: level and (optionally) a file to write log output to
LOG_LEVEL = "INFO"
# LOG_STDOUT = True
# LOG_ENCODING = 'utf-8'
# Path hint: os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
# LOG_FILE = "info.log"

# Random download delay between 0 and 2 seconds
import random

DOWNLOAD_DELAY = random.random() + random.random()
RANDOMIZE_DOWNLOAD_DELAY = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'

# Where downloaded videos are stored
import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MEDIA_ALLOW_REDIRECTS = True                     # follow redirects on media downloads
FILES_STORE = os.path.join(BASE_DIR, "videos")   # root directory for saved files
FILES_URLS_FIELD = 'file_urls'    # matches the field declared in items.py
FILES_RESULT_FIELD = 'files'      # likewise matches items.py
# 120 days of delay for files expiration
# FILES_EXPIRES = 120             # days before a stored file is treated as expired


ITEM_PIPELINES = {
    'steam_video.pipelines.SteamVideoPipeline': 300,
    'steam_video.pipelines.SteamDownLoadPipeline': 100,  # video download pipeline
}
```
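The spider below also imports MySQL connection constants from this same settings module. They are not shown in the original, so here is a minimal sketch with placeholder values; every value is an assumption, and the port constant keeps the `MYSQL_POST` spelling the spider actually imports.

```python
# Hypothetical MySQL connection constants referenced by the spider below;
# all values are placeholders -- replace them with your own.
MYSQL_HOST = '127.0.0.1'
MYSQL_POST = 3306          # MySQL port; the spider imports it under this name
MYSQL_DATABASE = 'steam'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'password'
```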
2. spider.py
```python
import os
import re

import pymysql
import scrapy

from steam_video.settings import FILES_STORE as FILS
from steam_video.settings import MYSQL_HOST, MYSQL_POST, MYSQL_DATABASE, MYSQL_PASSWORD, MYSQL_USER


class VideosSpider(scrapy.Spider):
    name = 'videos'
    allowed_domains = ['.com']
    start_urls = ['https://www.baidu.com/']
    # Raise the size limits so large video files (up to ~12 GB) are not dropped.
    custom_settings = {
        'DOWNLOAD_MAXSIZE': 12406585060,
        'DOWNLOAD_WARNSIZE': 12406585060,
    }

    def parse(self, response):
        db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_POST, database=MYSQL_DATABASE,
                             user=MYSQL_USER, password=MYSQL_PASSWORD,
                             charset='utf8', autocommit=True)
        cursor = db.cursor()
        # Columns 2-6 hold the four video source URLs plus the poster image.
        cursor.execute(
            'SELECT id, appid, data_webm_source, data_webm_hd_source, data_mp4_source, '
            'data_mp4_hd_source, data_poster FROM yu_spider_app_video WHERE appid=730 AND id=53')
        for row in cursor.fetchall():
            for i in range(2, 7):
                item = {}
                item['id'] = row[0]
                item['appid'] = row[1]
                item['file_url'] = row[i]
                # Relative save path: "<appid % 100>/<appid>/<appid>_<id-in-url>_<basename>"
                item['video_name'] = (str(int(item['appid']) % 100) + '/' + str(item['appid']) + '/'
                                      + str(item['appid']) + '_' + re.findall(r'/(\d+)/', row[i])[0]
                                      + '_' + row[i].split('/')[-1].split('?')[0])
                print(item)
                print('*' * 100)
                # Skip files that have already been downloaded.
                file_path = os.path.join(FILS, item['video_name'])
                if not os.path.exists(file_path):
                    yield item
        cursor.close()
        db.close()
```
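Note that the spider yields plain dicts with a singular `file_url` key rather than the `file_urls` list named by `FILES_URLS_FIELD`; this only works because `SteamDownLoadPipeline` below overrides `get_media_requests` and reads that key directly. Given the `WHERE appid=730 AND id=53` query, one yielded item would look roughly like this (the URL is illustrative, not real data):

```python
# Shape of one yielded item; the URL is hypothetical.
item = {
    'id': 53,
    'appid': 730,
    'file_url': 'https://cdn.example.com/steam/apps/256658589/movie480.webm?t=0',
    'video_name': '30/730/730_256658589_movie480.webm',  # 730 % 100 == 30
}
```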
3. pipelines.py
```python
import scrapy
from scrapy.pipelines.files import FilesPipeline  # FilesPipeline lives in files, not images


class SteamVideoPipeline(object):
    def process_item(self, item, spider):
        return item


class SteamDownLoadPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # One request per item; carry the target path along in the meta dict.
        return [scrapy.Request(item['file_url'],
                               meta={'video_name': item.get('video_name', None)})]

    def file_path(self, request, response=None, info=None):
        # Save under the relative path prepared by the spider instead of the
        # default SHA1-based name; the storage backend creates any missing
        # intermediate directories itself.
        return request.meta.get('video_name')

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) tuples,
        # one per request issued in get_media_requests.
        print('******the results is********:', results)
        if results[0][0]:
            print('Download succeeded: {}'.format(item))
        else:
            print('Download failed: {}'.format(item))
        # Return the item so later pipelines (SteamVideoPipeline, at 300) still receive it.
        return item
```
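For reference, each entry in `results` is a two-element tuple: a boolean success flag, and on success a dict describing the stored file. A sketch of one successful entry (field values are made up; `path` is relative to `FILES_STORE`):

```python
# Illustrative shape of one entry in `results` on success:
(True, {
    'url': 'https://cdn.example.com/steam/apps/256658589/movie480.webm?t=0',
    'path': '30/730/730_256658589_movie480.webm',
    'checksum': 'd41d8cd98f00b204e9800998ecf8427e',
})
```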