scrapy 爬取图片
1.scrapy 有下载图片的自带接口,不用我们再去实现
setting.py设置
-
- LOG_LEVEL = "INFO"
-
-
-
-
-
-
- import random
- DOWNLOAD_DELAY = random.random() + random.random()
- RANDOMIZE_DOWNLOAD_DELAY = True
-
-
- USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
-
-
- import os
- BASE_DIR = os.path.dirname((os.path.abspath(__file__)))
- MEDIA_ALLOW_REDIRECTS = True
- IMAGES_STORE = os.path.join(BASE_DIR, "images")
-
-
- IMAGES_THUMBS = {
- 'small': (50, 50),
- 'big': (270, 270),
- }
-
- # 设置图文件过期时间 30天
- IMAGES_EXPIRES = 30
-
- IMAGES_MIN_WIDTH = 110
- IMAGES_MIN_HEIGHT = 110
-
- ITEM_PIPELINES = {
- 'steam_image.pipelines.SteamImagePipeline': 300,
- 'steam_image.pipelines.SteamDownLoadPipeline': 100,
- }
2.spider
-
- import scrapy, pymysql, copy
-
-
- class ImagesSpider(scrapy.Spider):
- name = 'images'
- allowed_domains = ['.com']
- start_urls = ['https://www.baidu.com/']
-
- def parse(self, response):
- db = pymysql.connect(host='localhost', port=3306, database='game', user='root', password='root',
- charset='utf8', autocommit=True)
- cursor = db.cursor()
- cursor.execute(
- 'SELECT id, appid, steam_image, steam_image_600_338 from steam_game_image WHERE id<5')
- for appid in cursor.fetchall():
- for i in range(2, 4):
- item = {}
- item['id'] = appid[0]
- item['appid'] = appid[1]
- item['image_url'] = appid[i]
- item['img_name'] = str(item['appid']) + '_' + appid[i].split('/')[-1].split('?')[0]
- yield item
-
3.pipelines.py
-
- from scrapy.pipelines.images import ImagesPipeline
- import scrapy, os
- from steam_image.settings import IMAGES_STORE as IMGS
-
-
- class SteamImagePipeline(object):
- def process_item(self, item, spider):
- return item
-
-
-
- class SteamDownLoadPipeline(ImagesPipeline):
- def get_media_requests(self, item, info):
- yield scrapy.Request(item['image_url'])
-
- def item_completed(self, results, item, info):
- print('******the results is********:', results)
-
-
- if results[0][0]:
- try:
- os.rename(IMGS + '\\' + results[0][1]['path'],
- IMGS + '\\' + item['img_name'])
- except Exception as e:
- print('错误类型:{}'.format(e))
-
- def close_spider(self, spider):
-
- os.removedirs(IMGS + '\\' + 'full')
-