Use XPath to extract the src attribute of each image. The spider only needs to parse the img src value and submit it to the pipeline inside an item; the image pipeline then sends a request for that src and downloads the image.
Spider file:
```python
import scrapy

from imgsPro.items import ImgsproItem


class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sc.chinaz.com/tupian/']

    def parse(self, response):
        src_list = response.xpath('//div[@id="container"]/div')
        # print(src_list)
        for src_item in src_list:
            # Lazy loading: the page keeps the real URL in a pseudo attribute (src2)
            # and only copies it into src when the image scrolls into view.
            # Scrapy never scrolls the page, so read src2 directly.
            src_content = src_item.xpath('./div/a/img/@src2').extract_first()
            print(src_content)
            item = ImgsproItem()
            item['src'] = src_content

            yield item
```
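The spider above relies on an ImgsproItem with a single src field. The items.py file is not shown in the original; a minimal sketch, assuming the project is named imgsPro as the settings below suggest, could look like this:

```python
import scrapy


class ImgsproItem(scrapy.Item):
    # holds the image URL extracted from the src2 pseudo attribute
    src = scrapy.Field()
```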
The custom image pipeline overrides three methods of the parent class ImagesPipeline (see the pipeline file below):
- get_media_requests
- file_path
- item_completed
Pipeline file:
```python
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImgsproPipeline(object):
    def process_item(self, item, spider):
        return item


# ImagesPipeline is a pipeline class dedicated to file downloads;
# the download process supports asynchronous and multi-threaded requests.
class ImgPipeLine(ImagesPipeline):

    # Issue a request for the image URL stored in the item
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['src'])

    # Customize the image file name (the returned path is relative to IMAGES_STORE)
    def file_path(self, request, response=None, info=None):
        imgName = request.url.split('/')[-1]
        return imgName

    # Called once all image requests for the item have finished
    def item_completed(self, results, item, info):
        return item  # hand the item on to the next pipeline class in line
```
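item_completed receives a results list of (success, file_info_or_failure) tuples, one per request issued in get_media_requests. As a sketch that goes beyond the original code, a hypothetical StrictImgPipeLine variant could use that list to drop items whose download failed:

```python
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


# Hypothetical variant of ImgPipeLine, shown only to illustrate item_completed
class StrictImgPipeLine(ImagesPipeline):
    def item_completed(self, results, item, info):
        # keep only the entries whose download succeeded
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image download failed for %s' % item.get('src'))
        return item
```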
Settings file (settings.py):
```python
# USER_AGENT = 'firstBlood (+http://www.yourdomain.com)'
# Spoof the User-Agent of the request
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False  # ignore the robots.txt protocol

# Only show log messages of the specified level
LOG_LEVEL = 'ERROR'

# Directory where the downloaded images will ultimately be stored
IMAGES_STORE = './imgs'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32


# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'imgsPro.pipelines.ImgsproPipeline': 300,
    'imgsPro.pipelines.ImgPipeLine': 200,
}
```
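With these settings in place, the spider is started from the project root with `scrapy crawl img`. Note that Scrapy's ImagesPipeline depends on the Pillow library, so it must be installed for the image downloads to work.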