First, create a spider project (baidu.com)
- # In settings.py
-
- BOT_NAME = 'mybaidu'
-
- SPIDER_MODULES = ['mybaidu.spiders']
- NEWSPIDER_MODULE = 'mybaidu.spiders'
- USER_AGENT = 'mybaidu (+http://www.yourdomain.com)'
- ROBOTSTXT_OBEY = False
-
-
-
-
-
- # In the spider file
- import scrapy
-
- class BaiduSpider(scrapy.Spider):
-     name = 'baidu'
-     allowed_domains = ['baidu.com']
-     start_urls = ['http://www.baidu.com/']
-
-     def parse(self, response):
-         print(len(response.text))
-
-
- # If this runs successfully, the project is set up correctly
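- For reference, the project is normally generated with "scrapy startproject mybaidu" plus "scrapy genspider baidu baidu.com", and run with "scrapy crawl baidu" from the project root. It can also be launched from a plain Python script; a minimal sketch, assuming it is executed from the project directory so that get_project_settings() finds settings.py:
-
- from scrapy.crawler import CrawlerProcess
- from scrapy.utils.project import get_project_settings
-
- # Run the 'baidu' spider defined above and block until it finishes
- process = CrawlerProcess(get_project_settings())
- process.crawl('baidu')
- process.start()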
-
In middlewares.py (two classes come with the generated project)
- from scrapy import signals
-
- class MybaiduSpiderMiddleware:
-     '''Spider middleware'''
-     @classmethod
-     def from_crawler(cls, crawler):
-         # Used by Scrapy to create the middleware and hook up signals
-         s = cls()
-         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-         return s
-
-     def process_spider_input(self, response, spider):
-         # Called for each response that goes through the spider middleware
-         # into the spider; returning None continues processing
-         return None
-
-     def process_spider_output(self, response, result, spider):
-         # Called with the results the spider returns after processing the
-         # response; must return an iterable of Requests or items
-         for i in result:
-             yield i
-
-     def process_spider_exception(self, response, exception, spider):
-         # Called when the spider or process_spider_input() raises an exception
-         pass
-
-     def process_start_requests(self, start_requests, spider):
-         # Called with the start requests of the spider
-         for r in start_requests:
-             yield r
-
-     def spider_opened(self, spider):
-         spider.logger.info('Spider opened: %s' % spider.name)
-
-
-
-
-
- class MybaiduDownloaderMiddleware:
-     '''Downloader middleware'''
-
-     @classmethod
-     def from_crawler(cls, crawler):
-         # Used by Scrapy to create the middleware and hook up signals
-         s = cls()
-         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-         return s
-
-     def process_request(self, request, spider):
-         '''Called for every request before it is sent to the downloader'''
-         # Returning None lets the request continue through the remaining middlewares
-         return None
-
-     def process_response(self, request, response, spider):
-         '''Called with every response returned from the downloader'''
-         # Must return a Response, a Request, or raise IgnoreRequest
-         return response
-
-     def process_exception(self, request, exception, spider):
-         '''Called when the download or process_request() raises an exception'''
-         pass
-
-     def spider_opened(self, spider):
-         spider.logger.info('Spider opened: %s' % spider.name)
-
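- These template classes only take effect once they are registered in settings.py; a new project generates the entries commented out. A minimal sketch of what enabling them looks like (543 is just the template's default order value):
-
- SPIDER_MIDDLEWARES = {
-     'mybaidu.middlewares.MybaiduSpiderMiddleware': 543,
- }
-
- DOWNLOADER_MIDDLEWARES = {
-     'mybaidu.middlewares.MybaiduDownloaderMiddleware': 543,
- }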
Using a downloader middleware to rotate the request User-Agent
- The downloader middleware is a framework of hooks into Scrapy's request/response processing:
- a lightweight, low-level system for globally modifying the requests and responses that pass through Scrapy.
-
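- Because every outgoing request passes through process_request(), a downloader middleware can also drop or short-circuit requests globally, not just edit headers. A small hypothetical example (the 'ads.' substring check is invented purely for illustration):
-
- from scrapy.exceptions import IgnoreRequest
-
- class BlockListMiddleware:
-     '''Hypothetical example: refuse to download requests to unwanted hosts'''
-     def process_request(self, request, spider):
-         if 'ads.' in request.url:
-             # Raising IgnoreRequest drops the request before it is downloaded
-             raise IgnoreRequest('blocked by BlockListMiddleware')
-         return None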
- # Add a User-Agent list in settings.py
-
- USER_AGENTS = [
- "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
- "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
- "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
- "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
- "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
- ]
-
-
-
-
- # In middlewares.py
-
- import random
- # from faker import Faker  # optional: the faker library can also generate user agents
- from scrapy import signals
- from mybaidu.settings import USER_AGENTS
-
- class UserAgentDownloadMiddleware:
-     '''Custom downloader middleware'''
-     def process_request(self, request, spider):
-         '''Called before every request is sent'''
-         useragent = random.choice(USER_AGENTS)
-         # f = Faker()              # this library ships with user-agent data and can be used instead
-         # useragent = f.user_agent()
-         request.headers["User-Agent"] = useragent
-         print(request.headers)
-         return None
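- A slightly more idiomatic variant (a sketch, not part of the original notes) reads the USER_AGENTS list from the crawler settings in from_crawler() instead of importing the settings module directly:
-
- import random
-
- class UserAgentFromSettingsMiddleware:
-     '''Hypothetical variant: pull the UA list from settings via the crawler'''
-     def __init__(self, user_agents):
-         self.user_agents = user_agents
-
-     @classmethod
-     def from_crawler(cls, crawler):
-         # getlist() returns [] if USER_AGENTS is not defined in settings.py
-         return cls(crawler.settings.getlist('USER_AGENTS'))
-
-     def process_request(self, request, spider):
-         if self.user_agents:
-             request.headers['User-Agent'] = random.choice(self.user_agents)
-         return None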
-
-
-
-
- # In settings.py
- DOWNLOADER_MIDDLEWARES = {
-     'mybaidu.middlewares.MybaiduDownloaderMiddleware': 543,
-     'mybaidu.middlewares.UserAgentDownloadMiddleware': 300,  # the custom middleware defined above
- }
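- The number is the middleware's order value: for process_request(), middlewares with lower values run first (closer to the engine). Optionally, Scrapy's built-in user-agent middleware can be disabled so that only the custom one touches the header; a sketch, not required for the example above:
-
- DOWNLOADER_MIDDLEWARES = {
-     'mybaidu.middlewares.UserAgentDownloadMiddleware': 300,
-     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # disable the built-in UA middleware
- }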
-
Using a downloader middleware to rotate proxy IPs
- # In settings.py: register the middlewares and add a proxy IP pool
-
- DOWNLOADER_MIDDLEWARES = {
-     'mybaidu.middlewares.MybaiduDownloaderMiddleware': 543,
-     'mybaidu.middlewares.UserAgentDownloadMiddleware': 300,
-     'mybaidu.middlewares.RandomProxy': 350,  # give each middleware its own order value
- }
-
-
-
- PROXIES = [
- {'ip_port': '111.8.60.9:8123'},
- {'ip_port': '101.71.27.120:80'},
- {'ip_port': '122.96.59.104:80'},
- {'ip_port': '122.224.249.122:8088'},
- ]
-
-
-
-
-
- # In middlewares.py
- import random
- from mybaidu.settings import PROXIES
-
- class RandomProxy(object):
-     def process_request(self, request, spider):
-         proxy = random.choice(PROXIES)
-         # meta['proxy'] must be a full proxy URL; every request now goes through a random proxy
-         request.meta['proxy'] = 'http://' + proxy['ip_port']
-         print(request.meta)
-
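- If the proxies require authentication, the simplest approach is to embed the credentials in the proxy URL; Scrapy's built-in HttpProxyMiddleware extracts them and sends the Proxy-Authorization header. A minimal sketch (user and password are placeholders):
-
- import random
- from mybaidu.settings import PROXIES
-
- class RandomAuthProxy(object):
-     '''Hypothetical variant for proxies that need a username/password'''
-     def process_request(self, request, spider):
-         proxy = random.choice(PROXIES)
-         # placeholder credentials; in practice store them alongside each proxy entry
-         request.meta['proxy'] = 'http://user:password@' + proxy['ip_port']
-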
POST requests in Scrapy
- Create a new Scrapy project
- In the main spider file
- import scrapy
-
-
- class MyfanyiSpider(scrapy.Spider):
-     name = 'myfanyi'
-     allowed_domains = ['fanyi.baidu.com']
-
-     def start_requests(self):
-         print('sending request')
-         yield scrapy.FormRequest(               # FormRequest sends a POST with form-encoded data
-             url='http://fanyi.baidu.com/sug',
-             formdata={'kw': '你好'},             # '你好' ("hello") is the word to translate
-             callback=self.parse_item
-         )
-
-     def parse_item(self, response):
-         print(response.text)
-         print(response.json().get('data')[0].get('v'))   # response.json() requires Scrapy 2.2+
-
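- FormRequest sends the body as application/x-www-form-urlencoded. For APIs that expect a JSON body instead, Scrapy also provides JsonRequest; a minimal sketch (the spider name, endpoint, and payload are placeholders, httpbin.org/post simply echoes what it receives):
-
- import scrapy
- from scrapy.http import JsonRequest
-
- class JsonApiSpider(scrapy.Spider):
-     '''Hypothetical spider posting a JSON body'''
-     name = 'jsonapi'
-
-     def start_requests(self):
-         yield JsonRequest(
-             url='http://httpbin.org/post',   # placeholder echo endpoint
-             data={'kw': 'hello'},            # serialized to JSON, Content-Type: application/json
-             callback=self.parse_item
-         )
-
-     def parse_item(self, response):
-         print(response.json())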