First, create a spider project (baidu.com).
# in settings.py
BOT_NAME = 'mybaidu'
SPIDER_MODULES = ['mybaidu.spiders']
NEWSPIDER_MODULE = 'mybaidu.spiders'
USER_AGENT = 'mybaidu (+http://www.yourdomain.com)'
ROBOTSTXT_OBEY = False

# in the spider file
import scrapy

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        print(len(response.text))
# if this prints the response length, the project runs correctly
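To launch the spider, run scrapy crawl baidu from the project directory; a common convenience is a small launcher script (a sketch, the file name main.py is my own choice):
# main.py, placed at the project root next to scrapy.cfg
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'baidu'])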
In middlewares.py (two classes are generated by default):
from scrapy import signals

class MybaiduSpiderMiddleware:
    '''Spider middleware'''
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class MybaiduDownloaderMiddleware:
    '''Downloader middleware'''
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        '''Called before each request is sent'''
        return None

    def process_response(self, request, response, spider):
        '''Called for each response that passes through the middleware'''
        return response

    def process_exception(self, request, exception, spider):
        '''Called when an exception is raised while downloading'''
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
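The return value of process_request controls what happens next: returning None lets the request continue down the chain to the downloader, while returning a Response skips the download entirely and hands that response to the process_response chain. A minimal sketch (the class name and the URL filter are hypothetical):
from scrapy.http import HtmlResponse

class SkipImagesMiddleware:
    def process_request(self, request, spider):
        # Hypothetical rule: don't download image URLs, hand back an empty response instead.
        if request.url.endswith(('.png', '.jpg')):
            return HtmlResponse(url=request.url, body=b'', request=request)
        return None  # anything else proceeds to the downloader as usual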
Using a downloader middleware to rotate random User-Agents
The downloader middleware is a framework of hooks into Scrapy's request/response processing: a light, low-level system for globally altering the requests and responses that Scrapy handles.
# add a User-Agent list in settings.py
USER_AGENTS = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
]
# in middlewares.py
import random

from faker import Faker
from scrapy import signals

from mybaidu.settings import USER_AGENTS

class UserAgentDownloadMiddleware:
    '''Custom downloader middleware'''
    def process_request(self, request, spider):
        '''Called before each request is sent'''
        useragent = random.choice(USER_AGENTS)
        # f = Faker()               # the faker package bundles User-Agents and can be used instead
        # useragent = f.user_agent()
        # print(useragent)
        request.headers["User-Agent"] = useragent
        print(request.headers)
        return None
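Instead of importing settings.py directly, the middleware can also receive the list through from_crawler; a sketch under that assumption (the class name SettingsUserAgentMiddleware is my own, and random is already imported above):
class SettingsUserAgentMiddleware:
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # read the USER_AGENTS list defined in settings.py
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None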
# in settings.py
DOWNLOADER_MIDDLEWARES = {
    'mybaidu.middlewares.MybaiduDownloaderMiddleware': 543,
    'mybaidu.middlewares.UserAgentDownloadMiddleware': 300,  # our custom middleware
}
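Optionally, the built-in scrapy.downloadermiddlewares.useragent.UserAgentMiddleware can be disabled so only the custom middleware touches the header (not strictly required, since the built-in one only fills the header when it is missing):
DOWNLOADER_MIDDLEWARES = {
    'mybaidu.middlewares.UserAgentDownloadMiddleware': 300,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # disable the default UA middleware
}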
Using a downloader middleware to rotate random proxy IPs
# add the proxy pool in settings.py
DOWNLOADER_MIDDLEWARES = {
    'mybaidu.middlewares.MybaiduDownloaderMiddleware': 543,
    'mybaidu.middlewares.UserAgentDownloadMiddleware': 300,
    'mybaidu.middlewares.RandomProxy': 310,  # give each custom middleware its own priority
}
PROXIES = [
    {'ip_port': '111.8.60.9:8123'},
    {'ip_port': '101.71.27.120:80'},
    {'ip_port': '122.96.59.104:80'},
    {'ip_port': '122.224.249.122:8088'},
]
# in middlewares.py
from mybaidu.settings import PROXIES

class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        # request.meta['proxy'] expects a URL string, not the dict itself
        request.meta['proxy'] = 'http://' + proxy['ip_port']  # every request now goes through a proxy
        print(request.meta)
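If a proxy requires a username and password, the credentials go into a Proxy-Authorization header; a sketch with placeholder credentials and a placeholder address:
import base64

class AuthProxyMiddleware:
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://111.8.60.9:8123'  # placeholder proxy address
        # 'user:password' is a placeholder; encode real credentials the same way
        auth = base64.b64encode(b'user:password').decode('ascii')
        request.headers['Proxy-Authorization'] = 'Basic ' + auth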
POST requests in Scrapy
Create a new scrapy project.
In the main spider file:
import scrapy

class MyfanyiSpider(scrapy.Spider):
    name = 'myfanyi'
    allowed_domains = ['fanyi.baidu.com']

    def start_requests(self):
        print('sending request')
        yield scrapy.FormRequest(             # send a POST request with form data
            url='http://fanyi.baidu.com/sug',
            formdata={'kw': '你好'},
            callback=self.parse_item
        )

    def parse_item(self, response):
        print(response.text)
        print(response.json().get('data')[0].get('v'))
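FormRequest sends form-encoded data; for an endpoint that expects a JSON body, scrapy.http.JsonRequest (Scrapy 1.8+) serializes the dict and sets the Content-Type header. A sketch, using httpbin.org only as a test endpoint:
import scrapy
from scrapy.http import JsonRequest

class JsonPostSpider(scrapy.Spider):
    name = 'jsonpost'

    def start_requests(self):
        yield JsonRequest(
            url='https://httpbin.org/post',
            data={'kw': 'hello'},          # serialized as the JSON request body
            callback=self.parse_item,
        )

    def parse_item(self, response):
        print(response.json())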