The crawlers I wrote earlier could only fetch a small amount of data, because a single IP that hits a site too frequently gets flagged as a bot and blocked. The fix is to route requests through proxy IPs, which in practice means passing a proxies argument when sending each request. Proxies come in paid and free varieties; free ones are usually unstable or outright dead. Our goal here is to scrape free proxy IPs from https://www.xicidaili.com/nn/, verify that each one actually works, and save the good ones to a database, refreshing the list regularly.
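As a quick reference, this is what passing a proxy to requests looks like. A minimal sketch; the address below is a placeholder from the TEST-NET range, not a live proxy:

import requests

# Placeholder proxy address -- substitute a working proxy of your own.
proxies = {
    'http': 'http://203.0.113.1:8080',
    'https': 'http://203.0.113.1:8080',
}
r = requests.get('https://www.ipip.net/', proxies=proxies, timeout=5)
print(r.status_code)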
The modules we need to import:
import pymysql
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import threading
import time
Looking at the page source, each proxy's information sits in a tr tag, with the details in its td cells.
The plan: fetch the tr elements inside the worker thread, then write a helper that walks them and extracts the useful fields into a list of dicts.
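To make the structure concrete, here is a simplified mock-up of one table row (my own sketch, not the site's actual markup) and how the fields come out:

from bs4 import BeautifulSoup

# Simplified mock-up of one row of the proxy table (not the real markup).
html = '''
<table>
  <tr>
    <td><img src="cn.png"/></td>
    <td>203.0.113.1</td>
    <td>8080</td>
    <td>Beijing</td>
    <td>high-anon</td>
    <td>HTTP</td>
  </tr>
</table>
'''
tds = BeautifulSoup(html, 'lxml').find('tr').find_all('td')
print(tds[1].text, tds[2].text, tds[5].text.lower())  # 203.0.113.1 8080 http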
def parse_msg(trs):
    # trs holds the <tr> rows (table header already stripped by the caller);
    # pull the ip, port and protocol type out of each row's <td> cells.
    ip_list = []
    for tr in trs:
        tds = tr.find_all('td')
        ip, port, typ = tds[1].text, int(tds[2].text), tds[5].text.lower()
        ip_list.append({'ip': ip, 'port': port, 'typ': typ})
    return ip_list
The IPs we scrape are of wildly varying quality, so we still need to make a real request through each one to test whether it is usable:
def check_ip(ip, proxies_list):
    # Make one request through the proxy; keep it only if the request succeeds.
    try:
        proxy = get_headers_proxy(ip)
        url = 'https://www.ipip.net/'
        r = requests.get(url, headers={'User-Agent': str(UserAgent().random)},
                         proxies=proxy, timeout=5)
        r.raise_for_status()
    except Exception:
        pass
    else:
        proxies_list.append(ip)
Here get_headers_proxy builds the proxy mapping in the standard format that requests expects:
def get_headers_proxy(dic):
    s = dic['typ'] + '://' + dic['ip'] + ':' + str(dic['port'])
    return {'http': s, 'https': s}
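For example, with a made-up entry:

# Placeholder entry, just to show the shape of the mapping.
entry = {'ip': '203.0.113.1', 'port': 8080, 'typ': 'http'}
print(get_headers_proxy(entry))
# {'http': 'http://203.0.113.1:8080', 'https': 'http://203.0.113.1:8080'}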
Then the working IPs get saved to the database:
def save_mysql(ip_list):
    conn = pymysql.connect(host='localhost', user='root', passwd='root',
                           db='python', charset='utf8')
    cursor = conn.cursor()
    cursor.execute('SET NAMES utf8;')
    cursor.execute('SET CHARACTER SET utf8;')
    cursor.execute('SET character_set_connection=utf8;')
    query = 'insert into proxy_ip(ip, port, typ) values (%s, %s, %s)'
    for item in ip_list:
        cursor.execute(query, (item['ip'], item['port'], item['typ']))
    conn.commit()
    cursor.close()
    conn.close()
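This assumes the proxy_ip table already exists. A minimal sketch of a schema that fits the insert above (the column sizes are my own guess, adjust to taste):

import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='root',
                       db='python', charset='utf8')
with conn.cursor() as cursor:
    # Column sizes are assumptions, not taken from the original post.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS proxy_ip (
            ip   VARCHAR(64) NOT NULL,
            port INT         NOT NULL,
            typ  VARCHAR(16) NOT NULL
        ) DEFAULT CHARSET=utf8;
    ''')
conn.commit()
conn.close()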
Next comes the custom thread class:
class GetThread(threading.Thread):
    # One thread per listing page: fetch the page, parse out the proxies,
    # then spin up a checker thread for each proxy.
    def __init__(self, args):
        threading.Thread.__init__(self)
        self.page = args[0]
        self.proxies_list = []

    def run(self):
        url = 'http://www.xicidaili.com/nn/%d' % self.page
        headers = {'User-Agent': UserAgent().random}
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'lxml')
        ip_msg = soup.find_all('tr')[1:]  # skip the table header row
        ip_list = parse_msg(ip_msg)
        threads = []
        for ip in ip_list:
            t = threading.Thread(target=check_ip, args=[ip, self.proxies_list])
            t.start()
            time.sleep(0.1)  # stagger the checker threads a little
            threads.append(t)
        for t in threads:
            t.join()

    def get_proxies_list(self):
        return self.proxies_list
Finally, the entry point kicks off one GetThread per listing page and saves whatever survives the check:

if __name__ == '__main__':
    threads = []
    for i in range(1, 50):
        t = GetThread(args=[i])
        t.start()
        time.sleep(3)  # be gentle: pause between page requests
        threads.append(t)
    for t in threads:
        t.join()
    for t in threads:
        save_mysql(t.get_proxies_list())
The script runs as expected.
PS: after actually testing them, the free proxies really are far too unstable; it looks like paying for proxies is the way to go.