The crawlers written earlier could only fetch a small amount of data, because a single IP that hits a site too frequently gets flagged as an abnormal crawler and blocked. The fix is to access the site through proxy IPs, which in practice means passing a proxy parameter when sending each request. Proxies come in paid and free varieties; the free ones tend to be unstable or outright unusable. The goal here is to scrape free proxy IPs from https://www.xicidaili.com/nn/, verify which ones actually work, and save them to a database, which needs to be refreshed regularly since free proxies die quickly.
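For reference, passing a proxy to requests looks roughly like this; the address below is a placeholder, not a working proxy:

```python
import requests

# Placeholder proxy address, only to illustrate the `proxies` argument.
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:3128',
}
r = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=5)
print(r.text)   # the reported origin IP should be the proxy's, not yours
```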
The modules we need to import:
```python
import pymysql
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import threading
import time
```
Inspecting the page shows that each proxy sits in its own `tr` tag, with the details in the `td` cells.
The plan is to collect the `tr` elements inside the crawler thread and then write a helper that extracts the useful fields into a list of dicts:
```python
def parse_msg(tr_list):
    # Extract ip / port / protocol type from each table row.
    # Note: the result list must not reuse the parameter name, otherwise
    # the input rows are overwritten and nothing gets parsed.
    ip_list = []
    for tr in tr_list:
        tds = tr.find_all('td')
        ip, port, typ = tds[1].text, int(tds[2].text), tds[5].text.lower()
        ip_list.append({'ip': ip, 'port': port, 'typ': typ})
    return ip_list
```
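A quick sanity check with a hand-written row; the column layout (IP in the second cell, port in the third, protocol in the sixth) matches the xicidaili table at the time of writing:

```python
from bs4 import BeautifulSoup

# Hypothetical row mimicking the target table's layout.
html = """
<table>
  <tr><td>CN</td><td>1.2.3.4</td><td>8080</td><td>somewhere</td><td>高匿</td><td>HTTPS</td></tr>
</table>
"""
rows = BeautifulSoup(html, 'lxml').find_all('tr')
print(parse_msg(rows))
# -> [{'ip': '1.2.3.4', 'port': 8080, 'typ': 'https'}]
```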
The quality of the proxies collected this way varies widely, so each one still has to be tested by actually sending a request through it:
```python
def check_ip(ip, proxies_list):
    # Fetch a test page through the proxy; keep the proxy only if the
    # request succeeds within the timeout.
    try:
        proxy = get_headers_proxy(ip)
        url = 'https://www.ipip.net/'
        r = requests.get(url, headers={'User-Agent': UserAgent().random},
                         proxies=proxy, timeout=5)
        r.raise_for_status()
    except Exception:
        pass
    else:
        proxies_list.append(ip)
```
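Every checker thread appends to the same proxies_list; in CPython, list.append is atomic under the GIL, so the shared list needs no explicit lock here.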
where get_headers_proxy builds the proxies dict in the format requests expects:
```python
def get_headers_proxy(dic):
    # Assemble "type://ip:port" and hand it to requests for both schemes.
    s = dic['typ'] + '://' + dic['ip'] + ':' + str(dic['port'])
    return {'http': s, 'https': s}
```
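For a record like {'ip': '1.2.3.4', 'port': 8080, 'typ': 'https'} this returns {'http': 'https://1.2.3.4:8080', 'https': 'https://1.2.3.4:8080'}; requests then picks the entry matching the scheme of the URL being fetched.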
The usable proxies are then written to a MySQL table:
```python
def save_mysql(ip_list):
    conn = pymysql.connect(host='localhost', user='root', passwd='root',
                           db='python', charset='utf8')
    cursor = conn.cursor()
    cursor.execute('SET NAMES utf8;')
    cursor.execute('SET CHARACTER SET utf8;')
    cursor.execute('SET character_set_connection=utf8;')
    query = 'insert into proxy_ip(ip, port, typ) values (%s, %s, %s)'
    for item in ip_list:
        cursor.execute(query, (item['ip'], item['port'], item['typ']))
    cursor.close()
    conn.commit()
    conn.close()
```
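save_mysql assumes a proxy_ip table already exists in the python database. A minimal setup sketch along these lines should work; the column sizes are an assumption, not from the original post:

```python
import pymysql

# Hypothetical one-off setup: create the proxy_ip table the script writes into.
conn = pymysql.connect(host='localhost', user='root', passwd='root',
                       db='python', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS proxy_ip (
            ip   VARCHAR(45) NOT NULL,
            port INT         NOT NULL,
            typ  VARCHAR(10) NOT NULL
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()
conn.close()
```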
Next comes the custom thread class, which crawls one listing page and validates every proxy found on it:
```python
class GetThread(threading.Thread):
    """Crawl one listing page and validate the proxies it contains."""

    def __init__(self, args):
        threading.Thread.__init__(self, args=args)
        self.page = args[0]
        self.proxies_list = []

    def run(self):
        url = 'http://www.xicidaili.com/nn/%d' % self.page
        headers = {'User-Agent': UserAgent().random}
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'lxml')
        ip_msg = soup.find_all('tr')[1:]   # skip the table header row
        ip_list = parse_msg(ip_msg)
        # Check each proxy in its own thread; the short sleep spaces out requests.
        threads = []
        for ip in ip_list:
            t = threading.Thread(target=check_ip, args=[ip, self.proxies_list])
            t.start()
            time.sleep(0.1)
            threads.append(t)
        for t in threads:
            t.join()

    def get_proxies_list(self):
        return self.proxies_list
```
Finally, the entry point starts one crawler thread per listing page, pausing between pages so the site is not flooded, and writes each thread's validated proxies to MySQL:

```python
if __name__ == '__main__':
    threads = []
    for i in range(1, 50):
        t = GetThread(args=[i])
        t.start()
        time.sleep(3)   # pause between listing pages to avoid hammering the site
        threads.append(t)
    for t in threads:
        t.join()
    for t in threads:
        proxies_list = t.get_proxies_list()
        save_mysql(proxies_list)
```
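One thread per listing page, each spawning one thread per proxy check, is a crude but workable design at this scale; the sleep(3) between pages and sleep(0.1) between checks act as simple throttles so the crawler's own IP is not banned while it is harvesting proxies.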
Run results

PS: after actually testing them, the free proxies really are too unstable; for anything serious you still have to pay for a proxy service.