Python爬取拉勾网招聘信息,解决“您操作太频繁,请稍后访问”
# -*- coding: utf-8 -*-
#!/usr/bin/env python
# 爬取拉勾网职位信息
import time,re,json
import requests
from urllib.parse import quote
from lxml import etree
class LG(object):
    """Scraper for job postings on lagou.com for one city.

    Typical use: construct with a city name, call ``get_page`` to learn how
    many result pages exist, then for each page build the form data with
    ``main`` and fetch it with ``parse``.  Company short names accumulate in
    ``self.data_list``.
    """

    # Browser User-Agent sent with every request (shared by all methods).
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
    )

    def __init__(self, city):
        """Prepare the session and download the generic job-list page.

        Args:
            city: City name; sent as the ``city`` query parameter of the
                Ajax request and later used as the key into the site's
                city-number map.

        Note:
            Performs an HTTP GET (3 second timeout), so construction can
            raise whatever ``requests.get`` raises.
        """
        self.data_list = []
        # Listing pages look like .../jobs/list_<quoted keyword>/p-city_<number>.
        self.start_url = 'https://www.lagou.com/jobs/list_'
        self.url = 'https://www.lagou.com/jobs/positionAjax.json'
        default_headers = {
            'Host': 'www.lagou.com',
            "User-Agent": self.USER_AGENT,
        }
        # NOTE(review): "needAddtionalResult" is reproduced exactly as the
        # original sent it; presumably it matches the parameter name the
        # site expects -- verify before "correcting" the spelling.
        self.query = dict(
            city=city,
            needAddtionalResult="false",
        )
        self.s = requests.Session()
        # Kept on the instance because get_page() extracts the city-number
        # map from this page's text.
        self.get_response = requests.get(
            url=self.start_url, headers=default_headers, timeout=3
        )

    def get_page(self, city, kd):
        """Open the city-specific listing page and read the page count.

        The request goes through the session so that its cookies are
        available to the later Ajax calls.

        Args:
            city: City name; must be a key of the ``global.cityNumMap``
                object embedded in the page fetched by ``__init__``.
            kd: Search keyword (job title); URL-quoted before use.

        Returns:
            A tuple ``(pages, headers)``: the number of result pages as an
            int, and the header dict (User-Agent + Referer) to reuse for
            the Ajax requests.  The headers are also stored on
            ``self.headers``.

        Raises:
            IndexError: the city-number map or the page counter could not
                be found in the downloaded HTML.
            KeyError: ``city`` is not present in the city-number map.
        """
        kd = quote(kd)
        patt = re.compile(r'global.cityNumMap =(.*?);')
        matches = patt.findall(self.get_response.text)
        if not matches:
            raise IndexError(
                "global.cityNumMap was not found in the page at " + self.start_url
            )
        # Maps a city name to the number used in the listing URL.
        dic_cityNum = json.loads(matches[0])
        try:
            cityNum = dic_cityNum[city]
        except KeyError:
            raise KeyError(f"city {city!r} is not listed in cityNumMap") from None
        city_url = f'https://www.lagou.com/jobs/list_{kd}/p-city_{cityNum}'
        self.headers = {
            "User-Agent": self.USER_AGENT,
            'Referer': city_url,
        }
        # Purpose: obtain the session cookies and the number of pages.
        response = self.s.get(url=city_url, headers=self.headers, timeout=3)
        html = etree.HTML(response.text)
        page_texts = html.xpath('//div[@class="page-number"]/span[last()]/text()')
        if not page_texts:
            raise IndexError("page counter was not found in the page at " + city_url)
        pages = page_texts[0]
        print(f"总共获取到{pages}页数据!")
        return (int(pages), self.headers)

    def main(self, kd, page):
        """Build the form data for one page of the Ajax search request.

        Args:
            kd: Search keyword (job title), unquoted.
            page: 1-based page number.

        Returns:
            A dict with ``first`` (``'true'`` only for page 1, otherwise
            ``'false'``), ``pn`` (the page number) and ``kd``.  The same
            dict is stored on ``self.data``.
        """
        self.data = dict(
            first='true' if page == 1 else 'false',
            pn=page,
            kd=kd,
        )
        return self.data

    def parse(self, data, headers, page):
        """POST one page of the search and collect the company names.

        Appends each posting's ``companyShortName`` to ``self.data_list``
        and prints the number of postings followed by the raw result list.

        Args:
            data: Form data as produced by ``main``.
            headers: Request headers as returned by ``get_page``.
            page: Page number; used only in the error message.

        Raises:
            KeyError: the JSON reply has no
                ``content.positionResult.result`` entry, or a posting has
                no ``companyShortName``.
        """
        # The session attaches its own cookie jar to every request it
        # sends, so it does not need to be passed again explicitly.
        response = self.s.post(
            url=self.url,
            params=self.query,
            data=data,
            headers=headers,
            timeout=3,
        )
        # Pause after every request -- presumably to stay under the site's
        # rate limit; confirm before shortening.
        time.sleep(5)
        content = json.loads(response.text)
        try:
            result = content['content']['positionResult']['result']
        except KeyError as err:
            # Surface what the server actually sent (for instance a
            # refusal message) instead of a bare missing-key error.
            raise KeyError(
                f"page {page}: no position list in response: {response.text[:200]}"
            ) from err
        self.data_list.extend(item['companyShortName'] for item in result)
        print(f"共获取到本页数据:{len(result)}条")
        print(result)
if __name__ == '__main__':
    # Interactive entry point: ask for a job keyword and a city, then walk
    # every result page and finally show the collected company names.
    # Blank (or whitespace-only) answers fall back to the defaults named in
    # the prompts.
    kd = input("请输入职位名称(默认python):").strip() or 'python'
    city = input("请输入城市(默认北京):").strip() or '北京'
    lg = LG(city)
    # Load the listing page first: it yields the page count and the headers
    # reused by every following request.
    pages, headers = lg.get_page(city, kd)
    for page in range(1, pages + 1):
        print("******第%s页******" % page)
        data = lg.main(kd, page)
        lg.parse(data, headers, page)
    print("展示公司名字:", lg.data_list)
虽然能爬取到数据,但是爬取到的数据和网页上显示的职位信息不一样!