Python爬取拉勾网招聘信息,解决“您操作太频繁,请稍后访问”
- # -*- coding: utf-8 -*-
- #!/usr/bin/env python
- # 爬取拉勾网职位信息
-
- import time,re,json
- import requests
- from urllib.parse import quote
- from lxml import etree
-
class LG(object):
    """Crawl job postings from lagou.com for one city.

    Company short names collected by :meth:`parse` accumulate in
    ``self.data_list``.
    """

    # Browser User-Agent sent with every request made by this class.
    USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
    # Captures the text assigned to ``global.cityNumMap`` in the landing page;
    # get_page() parses that text as JSON.
    _CITY_MAP_PATTERN = re.compile(r'global.cityNumMap =(.*?);')

    def __init__(self, city):
        """Set up URLs, query parameters and a session, then fetch the landing page.

        Note that constructing an instance performs a network request: the
        landing page is downloaded immediately and kept in
        ``self.get_response`` for later use by :meth:`get_page`.

        Args:
            city: City name sent as the ``city`` query parameter of the
                Ajax request issued by :meth:`parse`.
        """
        self.data_list = []
        # Listing URLs look like
        # https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB/p-city_213
        self.start_url = 'https://www.lagou.com/jobs/list_'
        self.url = 'https://www.lagou.com/jobs/positionAjax.json'
        default_headers = {
            'Host': 'www.lagou.com',
            "User-Agent": self.USER_AGENT,
        }
        self.query = dict(
            city=city,
            needAddtionalResult="false",
        )
        self.s = requests.Session()
        self.get_response = requests.get(url=self.start_url, headers=default_headers, timeout=3)

    def get_page(self, city, kd):
        """Open the listing page for ``kd`` in ``city`` and read the page count.

        The request goes through the session, so any cookies the site sets
        are available to the later Ajax calls made by :meth:`parse`.

        Args:
            city: City name; must be a key of the city map found in the
                landing page.
            kd: Search keyword (unquoted).

        Returns:
            A ``(pages, headers)`` tuple: the number of result pages as an
            int, and the header dict (User-Agent plus Referer) to reuse for
            the Ajax requests.

        Raises:
            IndexError: If the city map or the page count cannot be found
                in the downloaded HTML.
            KeyError: If ``city`` is not present in the city map.
        """
        kd = quote(kd)
        found = self._CITY_MAP_PATTERN.findall(self.get_response.text)
        if not found:
            raise IndexError("global.cityNumMap was not found in the landing page")
        # City name -> numeric id used in the listing URL.
        city_numbers = json.loads(found[0])
        city_number = city_numbers[city]
        city_url = f'https://www.lagou.com/jobs/list_{kd}/p-city_{city_number}'
        self.headers = {
            "User-Agent": self.USER_AGENT,
            'Referer': city_url,
        }
        # This request exists to obtain cookies and the total page count.
        response = self.s.get(url=city_url, headers=self.headers, timeout=3)
        html = etree.HTML(response.text)
        page_texts = html.xpath('//div[@class="page-number"]/span[last()]/text()')
        if not page_texts:
            raise IndexError("page count was not found in the listing page")
        pages = page_texts[0]
        print(f"总共获取到{pages}页数据!")
        return (int(pages), self.headers)

    def main(self, kd, page):
        """Build the form data for one Ajax page request.

        Args:
            kd: Search keyword.
            page: 1-based page number.

        Returns:
            A dict with ``first`` (``'true'`` only for page 1), ``pn`` and
            ``kd``; the same dict is also stored in ``self.data``.
        """
        first = 'true' if page == 1 else 'false'
        self.data = dict(first=first, pn=page, kd=kd)
        return self.data

    @staticmethod
    def _extract_positions(payload):
        """Return the list of positions in a decoded Ajax reply, or None if absent."""
        try:
            return payload['content']['positionResult']['result']
        except (KeyError, TypeError):
            return None

    def parse(self, data, headers, page):
        """Request one page of results and record the company short names.

        A reply that carries no ``content.positionResult.result`` (presumably
        an error or rate-limit payload) is reported and skipped, so the names
        collected from other pages are kept instead of the crawl aborting
        with a KeyError.

        Args:
            data: Form data as produced by :meth:`main`.
            headers: Request headers as returned by :meth:`get_page`.
            page: Page number, used only in the skip message.
        """
        # The session attaches its own cookie jar, so no explicit cookies= is needed.
        response = self.s.post(url=self.url, params=self.query, data=data, headers=headers, timeout=3)
        # Pause after every request to keep the request rate low.
        time.sleep(5)
        payload = json.loads(response.text)
        positions = self._extract_positions(payload)
        if positions is None:
            reason = payload.get('msg') if isinstance(payload, dict) else None
            print("第%s页未获取到职位数据:%s" % (page, reason or payload))
            return
        for position in positions:
            self.data_list.append(position['companyShortName'])
        print("共获取到本页数据:%s条" % len(positions))
        print(positions)
-
if __name__ == '__main__':
    # Ask for the search keyword and the city; an empty answer picks the default.
    kd = input("请输入职位名称(默认python):") or 'python'
    city = input("请输入城市(默认北京):") or '北京'

    crawler = LG(city)
    # Load the listing page first: it yields the page count and the headers
    # that every following Ajax request reuses.
    pages, headers = crawler.get_page(city, kd)

    page = 1
    while page <= pages:
        print("******第%s页******" % page)
        form_data = crawler.main(kd, page)
        crawler.parse(form_data, headers, page)
        page += 1

    # Show every company short name gathered across all pages.
    print("展示公司名字:", crawler.data_list)
虽然能爬取到数据,但是爬取到的数据和网页上显示的不一样!