您当前的位置:首页 > 计算机 > 编程开发 > Python

Python爬取拉勾网招聘信息,解决“您操作太频繁,请稍后访问”

时间:12-06 | 来源: | 作者: | 点击数:

Python爬取拉勾网招聘信息,解决“您操作太频繁,请稍后访问”

# -*- coding: utf-8 -*-
#!/usr/bin/env python
# 爬取拉勾网职位信息

import time,re,json
import requests
from urllib.parse import quote
from lxml import etree

class LG(object):
    """Crawler for job postings on lagou.com, scoped to one city.

    Intended call order: construct the object, call ``get_page`` once to
    prime the session cookies and learn the page count, then for every page
    number call ``main`` to build the form payload and ``parse`` to post it.

    Instance attributes:
        data_list: company short names accumulated by ``parse``.
        start_url: landing page URL downloaded in ``__init__``.
        url: JSON endpoint that ``parse`` posts to.
        query: query-string parameters sent with every ``parse`` request.
        s: ``requests.Session`` whose cookies are shared by ``get_page`` and
            ``parse``.
        get_response: response of the landing-page request.
        headers: set by ``get_page``; request headers including the Referer.
        data: set by ``main``; the most recently built form payload.
    """

    # Captures whatever is assigned to ``global.cityNumMap`` in the landing
    # page, up to the first ';' on the same line.
    _CITY_MAP_RE = re.compile(r'global.cityNumMap =(.*?);')

    # Browser User-Agent sent with every request.
    _USER_AGENT = (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
    )

    def __init__(self, city):
        """Create the session and download the landing page.

        Note that constructing the object performs an HTTP GET (3 second
        timeout); any exception raised by ``requests`` propagates.

        Args:
            city: city name, sent as the ``city`` query parameter by ``parse``.
        """
        self.data_list = []
        # A keyword/city listing URL extends this prefix, e.g.
        # https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB/p-city_213
        self.start_url = 'https://www.lagou.com/jobs/list_'
        self.url = 'https://www.lagou.com/jobs/positionAjax.json'
        default_headers = {
            'Host': 'www.lagou.com',
            "User-Agent": self._USER_AGENT,
        }
        # NOTE(review): the spelling "needAddtionalResult" is kept exactly as
        # in the original script; presumably it is what the endpoint expects.
        self.query = dict(
            city=city,
            needAddtionalResult="false",
        )
        self.s = requests.Session()
        # This request is made outside the session, so its cookies are not
        # stored in ``self.s``; only the response text is used later.
        self.get_response = requests.get(url=self.start_url, headers=default_headers, timeout=3)

    @classmethod
    def _city_number(cls, page_text, city):
        """Return the numeric id of ``city`` from the landing page's cityNumMap.

        Raises:
            RuntimeError: if ``page_text`` contains no ``global.cityNumMap``.
            KeyError: if ``city`` is not a key of the map.
        """
        found = cls._CITY_MAP_RE.search(page_text)
        if found is None:
            raise RuntimeError(
                "global.cityNumMap was not found in the landing page; "
                "the site may have returned an unexpected page"
            )
        city_numbers = json.loads(found.group(1))
        if city not in city_numbers:
            raise KeyError(f"city {city!r} is not listed in cityNumMap")
        return city_numbers[city]

    @staticmethod
    def _extract_results(content):
        """Return the list of postings from a decoded positionAjax.json reply.

        Raises:
            RuntimeError: if the reply lacks ``content.positionResult.result``
                (for example a refusal that only carries a ``msg`` field); the
                server's ``msg`` is included in the error when present.
        """
        try:
            return content['content']['positionResult']['result']
        except (KeyError, TypeError) as exc:
            detail = content.get('msg') if isinstance(content, dict) else None
            if not detail:
                detail = repr(content)
            raise RuntimeError(
                f"positionAjax.json returned no position results: {detail}"
            ) from exc

    def get_page(self, city, kd):
        """Open the keyword/city listing page and read the total page count.

        The request goes through ``self.s`` so that any cookies set by the
        listing page are available to the later ``parse`` calls.

        Args:
            city: city name; must be a key of the landing page's cityNumMap.
            kd: search keyword (URL-quoted before being put into the URL).

        Returns:
            ``(pages, headers)``: the page count as an int and the header
            dict (also stored as ``self.headers``) to pass to ``parse``.

        Raises:
            RuntimeError: if the city map or the page count cannot be found
                in the downloaded pages.
            KeyError: if ``city`` is not present in the city map.
        """
        # The city's numeric id is part of the listing URL.
        cityNum = self._city_number(self.get_response.text, city)
        city_url = f'https://www.lagou.com/jobs/list_{quote(kd)}/p-city_{cityNum}'
        self.headers = {
            "User-Agent": self._USER_AGENT,
            'Referer': city_url,
        }
        # Fetch the listing page to obtain cookies and the number of pages.
        response = self.s.get(url=city_url, headers=self.headers, timeout=3)
        html = etree.HTML(response.text)
        page_nodes = []
        if html is not None:
            page_nodes = html.xpath('//div[@class="page-number"]/span[last()]/text()')
        if not page_nodes:
            raise RuntimeError(f"no page count found on listing page {city_url}")
        pages = page_nodes[0]
        print(f"总共获取到{pages}页数据!")
        return (int(pages), self.headers)

    def main(self, kd, page):
        """Build the form payload for one result page.

        Args:
            kd: search keyword.
            page: 1-based page number.

        Returns:
            dict with ``first`` ('true' only for page 1), ``pn`` and ``kd``;
            the same dict is stored as ``self.data``.
        """
        self.data = dict(
            first='true' if page == 1 else 'false',
            pn=page,
            kd=kd,
        )
        return self.data

    def parse(self, data, headers, page, delay=5):
        """Post one page request and collect the company names it returns.

        Args:
            data: form payload as built by ``main``.
            headers: request headers as returned by ``get_page``.
            page: page number; not used by the current implementation.
            delay: seconds to sleep after the request (default 5, the value
                that used to be hard-coded).

        Raises:
            RuntimeError: if the reply does not contain position results.
        """
        response = self.s.post(
            url=self.url,
            params=self.query,
            data=data,
            headers=headers,
            cookies=self.s.cookies,
            timeout=3,
        )
        # Pause between consecutive page requests.
        time.sleep(delay)
        result = self._extract_results(json.loads(response.text))
        self.data_list.extend(item['companyShortName'] for item in result)
        print("共获取到本页数据:%s条" % len(result))
        print(result)

if __name__ == '__main__':
    # An empty answer falls back to the default named in each prompt.
    kd = input("请输入职位名称(默认python):") or 'python'
    city = input("请输入城市(默认北京):") or '北京'

    lg = LG(city)
    # Load the listing page first: it yields the page count and the headers
    # that every page request reuses.
    pages, headers = lg.get_page(city, kd)

    # Request each page in order, building its payload just before posting.
    for page in range(1, pages + 1):
        print("******第%s页******" % (page,))
        lg.parse(lg.main(kd, page), headers, page)

    print("展示公司名字:", lg.data_list)

虽然能爬取到数据,但是爬取到的数据和网页上实际显示的数据不一致!

方便获取更多学习、工作、生活信息请关注本站微信公众号城东书院 微信服务号城东书院 微信订阅号
推荐内容
相关内容
栏目更新
栏目热门
本栏推荐