Scraping Baidu's job board (Baidu Talent). Link: https://talent.baidu.com/external/baidu/index.html#/social/2
First, open the link and check whether the job data appears in the HTML source of the response. It does not, which means the page is loaded dynamically.
Looking at the network requests, we find the data is loaded via Ajax, so we can send a request to that endpoint directly: https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9&recruitType=2&pageSize=10&curPage=1&keyWord=&_=1611122380527. In the query string, workPlace is the work location and curPage is the current page. The response is JSON, so we run it through the json module's loads() to convert it into a dictionary, which makes the values easy to pull out.
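Before writing the full class, a quick probe can confirm the shape of the payload. This is just a minimal sketch; the `postList` key and the `name` field match what the full code below relies on:

```python
import requests

url = ('https://talent.baidu.com/baidu/web/httpservice/getPostList'
       '?workPlace=0%2F4%2F7%2F9&recruitType=2&pageSize=10&curPage=1&keyWord=')
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
data = resp.json()                   # equivalent to json.loads(resp.content)
print(list(data.keys()))             # top-level keys of the payload
print(data["postList"][0]["name"])   # first job title on page 1
```

With the structure confirmed, we can write the full scraper: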
```python
import requests
import json
import csv


class Baidu(object):
    def __init__(self):
        # First page of the job-list API (the "_" parameter is just a cache-busting timestamp)
        self.url = 'https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9' \
                   '&recruitType=2&pageSize=10&curPage=1&keyWord=&_=1608559330677'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/87.0.4280.88 Safari/537.36'
        }

    def get_data(self):
        response = requests.get(url=self.url, headers=self.headers)
        print(response.url)  # handy for checking which page was just requested
        return response

    def parse_data(self, response):
        # The body is JSON: loads() converts it into a dict so we can take values by key
        data = json.loads(response.content)
        data_list = data["postList"]
        data_node = list()
        for node in data_list:
            temp = {}
            # Some fields are missing or None, so every lookup falls back to None
            try:
                temp['序列'] = node["postId"]
            except (KeyError, AttributeError):
                temp['序列'] = None
            try:
                temp['工作名称'] = node["name"]
            except (KeyError, AttributeError):
                temp['工作名称'] = None
            try:
                temp['日期'] = node["publishDate"]
            except (KeyError, AttributeError):
                temp['日期'] = None
            try:
                temp['类型'] = node["postType"]
            except (KeyError, AttributeError):
                temp['类型'] = None
            try:
                temp['工作地址'] = node["workPlace"]
            except (KeyError, AttributeError):
                temp['工作地址'] = None
            try:
                temp['工作经验'] = node["workYears"]
            except (KeyError, AttributeError):
                temp['工作经验'] = None
            try:
                temp['人数'] = node["recruitNum"]
            except (KeyError, AttributeError):
                temp['人数'] = None
            try:
                temp['工作类型'] = node["workType"]
            except (KeyError, AttributeError):
                temp['工作类型'] = None
            try:
                temp['学历'] = node["education"]
            except (KeyError, AttributeError):
                temp['学历'] = None
            try:
                # Strip the HTML line breaks and list dashes from the description text
                temp['描述'] = node["serviceCondition"].replace("<br>-", "").replace('\r', '').replace('-', '')
            except (KeyError, AttributeError):
                temp['描述'] = None
            try:
                temp['工作要求'] = node["workContent"].replace("<br>-", "").replace('\r', '').replace('-', '')
            except (KeyError, AttributeError):
                temp['工作要求'] = None
            try:
                temp['公司名字'] = node["orgName"]
            except (KeyError, AttributeError):
                temp['公司名字'] = None
            data_node.append(temp)
        return data_node

    def save_data(self, data_node):
        # csv_writer is the module-level writer created under __main__ below
        for data in data_node:
            csv_writer.writerow([data['序列'], data['工作名称'], data['日期'], data['类型'], data['工作地址'],
                                 data['工作经验'], data['人数'], data['工作类型'], data['学历'],
                                 data['描述'], data['工作要求'], data['公司名字']])

    def run(self):
        response = self.get_data()
        data_node = self.parse_data(response)
        self.save_data(data_node)


if __name__ == '__main__':
    head = ['序列', '工作名称', '日期', '类型', '工作地址', '工作经验', '人数', '工作类型', '学历',
            '描述', '工作要求', '公司名字']
    # gb18030 keeps the Chinese headers readable when the CSV is opened in Excel
    with open('百度招聘.csv', 'a', newline='', encoding="gb18030") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(head)
        baidu = Baidu()
        baidu.run()
```
-
We can now fetch one page of data. Next, look at how the URL changes from page to page:
Page 1: https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9&recruitType=2&pageSize=10&curPage=1&keyWord=&_=1611122380527
Page 2: https://talent.baidu.com/baidu/web/httpservice/getPostList?postType=&workPlace=0%2F4%2F7%2F9&recruitType=2&keyWord=&pageSize=10&curPage=2&keyWord2=&_=1611122990856
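If the difference isn't obvious at a glance, the standard library can diff the query strings for us. A small sketch using urllib.parse:

```python
from urllib.parse import urlsplit, parse_qs

page1 = 'https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9&recruitType=2&pageSize=10&curPage=1&keyWord=&_=1611122380527'
page2 = 'https://talent.baidu.com/baidu/web/httpservice/getPostList?postType=&workPlace=0%2F4%2F7%2F9&recruitType=2&keyWord=&pageSize=10&curPage=2&keyWord2=&_=1611122990856'

q1 = parse_qs(urlsplit(page1).query, keep_blank_values=True)
q2 = parse_qs(urlsplit(page2).query, keep_blank_values=True)
for key in q1.keys() & q2.keys():          # parameters present in both URLs
    if q1[key] != q2[key]:
        print(key, q1[key], '->', q2[key])  # prints curPage and the "_" timestamp
```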
Comparing the two, curPage controls the page number, so we can simply format the page value into the URL. The code:
```python
    def next_page(self):
        # 242 pages of results at the time of writing; adjust the upper bound as needed
        for i in range(2, 243):
            next_url = 'https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9' \
                       '&recruitType=2&pageSize=10&curPage={}&keyWord=&_=1608559330677'.format(i)
            self.url = next_url
            response = self.get_data()
            data_node = self.parse_data(response)
            self.save_data(data_node)
```
This gives us pagination, and the data is saved to a CSV for convenient reading and use later. As for all the try/except blocks above: some fields come back missing or empty, which raises an exception when we read them, so we catch the exception and store None instead of letting it propagate.
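As an aside, the same fall-back-to-None behavior can be written more compactly with dict.get. This is just an equivalent sketch, not a change to the code above; only the two text fields still need a guard before calling .replace:

```python
def parse_node(node):
    # dict.get returns None when the key is absent, replacing the try/except blocks
    temp = {
        '序列': node.get("postId"),
        '工作名称': node.get("name"),
        '日期': node.get("publishDate"),
        '类型': node.get("postType"),
        '工作地址': node.get("workPlace"),
        '工作经验': node.get("workYears"),
        '人数': node.get("recruitNum"),
        '工作类型': node.get("workType"),
        '学历': node.get("education"),
        '公司名字': node.get("orgName"),
    }
    # These two values may be None, so guard before cleaning the text
    for col, key in (('描述', "serviceCondition"), ('工作要求', "workContent")):
        raw = node.get(key)
        temp[col] = raw.replace("<br>-", "").replace('\r', '').replace('-', '') if raw else None
    return temp
```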
The complete code:
```python
import requests
import json
import csv


class Baidu(object):
    def __init__(self):
        # First page of the job-list API (the "_" parameter is just a cache-busting timestamp)
        self.url = 'https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9' \
                   '&recruitType=2&pageSize=10&curPage=1&keyWord=&_=1608559330677'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/87.0.4280.88 Safari/537.36'
        }

    def get_data(self):
        response = requests.get(url=self.url, headers=self.headers)
        print(response.url)  # handy for checking which page was just requested
        return response

    def parse_data(self, response):
        # The body is JSON: loads() converts it into a dict so we can take values by key
        data = json.loads(response.content)
        data_list = data["postList"]
        data_node = list()
        for node in data_list:
            temp = {}
            # Some fields are missing or None, so every lookup falls back to None
            try:
                temp['序列'] = node["postId"]
            except (KeyError, AttributeError):
                temp['序列'] = None
            try:
                temp['工作名称'] = node["name"]
            except (KeyError, AttributeError):
                temp['工作名称'] = None
            try:
                temp['日期'] = node["publishDate"]
            except (KeyError, AttributeError):
                temp['日期'] = None
            try:
                temp['类型'] = node["postType"]
            except (KeyError, AttributeError):
                temp['类型'] = None
            try:
                temp['工作地址'] = node["workPlace"]
            except (KeyError, AttributeError):
                temp['工作地址'] = None
            try:
                temp['工作经验'] = node["workYears"]
            except (KeyError, AttributeError):
                temp['工作经验'] = None
            try:
                temp['人数'] = node["recruitNum"]
            except (KeyError, AttributeError):
                temp['人数'] = None
            try:
                temp['工作类型'] = node["workType"]
            except (KeyError, AttributeError):
                temp['工作类型'] = None
            try:
                temp['学历'] = node["education"]
            except (KeyError, AttributeError):
                temp['学历'] = None
            try:
                # Strip the HTML line breaks and list dashes from the description text
                temp['描述'] = node["serviceCondition"].replace("<br>-", "").replace('\r', '').replace('-', '')
            except (KeyError, AttributeError):
                temp['描述'] = None
            try:
                temp['工作要求'] = node["workContent"].replace("<br>-", "").replace('\r', '').replace('-', '')
            except (KeyError, AttributeError):
                temp['工作要求'] = None
            try:
                temp['公司名字'] = node["orgName"]
            except (KeyError, AttributeError):
                temp['公司名字'] = None
            data_node.append(temp)
        return data_node

    def save_data(self, data_node):
        # csv_writer is the module-level writer created under __main__ below
        for data in data_node:
            csv_writer.writerow([data['序列'], data['工作名称'], data['日期'], data['类型'], data['工作地址'],
                                 data['工作经验'], data['人数'], data['工作类型'], data['学历'],
                                 data['描述'], data['工作要求'], data['公司名字']])

    def next_page(self):
        # 242 pages of results at the time of writing; adjust the upper bound as needed
        for i in range(2, 243):
            next_url = 'https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9' \
                       '&recruitType=2&pageSize=10&curPage={}&keyWord=&_=1608559330677'.format(i)
            self.url = next_url
            response = self.get_data()
            data_node = self.parse_data(response)
            self.save_data(data_node)

    def run(self):
        response = self.get_data()
        data_node = self.parse_data(response)
        self.save_data(data_node)
        self.next_page()


if __name__ == '__main__':
    head = ['序列', '工作名称', '日期', '类型', '工作地址', '工作经验', '人数', '工作类型', '学历',
            '描述', '工作要求', '公司名字']
    # gb18030 keeps the Chinese headers readable when the CSV is opened in Excel
    with open('百度招聘.csv', 'a', newline='', encoding="gb18030") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(head)
        baidu = Baidu()
        baidu.run()
```
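Since the point of the CSV is reading it back later, here is a quick way to load it, a small sketch with the standard csv module, matching the gb18030 encoding used when writing:

```python
import csv

with open('百度招聘.csv', encoding='gb18030', newline='') as f:
    for row in csv.DictReader(f):          # header row becomes the dict keys
        print(row['工作名称'], row['工作地址'])
```

Note that the scraper opens the file in append mode, so re-running it adds a second header row; delete the old file first if you want a clean dataset.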
If this helped, please give it a like!