爬取百度招聘,链接:https://talent.baidu.com/external/baidu/index.html#/social/2
首先我们先打开链接看一下数据是否在响应的html源码中,打开之后我们并没有发现,那它就是动态加载的,
这时我们就发现了,数据是通过ajax进行加载的,我们直接对这个链接发送请求 https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9&recruitType=2&pageSize=10&curPage=1&keyWord=&_=1611122380527,这时我们发现 workPlace是工作地址,curPage是当前页,我们直接开始写代码,这是一个json数据,我们使用json模块进行loads一下转换为字典,方便我们取值
import requests
import json
import csv
class Baidu(object):
    """Crawler for Baidu's social-recruiting job-list API.

    Fetches one page of postings as JSON, extracts the fields of interest
    into dicts, and appends rows through a module-level ``csv_writer``
    created by the ``__main__`` block.
    """

    # CSV column name -> key in each JSON post record.  Centralizing the
    # mapping removes twelve near-identical try/except blocks.
    _FIELDS = (
        ('序列', 'postId'),
        ('工作名称', 'name'),
        ('日期', 'publishDate'),
        ('类型', 'postType'),
        ('工作地址', 'workPlace'),
        ('工作经验', 'workYears'),
        ('人数', 'recruitNum'),
        ('工作类型', 'workType'),
        ('学历', 'education'),
        ('描述', 'serviceCondition'),
        ('工作要求', 'workContent'),
        ('公司名字', 'orgName'),
    )
    # Fields whose text needs HTML/line-break markers stripped.
    _CLEAN_FIELDS = frozenset(('描述', '工作要求'))

    def __init__(self):
        # Page-1 endpoint: workPlace filters locations, curPage selects the
        # page.  The trailing space inside the original literal has been
        # removed — it was sent as part of the URL.
        self.url = ('https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9'
                    '&recruitType=2&pageSize=10&curPage=1&keyWord=&_=1608559330677')
        self.headers = {
            # Trailing space removed here too — it was part of the header value.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/87.0.4280.88 Safari/537.36'
        }

    def get_data(self):
        """GET ``self.url`` and return the raw response.

        A timeout is set so a stalled connection cannot hang the crawl.
        """
        response = requests.get(url=self.url, headers=self.headers, timeout=10)
        print(response.url)
        return response

    @staticmethod
    def _clean(text):
        """Strip ``<br>-`` markers, carriage returns and dashes from *text*.

        Returns None unchanged so missing fields stay None.
        """
        if text is None:
            return None
        return text.replace("<br>-", "").replace('\r', '').replace('-', '')

    def parse_data(self, response):
        """Parse the JSON body into a list of job dicts.

        Keys missing from a post record default to None (the API omits
        some fields for some postings) instead of raising.
        """
        data = json.loads(response.content)
        data_node = []
        for node in data["postList"]:
            temp = {}
            for column, key in self._FIELDS:
                value = node.get(key)
                if column in self._CLEAN_FIELDS:
                    value = self._clean(value)
                temp[column] = value
            data_node.append(temp)
        return data_node

    def save_data(self, data_node):
        """Append one CSV row per job dict.

        NOTE: writes through the module-level ``csv_writer`` global, so it
        must be called while the ``__main__`` block's file is still open.
        """
        for data in data_node:
            csv_writer.writerow([data[column] for column, _ in self._FIELDS])

    def run(self):
        """Fetch, parse and persist page 1."""
        response = self.get_data()
        data_node = self.parse_data(response)
        self.save_data(data_node)
if __name__ == '__main__':
    # Header row for the output CSV.
    columns = ['序列', '工作名称', '日期', '类型', '工作地址', '工作经验', '人数',
               '工作类型', '学历', '描述', '工作要求', '公司名字']
    with open('百度招聘.csv', 'a', newline='', encoding="gb18030") as f:
        # save_data() writes through this module-level writer, so the crawl
        # must run while the file is still open.
        csv_writer = csv.writer(f)
        csv_writer.writerow(columns)
        Baidu().run()
我们现在能拿到一页的数据了,我们再看下链接变化:
第一页:https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9&recruitType=2&pageSize=10&curPage=1&keyWord=&_=1611122380527
第二页:https://talent.baidu.com/baidu/web/httpservice/getPostList?postType=&workPlace=0%2F4%2F7%2F9&recruitType=2&keyWord=&pageSize=10&curPage=2&keyWord2=&_=1611122990856
我们会发现curPage是控制页码的,那我们就直接拼接URL来实现翻页,代码如下:
def next_page(self, start=2, end=243):
    """Crawl result pages ``[start, end)`` by rewriting ``curPage`` in the URL.

    The page bounds are parameters (defaulting to the original hard-coded
    2..242 range) so the crawler can be reused when the number of postings
    changes.  Each page goes through the same fetch -> parse -> save
    pipeline as page 1.
    """
    base_url = ('https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9'
                '&recruitType=2&pageSize=10&curPage={}&keyWord=&_=1608559330677')
    for page in range(start, end):
        self.url = base_url.format(page)
        response = self.get_data()
        self.save_data(self.parse_data(response))
这样就实现了翻页效果,并且把数据存到了csv,方便以后的读取和使用,至于上边为什么try、except,是因为在取值的时候有些为空,抛出异常,那我们就阻止抛出异常!
总代码如下:
import requests
import json
import csv
class Baidu(object):
    """Crawler for Baidu's social-recruiting job-list API.

    Fetches each page of postings as JSON, extracts the fields of interest
    into dicts, and appends rows through a module-level ``csv_writer``
    created by the ``__main__`` block.
    """

    # CSV column name -> key in each JSON post record.  Centralizing the
    # mapping removes twelve near-identical try/except blocks.
    _FIELDS = (
        ('序列', 'postId'),
        ('工作名称', 'name'),
        ('日期', 'publishDate'),
        ('类型', 'postType'),
        ('工作地址', 'workPlace'),
        ('工作经验', 'workYears'),
        ('人数', 'recruitNum'),
        ('工作类型', 'workType'),
        ('学历', 'education'),
        ('描述', 'serviceCondition'),
        ('工作要求', 'workContent'),
        ('公司名字', 'orgName'),
    )
    # Fields whose text needs HTML/line-break markers stripped.
    _CLEAN_FIELDS = frozenset(('描述', '工作要求'))

    def __init__(self):
        # Page-1 endpoint: workPlace filters locations, curPage selects the
        # page.  The trailing space inside the original literal has been
        # removed — it was sent as part of the URL.
        self.url = ('https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9'
                    '&recruitType=2&pageSize=10&curPage=1&keyWord=&_=1608559330677')
        self.headers = {
            # Trailing space removed here too — it was part of the header value.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/87.0.4280.88 Safari/537.36'
        }

    def get_data(self):
        """GET ``self.url`` and return the raw response.

        A timeout is set so a stalled connection cannot hang the crawl.
        """
        response = requests.get(url=self.url, headers=self.headers, timeout=10)
        print(response.url)
        return response

    @staticmethod
    def _clean(text):
        """Strip ``<br>-`` markers, carriage returns and dashes from *text*.

        Returns None unchanged so missing fields stay None.
        """
        if text is None:
            return None
        return text.replace("<br>-", "").replace('\r', '').replace('-', '')

    def parse_data(self, response):
        """Parse the JSON body into a list of job dicts.

        Keys missing from a post record default to None (the API omits
        some fields for some postings) instead of raising.
        """
        data = json.loads(response.content)
        data_node = []
        for node in data["postList"]:
            temp = {}
            for column, key in self._FIELDS:
                value = node.get(key)
                if column in self._CLEAN_FIELDS:
                    value = self._clean(value)
                temp[column] = value
            data_node.append(temp)
        return data_node

    def save_data(self, data_node):
        """Append one CSV row per job dict.

        NOTE: writes through the module-level ``csv_writer`` global, so it
        must be called while the ``__main__`` block's file is still open.
        """
        for data in data_node:
            csv_writer.writerow([data[column] for column, _ in self._FIELDS])

    def next_page(self, start=2, end=243):
        """Crawl result pages ``[start, end)`` by rewriting ``curPage``.

        The page bounds are parameters (defaulting to the original
        hard-coded 2..242 range) so the crawler can be reused when the
        number of postings changes.
        """
        base_url = ('https://talent.baidu.com/baidu/web/httpservice/getPostList?workPlace=0%2F4%2F7%2F9'
                    '&recruitType=2&pageSize=10&curPage={}&keyWord=&_=1608559330677')
        for page in range(start, end):
            self.url = base_url.format(page)
            response = self.get_data()
            self.save_data(self.parse_data(response))

    def run(self):
        """Fetch, parse and persist page 1, then crawl the remaining pages."""
        response = self.get_data()
        data_node = self.parse_data(response)
        self.save_data(data_node)
        self.next_page()
if __name__ == '__main__':
    # Header row for the output CSV.
    columns = ['序列', '工作名称', '日期', '类型', '工作地址', '工作经验', '人数',
               '工作类型', '学历', '描述', '工作要求', '公司名字']
    with open('百度招聘.csv', 'a', newline='', encoding="gb18030") as f:
        # save_data() writes through this module-level writer, so the crawl
        # must run while the file is still open.
        csv_writer = csv.writer(f)
        csv_writer.writerow(columns)
        Baidu().run()
希望大家多多点赞