Process overview:
Fetch the page HTML for a given URL and query parameters, parse the HTML, extract the tag information we need with regular expressions, and finally save the result as a CSV file via a two-dimensional DataFrame. One caveat: Zhaopin (智联招聘) will not serve job data to visitors who are not logged in. So we first log in on the site, locate the Request Headers in the browser's developer tools, copy them, convert them to a dictionary with the copyheaders library, and pass that dictionary as the headers argument of the requests call.
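Before the full script, a minimal sketch of just that header trick (assuming `copyheaders` is installed, e.g. via `pip install copyheaders`): `headers_raw_to_dict` takes the raw header text pasted from the browser as a bytes literal and returns a dictionary that `requests` accepts directly. The headers below are shortened and the cookie is a placeholder; substitute the ones from your own logged-in session.

```python
import requests
from copyheaders import headers_raw_to_dict

# Raw Request Headers, pasted verbatim from the browser's developer tools.
# Must be a bytes literal: copyheaders splits each line on b':'.
raw_headers = b'''
accept: text/html,application/xhtml+xml
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36
cookie: <paste the cookie from your logged-in session here>
'''

headers = headers_raw_to_dict(raw_headers)  # -> {b'accept': b'text/html,...', ...}
resp = requests.get('https://sou.zhaopin.com/',
                    headers=headers,
                    params={'jl': '538', 'kw': '数据分析师', 'p': '1'})  # jl: city code, kw: keyword, p: page
print(resp.status_code)
```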
Code (with comments):
```python
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Created on 2021/3/30 01:13
@Author  : liudong
@Software: PyCharm
"""

import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from copyheaders import headers_raw_to_dict


# Fetch the page HTML for a given URL and query parameters:

def get_html(url, params):

    my_headers = b'''
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-language: zh-CN,zh;q=0.9
cache-control: max-age=0
cookie: x-zp-client-id=448f2b96-6b3a-48e3-e912-e6c8dd73e6cb; adfbid=0; adfbid2=0; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1617108464; sajssdk_2015_cross_new_user=1; sts_deviceid=178832cf3f2680-0b20242883a4a9-6618207c-1296000-178832cf3f3780; sts_sg=1; sts_chnlsid=Unknown; zp_src_url=https%3A%2F%2Fwww.google.com.hk%2F; FSSBBIl1UgzbN7N443S=kc8_mcJe5xsW.UilCMHXpkoWeyQ8te3q7QhYV8Y8aA0Se9k9JJXcnQVvrOJ9NYDP; locationInfo_search={%22code%22:%22538%22%2C%22name%22:%22%E4%B8%8A%E6%B5%B7%22%2C%22message%22:%22%E5%8C%B9%E9%85%8D%E5%88%B0%E5%B8%82%E7%BA%A7%E7%BC%96%E7%A0%81%22}; zp_passport_deepknow_sessionId=a2ea7206sade7641768f38078ea6b45afef0; at=02a0ea392e1d4fd6a4d6003ac136aae0; rt=82f98e13344843d6b5bf3dadf38e8bb2; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221071739258%22%2C%22first_id%22%3A%22178832cf3bd20f-0be4af1633ae3d-6618207c-1296000-178832cf3be4b8%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22178832cf3bd20f-0be4af1633ae3d-6618207c-1296000-178832cf3be4b8%22%7D; urlfrom=121126445; urlfrom2=121126445; adfcid=none; adfcid2=none; ZL_REPORT_GLOBAL={%22//www%22:{%22seid%22:%2202a0ea392e1d4fd6a4d6003ac136aae0%22%2C%22actionid%22:%2243ffc74e-c32e-42ee-ba04-1e24611fecde-cityPage%22}}; LastCity=%E4%B8%8A%E6%B5%B7; LastCity%5Fid=538; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1617111259; zpfe_probe_token=ae612f12s0feb44ac697a7434fe1f22af086; d4d6cd0b4a19fa72b8cc377185129bb7=ab637759-b57a-4214-a915-8dcbc5630065; selectCity_search=538; FSSBBIl1UgzbN7N443T=5pRoIYmxrZTzxVozDFEYjcClKKRpXbK9zf0gYH4zU5AyLqGUMT5fnVzyE0SMv7ZDGFLY0HV8o6iXLPBGBBTJhDhz3TIaQ3omm324Q2m4BSJzD0VgZzesPGIXudf636xQZkuag1QJmdqzgFLv6YPcKq.ukZPymp1IazfsOec5vBcMT9yemSrYb9UBk2XF.rZIeM3mIOBqpNii26kDRzjxHP5TsGLJzWaaZvklHnh61NT4acHPQt3Lq1.w2X4htg9ck.uGhzHt9w954igFEqhLCmggLi9OjPUaiU8TA4yn1oR1T5Qmjm1I5AA0PIu76e0T2u6w2f7thMkv6E7lkoDggrRMta0Z_uVEP3Y1sS8hJw7ycE2PTVtVassRyoN6UuTBHtSZ
sec-ch-ua: "Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"
sec-ch-ua-mobile: ?0
sec-fetch-dest: document
sec-fetch-mode: navigate
sec-fetch-site: same-origin
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36
'''
    my_headers = headers_raw_to_dict(my_headers)  # convert the raw request headers copied from the browser into a dict
    req = requests.get(url, headers=my_headers, params=params)
    req.encoding = req.apparent_encoding  # let requests guess the encoding to avoid mojibake
    return req.text


# Given the base URL and a city code, collect the HTML tag of every job card as a list of strings:

def get_html_list(url, city_num):

    html_list = list()
    for i in range(1, 12):  # result pages 1-11
        params = {'jl': str(city_num), 'kw': '数据分析师', 'p': str(i)}
        soup = BeautifulSoup(get_html(url, params), 'html.parser')
        html_list += soup.find_all(name='a', attrs={'class': 'joblist-box__iteminfo iteminfo'})
    return [str(tag) for tag in html_list]


# From the tag strings above, extract the useful fields of each job posting and return them as rows:

def get_csv(html_list):

    # city = position = company_name = ... = list() would be wrong here: chained assignment
    # binds every name to the SAME list object, so appending through one name changes "all" of them.
    city, position, company_name, company_size, company_type, salary, education, ability, experience = \
        ([] for _ in range(9))  # nine independent lists in one statement

    demand_pat = (r'<li class="iteminfo__line2__jobdesc__demand__item">(.*?)</li> '
                  r'<li class="iteminfo__line2__jobdesc__demand__item">(.*?)</li> '
                  r'<li class="iteminfo__line2__jobdesc__demand__item">(.*?)</li>')
    compdesc_pat = (r'<span class="iteminfo__line2__compdesc__item">(.*?) </span> '
                    r'<span class="iteminfo__line2__compdesc__item">(.*?) </span>')

    for item in html_list:

        m = re.search(demand_pat, item)  # the three <li> items are city / experience / education
        if m:
            city.append(m.group(1))
            experience.append(m.group(2))
            education.append(m.group(3))
        else:
            city.append(' ')
            experience.append(' ')
            education.append(' ')

        m = re.search(r'<span class="iteminfo__line1__jobname__name" title="(.*?)">', item)
        position.append(m.group(1) if m else ' ')

        m = re.search(r'<span class="iteminfo__line1__compname__name" title="(.*?)">', item)
        company_name.append(m.group(1) if m else ' ')

        m = re.search(compdesc_pat, item)  # the two <span> items are company type / company size
        if m:
            company_type.append(m.group(1))
            company_size.append(m.group(2))
        else:
            company_type.append(' ')
            company_size.append(' ')

        m = re.search(r'<p class="iteminfo__line2__jobdesc__salary">([\s\S]*?)<', item)
        salary.append(m.group(1).strip() if m else ' ')

        # welfare/skill tags, joined into one space-separated string
        tags = re.findall(r'<div class="iteminfo__line3__welfare__item">(.*?)</div>', item)
        ability.append(' '.join(tags))

    return list(zip(city, position, company_name, company_size, company_type,
                    salary, education, ability, experience))


if __name__ == '__main__':

    url = 'https://sou.zhaopin.com/'
    citys = {'上海': 538, '北京': 530, '广州': 763, '深圳': 765, '天津': 531, '武汉': 736,
             '西安': 854, '成都': 801, '南京': 635, '杭州': 653, '重庆': 551, '厦门': 682}
    for name, code in citys.items():
        html_list = get_html_list(url, code)
        table = get_csv(html_list)
        df = pd.DataFrame(table, columns=['city', 'position', 'company_name', 'company_size', 'company_type',
                                          'salary', 'education', 'ability', 'experience'])
        df.to_csv(name + '.csv')
```
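One detail in `get_csv` is worth isolating: the commented-out chained assignment would have bound all nine names to a single list object, so appending through any one of them would appear to change them all. A minimal demonstration:

```python
a = b = list()      # chained assignment: a and b are the SAME list object
a.append(1)
print(b)            # [1] -- b "changed" too, because there is only one list

x, y = ([] for _ in range(2))  # a generator of two fresh lists: independent objects
x.append(1)
print(y)            # [] -- y is untouched
```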
Results: the script writes one CSV file per city (e.g. 上海.csv).
Process overview:
First the data are cleaned. Every value in the salary column is a string such as "8千-1.5万" (8K to 15K CNY), which cannot be used for statistics or further processing as-is, so pandas and re (regular expressions) are used to normalize each salary to a single number: the midpoint of its range. For missing salaries I originally planned to substitute the average salary of the posting's city, but since the later visualizations only use means, the substitution would not change the results, so I left them alone. For lack of time, the remaining cleaning (other empty values, outliers, and so on) was not pursued further. Finally, matplotlib and seaborn are used for visualization, yielding five charts in total.
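As a self-contained sketch of that normalization (the helper name `salary_to_midpoint` is mine, not part of the script below):

```python
import re

def salary_to_midpoint(s: str):
    """Turn a range such as '8千-1.5万' into its numeric midpoint (here 11500.0)."""
    units = {'千': 1_000, '万': 10_000}

    def to_number(part: str):
        unit = units.get(part[-1])               # last character carries the unit
        return float(part[:-1]) * unit if unit else None

    m = re.fullmatch(r'(.+)-(.+)', s.strip())
    if not m:
        return None                              # not a range, e.g. '面议' (negotiable)
    low, high = (to_number(p) for p in m.groups())
    if low is None or high is None:
        return None
    return (low + high) / 2

assert salary_to_midpoint('8千-1.5万') == 11500.0
```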
Code (with comments):
```python
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Created on 2021/4/2 01:30
@Author  : liudong
@Software: PyCharm
"""

import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

plt.rcParams['font.sans-serif'] = ['Heiti TC']  # default to a font with Chinese glyphs so plot labels render
plt.rcParams['axes.unicode_minus'] = False      # keep the minus sign from rendering as a box in saved figures

citys = ['上海', '北京', '广州', '深圳', '天津', '武汉', '西安', '成都', '南京', '杭州', '重庆', '厦门']


# Data cleaning: normalize each salary range such as '8千-1.5万' to the numeric midpoint of the range.

def data_clear():

    for city in citys:

        file_name = './' + city + '.csv'
        df = pd.read_csv(file_name, index_col=0)

        for i in range(df.shape[0]):

            s = df.loc[[i], ['salary']].values.tolist()[0][0]
            m = re.search(r'(.*)-(.*)', str(s))

            if m:
                a, b = m.group(1), m.group(2)
                if a[-1] == '千':
                    a = float(a[:-1]) * 1000
                elif a[-1] == '万':
                    a = float(a[:-1]) * 10000
                if b[-1] == '千':
                    b = float(b[:-1]) * 1000
                elif b[-1] == '万':
                    b = float(b[:-1]) * 10000
                df.loc[[i], ['salary']] = (a + b) / 2
            else:
                df.loc[[i], ['salary']] = ''  # not a range (e.g. '面议'): treat as missing

        os.remove(file_name)
        df.to_csv(file_name)  # overwrite the file with the cleaned data


# Bar chart: number of data-analyst postings in each city.

def citys_jobs():

    job_num = list()
    for city in citys:
        df = pd.read_csv('./' + city + '.csv', index_col=0)
        job_num.append(df.shape[0])
    df = pd.DataFrame(list(zip(citys, job_num)))
    df = df.sort_values(1, ascending=False)  # sort cities by posting count, descending
    x = list(df[0])
    y = list(df[1])

    fig = plt.figure(dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    ax.bar(x, y, alpha=0.8)
    ax.set_title('数据分析职位在全国主要城市的数量分布')
    ax.set_ylim(0, 350)

    plt.savefig('./数据分析职位在全国主要城市的数量分布.jpg')
    plt.show()


# Bar chart: average salary of the postings in each city (unit: thousand CNY).

def citys_salary():

    x = citys
    y = list()
    for city in citys:
        df = pd.read_csv('./' + city + '.csv', index_col=0)
        y.append(round(df['salary'].mean() / 1000, 1))

    df = pd.DataFrame(list(zip(x, y)))
    df = df.sort_values(1, ascending=False)
    x = list(df[0])
    y = list(df[1])

    fig = plt.figure(dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    ax.bar(x, y, alpha=0.8)
    ax.set_title('数据分析职位在一些主要城市的薪资分布(单位:千)')
    ax.set_ylim(5, 18)
    for a, b in zip(x, y):
        plt.text(a, b, str(b), horizontalalignment='center', fontsize=10)  # write each bar's value above it

    plt.savefig('./数据分析职位在一些主要城市的薪资分布.jpg')
    plt.show()


# Overall salary distribution across all postings (histogram + kernel density estimate).

def salary_distribute():

    salary_list = list()
    for city in citys:
        df = pd.read_csv('./' + city + '.csv', index_col=0)
        salary_list += list(df['salary'])
    # Missing values come back from pandas as NaN; test them with pd.isnull() rather than
    # `== np.nan`, since NaN never compares equal to anything, itself included.
    salarys = [round(s / 1000, 1) for s in salary_list if not pd.isnull(s)]
    mean = np.mean(salarys)

    plt.figure(dpi=200)
    # distplot is deprecated in newer seaborn releases; histplot(..., kde=True) is the modern equivalent
    sns.distplot(salarys, hist=True, kde=True, kde_kws={'color': 'r', 'lw': 1.5, 'linestyle': '-'})
    plt.axvline(mean, color='r', linestyle=':')
    plt.text(mean, 0.01, '平均薪资: %.1f千' % mean, color='r', horizontalalignment='center', fontsize=15)
    plt.xlim(0, 50)
    plt.xlabel('薪资分布(单位:千)')
    plt.title('数据分析职位整体薪资分布')
    plt.savefig('./数据分析职位整体薪资分布.jpg')
    plt.show()


# Pie chart: share of each education requirement across all postings.

def education_distribute():

    table = pd.DataFrame()
    for city in citys:
        df = pd.read_csv('./' + city + '.csv', index_col=0)
        table = pd.concat([table, df])
    table = pd.DataFrame(pd.value_counts(table['education']))
    table = table.sort_values(['education'], ascending=False)
    x = list(table.index)
    y = list(table['education'])

    fig = plt.figure(dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    explode = (0, 0, 0, 0.2, 0.4, 0.6, 0.8)  # push the smaller slices outward; assumes seven education levels
    ax.axis('equal')
    ax.pie(y, labels=x, autopct='%.1f%%', explode=explode)  # autopct formats each slice's share; %% is a literal %
    ax.set_title('数据分析职位对学历要求的占比')
    ax.legend(x, loc=1)
    plt.savefig('./数据分析职位对学历要求的占比.jpg')
    plt.show()


# Word cloud of the most frequent skill keywords.

def wordfrequence():

    table = pd.DataFrame()
    for city in citys:
        df = pd.read_csv('./' + city + '.csv', index_col=0)
        table = pd.concat([table, df])
    words = ' '.join(s for s in table['ability'] if not pd.isnull(s))

    cloud = WordCloud(
        font_path='/System/Library/Fonts/STHeiti Light.ttc',  # the bundled default font cannot render Chinese
        background_color='white',  # default is black
        max_words=20,              # cap on the number of words displayed
        random_state=1,            # fixed seed, so layout and colors are reproducible
        collocations=False,        # drop bigrams, which can duplicate words in the cloud
        width=1200, height=900     # larger canvas; the default image is small and blurry
    ).generate(words)
    plt.figure(dpi=200)
    plt.imshow(cloud)   # draw the rendered word-cloud image onto the figure
    plt.axis('off')     # a word cloud needs no axes
    plt.savefig('./技能关键词频统计.jpg')
    plt.show()


if __name__ == '__main__':

    data_clear()
    citys_jobs()
    citys_salary()
    salary_distribute()
    education_distribute()
    wordfrequence()
```
Results:
1). Overall salary distribution across the twelve cities (histogram + kernel density estimate):
The distribution is roughly right-skewed, peaked toward the low end with a long tail toward higher pay. Salaries cluster between 8K and 15K, and the mean of 12.4K sits around the middle of the IT industry's pay range.
2). Number of job postings per city (sorted in descending order):
The first-tier cities Beijing, Shanghai, Guangzhou, and Shenzhen top the list, as one would expect, followed by Chengdu, Hangzhou, and Xi'an; these are the cities to shortlist if you are aiming for a second-tier city.
3). Salary distribution per city (sorted in descending order):
The city-by-city salary ranking roughly mirrors the posting counts above, with one surprise: Guangzhou is overtaken by the second-tier cities Hangzhou and Nanjing. Large employers, such as Alibaba in Hangzhou and Suning in Nanjing, may be pulling up local data-analyst salaries, or the sample scraped from Zhaopin may simply be limited; the exact cause would need further investigation.
4). Share of each education requirement in the postings:
Bachelor's degrees dominate at 66.5%, followed by junior college (大专), while all remaining levels together account for only 7.1%. Education requirements for data-analyst roles are therefore comparatively low, which suits, among others, graduates who do not plan to go on to a master's degree.
5). Skill keyword frequencies:
The skills most in demand are Python, SQL, data mining, big data, and data modeling; mastering them adds real competitive strength in a job search.