Data Analysis of Job Postings Scraped from 51job


Scraping the Data

    # -*- coding:utf-8 -*-
    # @Time : 2020-11-10 20:57
    # @Author : BGLB
    # @Software : PyCharm
    import csv
    from decimal import Decimal
    import hashlib
    import json
    import logging
    import logging.config
    import os
    import random
    import re
    import time
    from urllib import parse

    from lxml import html
    from requests import get

    etree = html.etree

    headers = {
        "Host": "search.51job.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4043.400",
    }
    def time_logging(func):
        """
        Decorator that records a function's running time.
        :param func: the function to time
        :return:
        """
        def wrapper(*args, **kw):
            start_time = time.time()
            func_result = func(*args, **kw)
            runtime = time.time() - start_time
            if runtime < 60:
                runtime = "{:.2f}s".format(runtime)
            elif runtime < 3600:
                runtime = "{:.2f}m".format(runtime / 60)
            else:
                runtime = "{:.2f}h".format(runtime / 3600)
            content = '[{0:^15}] - runtime - [{1:^6}]'.format(func.__name__, runtime)
            logging.info(content)
            with open("./log/runtime.log", 'a', encoding='utf8') as f:
                f.writelines(content + '\n')
            return func_result
        return wrapper
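As a quick illustration, the decorator wraps any callable. `slow_op` below is a hypothetical stand-in, not part of the scraper, and the `./log` directory must already exist (`logging_init` below creates it):

    @time_logging
    def slow_op():
        time.sleep(1.2)  # simulate some work

    slow_op()  # appends a line like "[    slow_op    ] - runtime - [1.20s ]" to ./log/runtime.log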
    def search_job(job_key, page_num=1):
        """
        Search postings in four cities: Shanghai, Guangzhou, Shenzhen and Wuhan
        (the four area codes hard-wired into the URL). One page holds fifty postings.
        """
        url = "https://search.51job.com/list/020000%252C030200%252C040000%252C180200,000000,0000,00,9,99,{},2,{}.html"
        response = get(url.format(parse.quote(
            parse.quote(job_key)), page_num), headers=headers)
        html = response.content.decode(response.encoding)
        eroot = etree.HTML(html)
        table_list = eroot.xpath('//script[@type="text/javascript"]')
        # the search results are embedded as a JS assignment in the third script
        # tag; everything after the first "=" is the JSON payload
        json_str = json.loads(table_list[2].text.split("=", 1)[-1])
        return json_str
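A quick smoke test of `search_job` (a sketch; it assumes the page still embeds its results in the third script tag, which the site may change at any time):

    result = search_job("python")                # first result page for keyword "python"
    print(result["total_page"])                  # total number of result pages
    print(len(result["engine_search_result"]))   # typically 50 postings per page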
    @time_logging
    def parse_job_msg(search_result):
        """
        Parse one page of search results into a list of dicts, one per posting.
        """
        print("-------------parsing page {} of results--------------".format(search_result["curr_page"]))
        job_msg_list = search_result["engine_search_result"]  # 50 postings per page
        csv_list = []
        for job_msg in job_msg_list:
            jobid = job_msg["jobid"]        # job id
            coid = job_msg["coid"]          # company id
            job_href = job_msg["job_href"]  # job detail url
            if job_href.split("/")[2].split(".")[0] == "jobs":
                job_detail_str = get_job_msg(job_href)
            else:
                # postings hosted on 51rz.51job.com come back as HTML; strip the tags
                pattern = re.compile(r'<[^>]+>', re.S)
                job_detail_str = pattern.sub(
                    '', get_51rz_json("job_detail", {"jobid": jobid}))
            job_name = job_msg["job_name"]          # job title
            co_href = job_msg["company_href"]       # company url
            co_name = job_msg["company_name"]       # company name
            money = job_msg["providesalary_text"]   # salary text, normalized to min/max below
            workarea = job_msg["workarea_text"]     # work location
            co_type = job_msg["companytype_text"]   # company type
            update_time = job_msg["issuedate"]      # publish date
            jobwelf = job_msg["jobwelf"]            # benefits
            if money == "" or money is None:
                logging.error("failed to get salary {!r} for {}".format(money, job_href))
                continue
            # 'attribute_text' looks like ['上海-闵行区', '1年经验', '大专', '招2人']
            job_attr = job_msg["attribute_text"]
            job_po_tmp = job_year_tmp = ""
            job_education = "不限"
            for x in job_attr:
                if '招' in x:
                    job_po_tmp = x      # headcount token
                if '经验' in x:
                    job_year_tmp = x    # experience token
                if x in "高中大专本科博士硕士":
                    job_education = x   # education tokens are substrings of this string
            panter = re.compile(r'\d+')
            if len(panter.findall(job_po_tmp)) > 0:
                job_po = int(panter.findall(job_po_tmp)[0])
            else:
                job_po = 0
            if len(panter.findall(job_year_tmp)) > 0:
                job_year = int(panter.findall(job_year_tmp)[0])
            else:
                job_year = 0
            co_people = job_msg["companysize_text"]  # company size
            co_jx = job_msg['companyind_text']       # company business scope
            ss_s = money.split("-")
            if len(ss_s) < 2:
                money_min = money_max = 0
            else:
                money_min, money_max = parse_money(money)
            csv_dict = {
                "职位名称": job_name,
                "最低薪资(千/月)": money_min,
                "最高薪资(千/月)": money_max,
                "招聘人数": job_po,
                "工作经验(年)": job_year,
                "最低学历": job_education,
                "工作地点": workarea.split("-")[0],
                "工作福利": jobwelf,
                "职位描述和详细条件": job_detail_str,
                "公司名称": co_name,
                "公司类型": co_type,
                "公司人数": co_people,
                "公司经营范围": co_jx,
                "职位详情url": job_href,
                "公司详情url": co_href,
                "发布时间": update_time,
            }
            csv_list.append(csv_dict)
        return csv_list
    def parse_money(money_text):
        """
        Normalize a salary string such as '1-1.5万/月' or '10-20万/年' into
        [min, max] in units of thousand CNY per month.
        """
        money_min = money_max = Decimal(0)
        ss_s = money_text.split("-")
        if len(ss_s) >= 2:
            money_min = Decimal(ss_s[0])
            money_max = Decimal(ss_s[1].split("/")[0][:-1])
            if money_text.split('/')[0][-1] == "万":  # 万 -> 千
                money_min = 10 * money_min
                money_max = 10 * money_max
            if money_text.split('/')[-1] == "年":  # per year -> per month
                money_max /= 12
                money_min /= 12
        return [money_min.quantize(Decimal("0.00")), money_max.quantize(Decimal("0.00"))]
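To make the unit handling concrete, here is how a few typical salary strings come out (all values in thousand CNY per month; a '万' suffix multiplies by 10, a '/年' suffix divides by 12):

    parse_money("8-15千/月")   # -> [Decimal('8.00'), Decimal('15.00')]
    parse_money("1-1.5万/月")  # -> [Decimal('10.00'), Decimal('15.00')]
    parse_money("10-20万/年")  # -> [Decimal('8.33'), Decimal('16.67')]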
    def init_params(oparams):
        """
        Re-implementation, reverse-engineered from the site's JS, of the routine
        that builds the signed query parameters for the coapi endpoints.
        """
        key = "tuD&#mheJQBlgy&Sm300l8xK^X4NzFYBcrN8@YLCret$fv1AZbtujg*KN^$YnUkh"
        keyindex = random.randint(4, 40)
        sParams = json.dumps(oparams)
        md5 = hashlib.md5()
        md5.update(("coapi" + sParams + str(key[keyindex:keyindex + 15])).encode("utf8"))
        sign = md5.hexdigest()
        return {
            "key": keyindex,
            "sign": sign,
            "params": sParams
        }
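A quick look at the shape of the signed payload (the key index is random, so the sign changes between calls; the values below are placeholders, not real output):

    signed = init_params({"jobid": "126817691"})
    # -> {"key": <int between 4 and 40>,
    #     "sign": "<32-char md5 hexdigest>",
    #     "params": '{"jobid": "126817691"}'}
    # "key" tells the server which 15-character slice of the shared secret was used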
    @time_logging
    def get_51rz_json(interface: str, params: dict):
        """
        Wrapper around the 51rz.51job.com coapi endpoints.
        job_list         - list jobs
        job_detail       - job detail, e.g. {"jobid": 126817691}
        commpany_list    - list companies
        commpany_detail  - company detail, e.g. {"coid": ...}
        job_condition    - job requirements
        job_time_table   - job schedule
        """
        url_interface = {
            "job_list": "https://coapi.51job.com/job_list.php",
            "job_detail": "https://coapi.51job.com/job_detail.php",
            "commpany_list": "https://coapi.51job.com/co_list.php",
            "commpany_detail": "https://coapi.51job.com/job_company.php",
            "job_condition": "https://coapi.51job.com/job_condition.php",
            "job_time_table": "https://coapi.51job.com/job_schedule.php",
        }
        header = {
            "Host": "coapi.51job.com",
            "Referer": "https://51rz.51job.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
        }
        url = url_interface[interface]
        res = get(url, params=init_params(params), headers=header)
        res_str = res.content.decode("utf8")
        filename = "{}".format(interface)
        for x in params.values():
            filename += "_" + str(x)
        # the endpoint returns JSONP; strip the callback wrapper to get the JSON
        res_json = res_str.split("(", 1)[-1][0:-1]
        res_dict = dict(json.loads(res_json))
        res_dict["html_url"] = res.url
        write_file(filename, "json", res_dict)
        return res_dict["resultbody"]["jobinfo"]
    @time_logging
    def get_job_msg(job_detail_url):
        """
        Fetch the job description and detailed requirements from a job-detail page.
        """
        try:
            job_detail_res = get(job_detail_url, headers=headers)
            html = job_detail_res.content
            eroot = etree.HTML(html)
            job_name = eroot.xpath(
                "/html/body/div[3]/div[2]/div[2]/div/div[1]/h1[1]/text()")[0]
            co_name = eroot.xpath(
                '/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title')[0]
            jobid = eroot.xpath('//*[@id="hidJobID"]/@value')[0]
            _content = eroot.xpath(
                '//div[@class="tCompany_center clearfix"]//text()')
        except Exception as e:
            logging.error("failed to parse [{}] - {}".format(job_detail_url, e))
            return ""
        filename = "{0}-{1}-{2}".format(job_name, co_name, jobid).replace(
            "(", "").replace(")", "").replace("/", "_").replace("*", "")
        # write_file(filename, "html", _content)
        # job description and detailed requirements
        job_msg_str = eroot.xpath("//div[@class='bmsg job_msg inbox']/p/text()")
        # simple cleanup: collapse all whitespace inside each fragment
        for i in range(len(job_msg_str)):
            job_msg_str[i] = "".join(job_msg_str[i].split())
        return "".join(job_msg_str)
    def write_file(filename, fileext, datas):
        """
        Append data to ./data/<ext>/<filename>.<ext>, creating directories as needed.
        """
        fileext_ignore = ["html", "log"]  # extensions whose writes are not echoed to stdout
        if not os.path.exists("./data/{}".format(fileext)):
            os.makedirs("./data/{}".format(fileext))
        filenames = "{0}.{1}".format(filename, fileext).replace(
            "/", "_").replace("\\", "_")
        filepath = "./data/{0}/{1}".format(fileext, filenames)
        is_write = os.path.exists(filepath)  # True if the file (and its csv header) already exists
        try:
            with open(filepath, 'a', encoding="utf8", newline="") as f:
                if fileext not in fileext_ignore:
                    print("writing file [{0}].....".format(filenames))
                if fileext == "csv":
                    if 'dict' in str(type(datas[0])):
                        header = [x for x in datas[0].keys()]
                        # the field names map each dict value to its column
                        writer = csv.DictWriter(f, fieldnames=header)
                        if not is_write:
                            writer.writeheader()  # write the header row only once
                        writer.writerows(datas)   # write the data rows
                    elif 'list' in str(type(datas[0])):
                        writer = csv.writer(f)
                        writer.writerows(datas)
                    else:
                        csv.writer(f).writerows(datas)
                elif fileext == 'json':
                    json.dump(datas, f, ensure_ascii=False)
                else:
                    f.writelines(datas)
                if fileext not in fileext_ignore:
                    print("[{}] - wrote {} records".format(filenames, len(datas)))
                    logging.info(
                        "file [{0}] written, {1} records".format(filenames, len(datas)))
        except Exception as e:
            logging.error(
                "error writing file [{}]: {}, data: {}, length: {}".format(filenames, e, datas, len(datas)))
    @time_logging
    def parse_key(key, pages=1):
        """
        Crawl and process the postings for one search keyword.
        :param key: search keyword
        :param pages: number of pages to crawl
        :return:
        """
        search_job_dict = search_job(key)
        try:
            total_page = int(search_job_dict["total_page"])
        except TypeError as e:
            total_page = 0
            print("no postings found for {}, try another keyword".format(key))
            logging.error("no postings found for {}, try another keyword, {}".format(key, e))
        print("----------------keyword {}: {} pages of postings in total----------------".format(key, total_page))
        if pages > total_page:
            pages = total_page
        for i in range(1, pages + 1):
            try:
                job_json = search_job(key, i)
                job_data = parse_job_msg(job_json)
                write_file("{}_{}".format(key, i), "json", job_json)
                write_file(key + "相关岗位", "csv", job_data)
            except Exception as e:
                logging.error("error on page {} for {} - {}".format(i, key, e))
        logging.info("finished crawling postings for {0}!".format(key))
    @time_logging
    def main(key_list, count):
        """
        :param key_list: list of search keywords
        :param count: pages to crawl per keyword
        :return:
        """
        logging_init("./config/logconfig.json")
        for key in key_list:
            print("-----------------searching postings for {}------------------".format(key))
            parse_key(key, count)
        rename_dir()  # rename the data folder to data_<timestamp> so the next run starts clean
        print("all keywords crawled!")
        logging.info("all keywords crawled!")
    def rename_dir():
        if os.path.exists("./data"):
            try:
                os.rename("./data", "./data_{}".format(int(time.time())))
            except OSError as e:
                logging.error("no permission to rename the data folder: {}".format(e))
                print("-------failed to rename the data folder; rename it manually to keep the next crawl from overwriting it--------")
    def logging_init(path, default_level=logging.INFO):
        """
        Initialize logging.
        :param path: path to the logging config file
        :param default_level: default level if no config file exists
        :return:
        """
        if not os.path.exists("./log"):
            os.makedirs("./log")
        if os.path.exists(path):
            with open(path, "r") as f:
                config = json.load(f)
                logging.config.dictConfig(config)
            logging.getLogger("runtime")
        else:
            logging.basicConfig(level=default_level)
            logging.info("{} does not exist, using the default logging config!".format(path))
    if __name__ == '__main__':
        keywords = ["python", "java", "c#", "web前端", "c/c++", "linux"]
        pages = 300
        main(keywords, pages)

Comprehensive Analysis of the Job Data

Reading All the Job Postings

    import pandas as pd

    data_dir = "data_1605456747"
    data_read_python = pd.read_csv(
        "./{}/csv/python相关岗位.csv".format(data_dir), encoding="utf8")
    data_read_csharp = pd.read_csv(
        "./{}/csv/c#相关岗位.csv".format(data_dir), encoding="utf8")
    data_read_c = pd.read_csv(
        "./{}/csv/c_c++相关岗位.csv".format(data_dir), encoding="utf8")
    data_read_linux = pd.read_csv(
        "./{}/csv/linux相关岗位.csv".format(data_dir), encoding="utf8")
    data_read_java = pd.read_csv(
        "./{}/csv/java相关岗位.csv".format(data_dir), encoding="utf8")
    data_read_web = pd.read_csv(
        "./{}/csv/web前端相关岗位.csv".format(data_dir), encoding="utf8")
    data_read_python["岗位类型"] = "python"
    data_read_csharp["岗位类型"] = "c#"
    data_read_c["岗位类型"] = "c/c++"
    data_read_linux["岗位类型"] = "linux"
    data_read_java["岗位类型"] = "java"
    data_read_web["岗位类型"] = "web"
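The six reads above differ only in file name and category label, so a mapping plus a loop loads the same frames more compactly (a sketch; `files` maps each CSV name fragment to its 岗位类型 label):

    files = {"python": "python", "c#": "c#", "c_c++": "c/c++",
             "linux": "linux", "java": "java", "web前端": "web"}
    frames = []
    for fname, label in files.items():
        df = pd.read_csv("./{}/csv/{}相关岗位.csv".format(data_dir, fname), encoding="utf8")
        df["岗位类型"] = label  # tag each frame with its job category
        frames.append(df)
    # pd.concat(frames) then matches the merge step in the next section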

Merging the Data

    # merge the six frames and drop duplicate rows
    data_sourse = pd.concat([data_read_c, data_read_csharp, data_read_java,
                             data_read_linux, data_read_python, data_read_web]).drop_duplicates()
    len(data_sourse)
    data_sourse.columns

    Index(['职位名称', '最低薪资(千/月)', '最高薪资(千/月)', '招聘人数', '工作经验(年)', '最低学历', '工作地点',
           '工作福利', '职位描述和详细条件', '公司名称', '公司类型', '公司人数', '公司经营范围', '职位详情url',
           '公司详情url', '发布时间', '岗位类型'],
          dtype='object')

Data Preprocessing

Handling Nulls and Missing Values

  • The 工作福利 (benefits) and 职位描述 (job description) columns contain missing or empty values, but they are left as-is: they have little effect on the later analysis.
    # check for nulls and missing values
    data_sourse.isna()
    # the benefits and job-description columns have missing or empty values,
    # but they are left as-is; the impact on the later analysis is small
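`isna()` returns a Boolean frame that is hard to eyeball at this row count; summing it gives a per-column tally of the gaps (same frame, one extra call):

    data_sourse.isna().sum()  # missing-value count per column
    # only 工作福利 and 职位描述和详细条件 are expected to be non-zero here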

Handling Duplicates

Duplicates were already removed when the data was read in.

  • Method: delete them outright (see the sketch after this list).
  • Reason: the crawler can write the same rows more than once.
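A minimal sketch of the same check done explicitly on the merged frame:

    data_sourse.duplicated().sum()               # how many fully identical rows remain
    data_sourse = data_sourse.drop_duplicates()  # what the merge step above already did
    # deduplicating on the detail URL alone would be stricter:
    # data_sourse.drop_duplicates(subset=["职位详情url"])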

Handling Outliers

  • Salaries legitimately vary with education level and job type, so high or low salaries are not in themselves outliers.
    import matplotlib.pyplot as plt
    import numpy as np

    plt.rcParams['font.sans-serif'] = ["SimHei"]  # keep Chinese labels from rendering as boxes
    plt.rcParams['axes.unicode_minus'] = False
    %matplotlib inline
    # columns that might contain abnormal values
    abnormal_col = ["最低薪资(千/月)", "最高薪资(千/月)", "招聘人数", "工作经验(年)"]
    data_sourse.boxplot(abnormal_col)  # boxplots for outlier inspection

    <matplotlib.axes._subplots.AxesSubplot at 0xbc6f6d8>

(figure: boxplots of minimum salary, maximum salary, headcount, and required experience)

    rule_drop = (data_sourse["最低薪资(千/月)"] == 0) | (
        data_sourse["最高薪资(千/月)"] == 0)  # rows where the salary failed to parse
    data = data_sourse.drop(data_sourse[rule_drop].index, axis=0, inplace=False)
    len(data)

    70679
    # headcount and experience reflect the posting itself, so they are left alone
    data.boxplot(abnormal_col[0])  # boxplot of minimum salary

    <matplotlib.axes._subplots.AxesSubplot at 0xea4d978>

(figure: boxplot of minimum salary after dropping zero-salary rows)

    data.boxplot(abnormal_col[1])  # boxplot of maximum salary

    <matplotlib.axes._subplots.AxesSubplot at 0x10a52eb8>

(figure: boxplot of maximum salary after dropping zero-salary rows)

Resetting the Index

    data = data.reset_index()

    data.boxplot(abnormal_col[:2])

    <matplotlib.axes._subplots.AxesSubplot at 0x11ee7c18>

(figure: boxplots of minimum and maximum salary after the index reset)

Plotting the Charts

Preparing the Chart Data

    data_chart = data.loc[:, ["最低薪资(千/月)", "最高薪资(千/月)", "招聘人数", "工作经验(年)",
                              "最低学历", "工作地点", "公司类型", "岗位类型", "发布时间"]]
    len(data_chart)

    70679

Data Overview

    # group by education level, company type, location, and job category; count each group
    chart_pie_data0 = data_chart.groupby("最低学历").count()
    chart_pie_data1 = data_chart.groupby("公司类型").count()
    chart_pie_data2 = data_chart.groupby("工作地点").count()
    chart_pie_data3 = data_chart.groupby("岗位类型").count()
    fig, axes = plt.subplots(2, 2, figsize=(30, 15))
    fig.patch.set_facecolor('#384151')
    ax0 = axes[0, 0]
    ax1 = axes[0, 1]
    ax2 = axes[1, 0]
    ax3 = axes[1, 1]
    ax0.pie(chart_pie_data0.iloc[:, 0],  # the per-group counts
            labels=chart_pie_data0.index,
            explode=[0 for x in range(len(chart_pie_data0.index))],  # wedge offsets
            autopct='%3.1f%%',    # percentage format
            pctdistance=0.7,
            textprops=dict(color='w', fontsize=15),  # label font
            labeldistance=1.1,    # radial distance of the wedge labels
            startangle=-0)        # initial angle
    ax0.set_title(label="学历要求占比图", loc='center', rotation=0, fontsize=20)
    ax0.legend(labels=chart_pie_data0.index, loc="upper left", fontsize=15)
    ax1.pie(chart_pie_data1.iloc[:, 0],
            labels=chart_pie_data1.index,
            explode=[0.1, 0.5, 0.5, 0.3, 0, 0.8, 0.1, 0.1, 0.2, 0.1, 0.1],  # manually tuned offsets
            autopct='%3.1f%%',
            pctdistance=0.7,
            textprops=dict(color='w', fontsize=15),
            startangle=-50)
    ax1.set_title(label="公司类型占比图", loc='center', rotation=0, fontsize=20)
    ax1.legend(loc="upper left", fontsize=10)
    ax2.pie(chart_pie_data2.iloc[:, 0],
            labels=chart_pie_data2.index,
            explode=[0 for x in range(len(chart_pie_data2.index))],
            autopct='%3.1f%%',
            pctdistance=0.7,
            textprops=dict(color='w', fontsize=20),
            labeldistance=1.1,
            startangle=-50)
    ax2.set_title("工作地点占比图", loc='center', rotation=0, fontsize=20)
    ax2.legend(loc="lower right", fontsize=13)
    ax3.pie(chart_pie_data3.iloc[:, 0],
            labels=chart_pie_data3.index,
            explode=[0 for x in range(len(chart_pie_data3.index))],
            autopct='%3.1f%%',
            pctdistance=0.7,
            textprops=dict(color='w', fontsize=20),
            labeldistance=1.1,
            startangle=-50)
    ax3.set_title("岗位类型占比图", loc='center', rotation=0, fontsize=20)
    ax3.legend(loc="lower right", fontsize=13)
    plt.show()

(figure: four pie charts: education requirements, company types, work locations, job categories)

    chart_bar_data_area = data_chart.groupby("工作地点")
    # per location: posting count by job category, by company type, and mean salary by category
    chart_bar_data_area_gw = chart_bar_data_area.apply(
        lambda item: item.groupby("岗位类型").count())
    chart_bar_data_area_gs = chart_bar_data_area.apply(
        lambda item: item.groupby("公司类型").count())
    chart_bar_data_area_gz = chart_bar_data_area.apply(
        lambda item: item.groupby("岗位类型").mean())
    chart_bar_data_area_gz = (
        chart_bar_data_area_gz["最低薪资(千/月)"] + chart_bar_data_area_gz["最高薪资(千/月)"]) / 2
    # initialize the canvases and set the background color
    fig, ax0 = plt.subplots(figsize=(40, 10))
    fig.patch.set_facecolor('#384151')
    fig, ax1 = plt.subplots(figsize=(40, 10))
    fig.patch.set_facecolor('#384151')
    fig, ax2 = plt.subplots(figsize=(40, 10))
    fig.patch.set_facecolor('#384151')
    # pivot the inner group level into columns
    chart_bar_data_area_gw_count = chart_bar_data_area_gw.iloc[:, 1].unstack(
        level=1)
    x = chart_bar_data_area_gw_count.index
    y = chart_bar_data_area_gw_count.columns
    x_index = [i for i in range(len(x))]
    for j in y:
        for i in range(len(x)):
            x_index[i] += 0.1  # shift each category's bars sideways
        ax0.bar(x_index, chart_bar_data_area_gw_count[j], width=0.05, label=j)
    ax0.legend(fontsize=30)
    ax0.set_xticks([i + 0.35 for i in range(len(x) + 1)])
    ax0.set_xticklabels(x, fontsize=30)
    ax0.set_title(label="工作地点-工作岗位数量-分布图", loc='center', rotation=0, fontsize=40)
    ax0.set_xlabel("工作地区", fontsize=30)
    ax0.set_ylabel("各岗位的数量", fontsize=30)
    # pivot the inner group level into columns
    chart_bar_data_area_gs_count = chart_bar_data_area_gs.iloc[:, 1].unstack(
        level=1)
    y = chart_bar_data_area_gs_count.index
    x = chart_bar_data_area_gs_count.columns
    x_index = [i for i in range(len(x))]
    for j in y:
        for i in range(len(x)):
            x_index[i] += 0.1
        ax1.bar(x_index, chart_bar_data_area_gs_count.loc[j], width=0.05, label=j)
    ax1.legend(fontsize=30)
    ax1.set_xticks([i + 0.5 for i in range(len(x))])
    ax1.set_xticklabels(x, fontsize=30, rotation=-20)
    ax1.set_xlabel("公司类型", fontsize=30)
    ax1.set_ylabel("各地区的公司数量", fontsize=30)
    ax1.set_title(label="公司类型数量-工作地点--分布图", loc='center', rotation=0, fontsize=40)
    chart_bar_data_area_gz_mean = chart_bar_data_area_gz.unstack(level=1)
    x = chart_bar_data_area_gz_mean.index
    y = chart_bar_data_area_gz_mean.columns
    x_index = [i for i in range(len(x))]
    for j in y:
        for i in range(len(x)):
            x_index[i] += 0.1
        ax2.bar(x_index, chart_bar_data_area_gz_mean[j], width=0.05, label=j)
    ax2.legend(y, fontsize=30)
    ax2.set_xticks([i + 0.35 for i in range(len(x) + 1)])
    ax2.set_xticklabels(x, fontsize=30)
    ax2.set_xlabel("工作地区", fontsize=30)
    ax2.set_ylabel("各岗位的平均薪资(千/月)", fontsize=30)
    ax2.set_title(label="工作地点-平均薪资-分布图", loc='center', rotation=0, fontsize=40)
    plt.show()

(figure: number of postings per job category, by location)

(figure: number of companies per company type, by location)

(figure: average salary per job category, by location)

    # index the chart data by each posting's publish timestamp
    date_index = pd.to_datetime(data_chart.loc[:, "发布时间"].values)
    chart_date_data = data_chart.set_index(date_index)
    fig = plt.figure('', figsize=(20, 5))
    fig.patch.set_facecolor('#384151')
    plt.margins(0)
    plt.title("时间-薪资--趋势图")
    plt.plot(chart_date_data["最低薪资(千/月)"])
    plt.plot(chart_date_data["最高薪资(千/月)"])
    plt.legend(["最低薪资(千/月)", "最高薪资(千/月)"])
    plt.xlabel("数据抓取时间")
    plt.ylabel("薪资")
    plt.show()

(figure: minimum and maximum salary over posting time)
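The raw per-posting series above is very spiky. Since `chart_date_data` is indexed by timestamp, resampling to a daily mean gives a smoother trend; this is a sketch, not part of the original notebook:

    daily = chart_date_data[["最低薪资(千/月)", "最高薪资(千/月)"]].resample("D").mean()
    daily.plot(figsize=(20, 5), title="时间-薪资--趋势图(日均)")  # daily average of the same two series
    plt.show()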

How Work Location Affects Hiring

    data_group_area = data_chart.groupby("工作地点")
    data_group_area_mean = data_group_area.mean()
    chart_name = "平均指标-地区-柱状图"
    fig = plt.figure(chart_name, figsize=(20, 6))
    fig.patch.set_facecolor('#384151')
    plt.title(chart_name)
    x = data_group_area_mean.index
    y1 = data_group_area_mean["最低薪资(千/月)"]
    y2 = data_group_area_mean["最高薪资(千/月)"]
    y3 = data_group_area_mean["招聘人数"]
    y4 = data_group_area_mean["工作经验(年)"]
    plt.xlabel("工作地点")
    plt.ylabel("平均数据")
    width = 0.05
    plt.bar([i + 0.1 for i in range(len(x))], y1, width=width)
    plt.bar([i + 0.2 for i in range(len(x))], y2, width=width)
    plt.bar([i + 0.3 for i in range(len(x))], y3, width=width)
    plt.bar([i + 0.4 for i in range(len(x))], y4, width=width)
    plt.legend(data_group_area_mean.columns, loc="upper right")
    plt.xticks([i + .25 for i in range(len(x) + 1)], x)
    plt.show()

(figure: average metrics by location, bar chart)

How Work Experience Affects Hiring

    data_group_bg = data_chart.groupby("工作经验(年)")
    data_group_bg_mean = data_group_bg.mean()
    fig = plt.figure('', figsize=(20, 10))
    fig.patch.set_facecolor('#384151')
    chart_name = "工作经验-线型图"
    plt.title(chart_name)
    x = data_group_bg_mean.index
    y1 = data_group_bg_mean["最低薪资(千/月)"]
    y2 = data_group_bg_mean["最高薪资(千/月)"]
    y3 = data_group_bg_mean["招聘人数"]
    plt.xlabel("工作经验(年)")
    plt.ylabel("平均指标")
    plt.plot(x, y1)
    plt.plot(x, y2)
    plt.plot(x, y3)
    plt.legend(data_group_area_mean.columns, loc="upper right")
    plt.show()

(figure: average metrics by years of experience, line chart)

How Company Type Affects Hiring

    # group by company type, then take each group's mean
    data_group_gs = data_chart.groupby("公司类型").mean()
    chart_name = "公司类型-柱状图"
    fig = plt.figure(chart_name, figsize=(20, 8))
    fig.patch.set_facecolor('#384151')
    plt.title(chart_name)
    x = data_group_gs.index
    y1 = data_group_gs["最低薪资(千/月)"]
    y2 = data_group_gs["最高薪资(千/月)"]
    y3 = data_group_gs["招聘人数"]
    y4 = data_group_gs["工作经验(年)"]
    plt.xlabel("公司类型")
    plt.ylabel("平均指标")
    width = 0.05
    plt.bar([i + 0.1 for i in range(len(x))], y1, width=width)
    plt.bar([i + 0.2 for i in range(len(x))], y2, width=width)
    plt.bar([i + 0.3 for i in range(len(x))], y3, width=width)
    plt.bar([i + 0.4 for i in range(len(x))], y4, width=width)
    plt.legend(data_group_area_mean.columns, loc="upper right")
    plt.xticks([i + .25 for i in range(len(x) + 1)], x)
    plt.show()

(figure: average metrics by company type, bar chart)

How Education Level Affects Hiring

    data_group_bg = data_chart.groupby("最低学历").mean()
    chart_name = "学历-柱状图"
    fig = plt.figure(chart_name, figsize=(20, 8))
    fig.patch.set_facecolor('#384151')
    plt.title(chart_name)
    x = data_group_bg.index
    y1 = data_group_bg["最低薪资(千/月)"]
    y2 = data_group_bg["最高薪资(千/月)"]
    y3 = data_group_bg["招聘人数"]
    y4 = data_group_bg["工作经验(年)"]
    plt.xlabel("学历")
    plt.ylabel("平均指标")
    width = 0.05
    plt.bar([i + 0.1 for i in range(len(x))], y1, width=width)
    plt.bar([i + 0.2 for i in range(len(x))], y2, width=width)
    plt.bar([i + 0.3 for i in range(len(x))], y3, width=width)
    plt.bar([i + 0.4 for i in range(len(x))], y4, width=width)
    plt.legend(data_group_area_mean.columns, loc="upper right")
    plt.xticks([i + .25 for i in range(len(x) + 1)], x)
    plt.show()

(figure: average metrics by education level, bar chart)

Drawing Word Clouds

Preparing the Data

    data_word = data[["职位名称", "职位描述和详细条件", "工作福利"]]
    data_word = data_word.dropna()
    data_word

Wrapping jieba Segmentation and Word-Cloud Generation

    from wordcloud import WordCloud
    import jieba
    import re

    def parse_text(text_list):
        """
        1. Strip digits, punctuation, whitespace and line breaks via regex.
        2. Stop words are removed afterwards in seg_sentence.
        """
        re_list = [re.compile(r"[^a-zA-Z]\d+"), re.compile(r"\s+"),
                   re.compile(r"[^0-9A-Za-z\u4e00-\u9fa5]")]
        text = "".join(text_list)
        for reg in re_list:
            text = reg.sub("", text)  # apply each pattern in turn to the joined text
        return text

    def stopwordslist(filepath):
        """Read the stop-word list (tokens that carry no meaning)."""
        stopwords = [line.strip() for line in open(
            filepath, 'r', encoding="utf8").readlines()]
        return stopwords

    def seg_sentence(sentence, ignorelist):
        """Remove stop words and return the jieba-segmented string."""
        stopwords = stopwordslist('./baidu_tingyong.txt')
        for stop in stopwords + ignorelist:
            sentence = sentence.replace(stop, "")
        outstr = ''
        sentence_seged = jieba.cut(sentence.strip())
        for word in sentence_seged:
            if word != '\t':
                outstr += word
                outstr += " "
        return outstr

    def cut_words(text_list, ignore_list=[]):
        text = list(set(text_list))  # drop duplicate documents before joining
        return seg_sentence(parse_text(text), ignore_list)

    def word_img(cut_words, title, ignorelist):
        fig = plt.figure("", figsize=(15, 10))
        fig.patch.set_facecolor('#384151')
        wordcloud = WordCloud(
            font_path="./fonts/simkai.ttf",  # a font that can render Chinese
            width=1000, height=600,
            background_color='white',
            # max_font_size=100,
            # min_font_size=20,
            # max_words=300,
            # font_step=2,
            stopwords={x for x in ignorelist}
        )
        plt.title(title, fontsize=20)
        plt.imshow(wordcloud.generate(cut_words), interpolation='bilinear')
        plt.axis("off")

Job-Requirements Word Cloud

    text_dec = data_word["职位描述和详细条件"].values
    # custom stop words for this cloud
    ignore_dec_list = ["工作", "能力", "开发", "熟悉", "优先", "精通", "熟练", "负责", "公司", "岗位职责", "用户", "技术", "沟通",
                       "软件", "以上", "学历", "专业", "产品", "计算机", "项目", "具备", "相关", "服务", "研发", "管理", "参与", "精神",
                       "分析", "岗位", "理解", "需求", "独立", "解决", "业务", "文档", "数据", "编写", "大专", "本科", "团队", "合作", "科",
                       "上", "协调", "详细", "设计", "职", "求", "基础", "扎实", "模块", "系统", "学习", "工", "具", "平台", "知识",
                       "包括", "压力", "内容", "研究", "周末", "双", "休", "描述", "国家", "节假日", "法定", "方法", "主流",
                       "于都", "年", "验", "控制", "流程"]
    text_dec = cut_words(text_dec, ignore_dec_list)

    Building prefix dict from the default dictionary ...
    Loading model from cache F:\bglb\AppData\Temp\jieba.cache
    Loading model cost 0.800 seconds.
    Prefix dict has been built successfully.
    ignorelist = ["流程", "程序", "大", "数", "库"]
    word_img(text_dec, "岗位要求词云", ignorelist)

(figure: job-requirements word cloud)

Job-Benefits Word Cloud

    text_fl = data_word["工作福利"].values
    text_fl = cut_words(text_fl)

    ignorelist = ["定期", "工作"]
    word_img(text_fl, "岗位福利词云", ignorelist)

(figure: job-benefits word cloud)

Job-Title Word Cloud

    text_name = data_word["职位名称"].values
    text_name = cut_words(text_name)

    word_img(text_name, "职位名称词云", [])

(figure: job-title word cloud)
