爬取智联招聘岗位并根据描述生成词云
转自:https://blog.csdn.net/qq_36381299/article/details/80634451
前言:
根据搜索相关的职位,获取职位数量,由职位数量得到职位相关页码链接,再由相关页码链接获得每个职位链接,最后由职位链接获取详细的职位描述。以上获得链接和职位描述由正则表达式完成。
环境:Win7、PyCharm、Python 2
所用到的库:urllib2、re、urllib、time、jieba、matplotlib、wordcloud、numpy、PIL
文件组成:
main.py ----主要函数文件包括获取页码链接、获取每页职位链接、获取职位描述、爬取信息保存职位描述为txt文本
zhaopin_wordcloud.py ----根据保存文本信息生成词云
msyh.ttf ----为生成词云准备的字体文件
info.txt ----保存职位描述为txt文本
代码如下:
main.py
#coding:utf-8
import urllib2
import urllib
import re
import time
#获取页码链接
def getpagelist(name):
    """Return the search-result page URLs for the job keyword *name*.

    The search page is fetched once, the total number of matching jobs is
    read from the first ``<em>N</em>`` element of the response, and one URL
    is generated per result page (60 jobs per page).

    Args:
        name: Job keyword to search for, e.g. ``"python"``.

    Returns:
        A list of page URLs numbered ``p=1`` .. ``p=<page count>``. The list
        is empty when the response contains no job count or the count is 0.

    Raises:
        urllib2.URLError: If the search page cannot be fetched.
    """
    url = "https://sou.zhaopin.com/jobs/searchresult.ashx?"
    # Pretend to be a desktop browser.
    headers = {
        "User-Agent": "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/60.0.1"
    }
    # URL-encoded "kw=<name>" pair, reused for every generated page URL so the
    # keyword is escaped the same way in the count request and the page links.
    word = urllib.urlencode({"kw": name})

    request = urllib2.Request(url + word, headers=headers)
    request.add_header("Connection", "keep-alive")
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    mylist = re.findall(r"<em>(\d+)</em>", data, re.IGNORECASE)
    if not mylist:
        # No job count on the page: there is nothing to paginate.
        return []

    # int() instead of eval(): eval would execute text taken from the page and
    # would read a zero-padded count such as "010" as octal on Python 2.
    zhao_numbers = int(mylist[0])

    # Single-argument print(...) behaves like the print statement on Python 2.
    print("++++++++++++++++")
    print(zhao_numbers)

    # 60 jobs per result page; round up so a partial last page is included.
    page_count = (zhao_numbers + 59) // 60

    # NOTE(review): the count request above carries no `jl` (city) filter,
    # while the page URLs below are restricted to jl=%E6%B5%8E%E5%8D%97, so
    # the page count may not match the filtered result set -- confirm.
    page_prefix = ("http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B5%8E%E5%8D%97&"
                   + word + "&p=")
    return [page_prefix + str(page) for page in range(1, page_count + 1)]
#获取每页中的职位链接
def get_url_list(url):
    """Return the job-detail links found on one search-result page.

    Args:
        url: URL of a search-result page.

    Returns:
        A list holding the ``href`` value captured for every job-title
        anchor matched on the page, in page order. Empty when nothing
        matches.

    Raises:
        urllib2.URLError: If the page cannot be fetched.
    """
    # Pretend to be a desktop browser.
    headers = {
        "User-Agent": "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/60.0.1"
    }
    request = urllib2.Request(url, headers=headers)
    request.add_header("Connection", "keep-alive")
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    # The capture group is the link target of each bold job-title anchor.
    # The pattern is pure ASCII, so a plain raw string matches exactly what
    # the former unicode-raw literal matched.
    #
    # An earlier attempt used "http://jobs.zhaopin.com/([\s\S]*?)" and then
    # appended ".htm". It could not work: a lazy group at the very end of a
    # pattern always captures the empty string, so every link came out the
    # same.
    restr = r"<a style=\"font-weight: bold\" par=\"ssidkey=y&ss=201&ff=03&sg=.*?;so=.*?\" href=\"(\bhttp[\s\S]..\bjobs.\w+.\w+.\w+.\w+)"
    regex = re.compile(restr, re.IGNORECASE)
    # findall already returns a fresh list of the captured links.
    return regex.findall(data)
#获取职位描述信息
def get_zhiwei(url):
    """Fetch one job page and return its description text.

    The description is the content between ``<div class="tab-inner-cont">``
    and the next ``<b>`` tag. It is decoded as UTF-8, stripped of
    surrounding whitespace, and has its ``<p>`` / ``</p>`` tags removed.
    Any other markup inside the description is left as it is.

    Args:
        url: URL of a job-detail page.

    Returns:
        The description as a unicode string, or ``u""`` when the page does
        not contain the description block, so that one unusual page does
        not abort the whole crawl.

    Raises:
        urllib2.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the matched content is not valid UTF-8.
    """
    # Pretend to be a desktop browser.
    headers = {
        "User-Agent": "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/60.0.1"
    }
    request = urllib2.Request(url, headers=headers)
    request.add_header("Connection", "keep-alive")
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    info = re.findall(r'<div class="tab-inner-cont">([\s\S]*?)<b>', data, re.IGNORECASE)
    if not info:
        return u""
    # Only the first match is used; paragraph tags are deleted, not replaced
    # by spaces.
    return info[0].decode('utf-8').strip().replace("<p>", "").replace("</p>", "")
#写入文件
def wirtetxt(info):
    """Append *info*, encoded as UTF-8, to ``info.txt`` in the working directory.

    The file is created if it does not exist. Nothing is written between
    successive calls, so consecutive texts are stored back to back.

    Args:
        info: Text to append (a unicode string).
    """
    # The with-block closes the file even if encoding or writing raises.
    with open('info.txt', "ab+") as out:
        out.write(info.encode('utf-8'))
#file.write((info).encode('utf-8'))
#num=1
#savefilepath = "workinfo.txt"
#savefile =open(savefilepath,"wb")
# Crawl driver: walk every result page for the "python" keyword, fetch each
# job's description, echo it, and append it to info.txt.
zhao_list = getpagelist("python")
for page_url in zhao_list:
    for job_url in get_url_list(page_url):
        # Wait one second before each job-page request.
        time.sleep(1)
        description = get_zhiwei(job_url)
        # Single-argument print(...) behaves like the print statement on
        # Python 2.
        print(description)
        print("正在写入....")
        wirtetxt(description)
#savefile.write((workstr).encode("utf-8"))
#savefile.close()
zhaopin_wordcloud.py
#coding:utf-8
import jieba
import matplotlib
import matplotlib.pyplot as plt
import wordcloud
from wordcloud import WordCloud,ImageColorGenerator,STOPWORDS#词云
import numpy as np #科学计算
from PIL import Image #处理图片
#打开文本
# Load the scraped job descriptions; the with-block closes the file once read.
with open("info.txt") as source:
    testfile = source.read()

# Data cleaning: leftover HTML/CSS fragments and very common job-ad words are
# deleted from the text. The tokens are applied one after another in exactly
# this order, and repeated entries are kept on purpose: an earlier removal can
# join two fragments into a new occurrence of a later token, so dropping the
# repeats could change the result.
NOISE_TOKENS = (
    "family", "span", "font", "color", "14px", "rgb",
    "size", "br", "0px", "宋体", "margin", "line",
    "style", "line", "height", "white",
    "熟悉", "nbsp", "background", "normal", "margin", "平台",
    "space", "padding", "bottom", "top",
    "技术", "工作", "text", "indent", "letter", "stretch",
    "25px", "应用", "simsun", "strong",
    "系统", "Yahei", "indent", "left", "data", "熟练",
    "Calii", "Microsoft", "Sans", "div", "serif", "19px",
    "设计", "公司", "开发", "了解",
    "熟悉", "进行", "仿宋", "负责", "border", "专业",
    "space", "padding", "优先", "top",
    "技术", "工作", "研发", "要求", "任职", "相关",
    "岗位职责", "计算", "上学", "学历",
)
for token in NOISE_TOKENS:
    testfile = testfile.replace(token, "")

# Segment the text with jieba in full mode and join the pieces with spaces,
# which is the input format WordCloud.generate() expects.
wordlist = jieba.cut(testfile, cut_all=True)
space_list = " ".join(wordlist)

# Image whose pixels are passed to WordCloud as the drawing mask.
backgroud = np.array(Image.open("1.jpg"))

mywordcloud = WordCloud(
    background_color="white",  # canvas colour
    mask=backgroud,            # mask array built from 1.jpg
    stopwords=STOPWORDS,       # wordcloud's built-in stop-word set
    font_path="msyh.ttf",      # font file used to render the words
    max_font_size=100,         # upper bound for the largest word
    random_state=30,           # fixed seed (not a word count)
    scale=1,
).generate(space_list)

# NOTE(review): image_color is built but never used below, so the cloud keeps
# WordCloud's default colours. If colouring from the image was intended,
# something like mywordcloud.recolor(color_func=image_color) is presumably
# missing -- confirm before changing.
image_color = ImageColorGenerator(backgroud)

# Show the rendered cloud without axes.
plt.imshow(mywordcloud)
plt.axis("off")
plt.show()
main.py 运行效果:
zhaopin_wordcloud.py 运行生成词云图片:


浙公网安备 33010602011771号