爬取智联招聘岗位并根据描述生成词云

转自:https://blog.csdn.net/qq_36381299/article/details/80634451

前言:

根据搜索相关的职位,获取职位数量,由职位数量得到职位相关页码链接,再由相关页码链接获得每个职位链接,最后由职位链接获取详细的职位描述。以上获得链接和职位描述由正则表达式完成。

环境:win7、pycharm、python2

所用到的库:urllib2 、 re、urllib、time 、jieba、matplotlib、wordcloud、numpy、PIL

文件组成:

   main.py ----主要函数文件包括获取页码链接、获取每页职位链接、获取职位描述、爬取信息保存职位描述为txt文本

   zhaopin_wordcloud.py ----根据保存文本信息生成词云

   msyh.ttf ----为生成词云准备的字体文件(微软雅黑,与代码中 font_path="msyh.ttf" 一致)

   info.txt ----保存职位描述为txt文本

代码如下:

main.py

#coding:utf-8
import urllib2
import urllib
import re
import time
#获取页码链接
def getpagelist(name):
    url = "https://sou.zhaopin.com/jobs/searchresult.ashx?"
    # 模拟浏览器头部
    headers = {
        "User-Agent": "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/60.0.1"
    }
    word = {"kw": name}  # 相关职位
    word = urllib.urlencode(word)  # 编码成字符串
    url = url + word  # 拼接url
    request = urllib2.Request(url, headers=headers)  # 发起请求
    request.add_header("Connection", "keep-alive")  # 一直活着
    response = urllib2.urlopen(request)  # 打开请求
    data = response.read()  # 读取数据
    restr = "<em>(\\d+)</em>"  # 正则表达式
    regex = re.compile(restr, re.IGNORECASE)
    mylist = regex.findall(data)  # 寻找页面所有信息
    numbers = mylist[0]
    numbers = eval(numbers)#将职位数转化为数据
    zhao_numbers = numbers  # 职位的数量
    zhao_list = []  # 空列表
    print "++++++++++++++++"
    print zhao_numbers
    # for i in range(zhao_numbers//50):
    #   print 
    if zhao_numbers % 60 == 0:  # 生成页面列表
        for i in range(zhao_numbers // 60): #智联招聘每页有60个职位 职位总数整除60就是有几页
            zhao_list.append(    #添加链接到列表
                "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B5%8E%E5%8D%97&kw=" + name + "&p=" + str(i + 1))
    else:
        for i in range(zhao_numbers // 60 + 1):
            zhao_list.append(
                "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B5%8E%E5%8D%97&kw=" + name + "&p=" + str(i + 1))
    return zhao_list
 
#获取每页中的职位链接
def get_url_list(url):
    #模拟浏览器
    headers = {
        "User-Agent": "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/60.0.1"
    }
    request = urllib2.Request(url, headers=headers)  # 发起请求
    request.add_header("Connection", "keep-alive")  # 一直活着
    response = urllib2.urlopen(request)  # 打开请求
    data = response.read()  # 读取数据
    #print data
    restr = ur"<a style=\"font-weight: bold\" par=\"ssidkey=y&ss=201&ff=03&sg=.*?;so=.*?\" href=\"(\bhttp[\s\S]..\bjobs.\w+.\w+.\w+.\w+)"  # 正则表达式,()匹配内容
    regex = re.compile(restr, re.IGNORECASE)
    mylist = regex.findall(data)  # 寻找页面所有信息
    urllist = []
    for list in mylist:
        urllist.append(list)
    return urllist
'''#错误的代码
    restr = "http://jobs.zhaopin.com/([\s\S]*?)"  # 正则表达式
    regex = re.compile(restr, re.IGNORECASE)
    tableurllist = regex.findall(tablestr)  # 寻找页面所有信息
    urllist = []
    for list in tableurllist:
        urllist.append("http://jobs.zhaopin.com/"+list+".htm")
    return urllist
'''
#获取职位描述信息
def get_zhiwei(url):
    """Return the cleaned job-description text from one job-detail page.

    Returns an empty unicode string when the description cannot be located
    (layout change, anti-scraping page, ...) instead of crashing with an
    IndexError as the original did.
    """
    # Fake a browser User-Agent so the site serves the normal HTML page.
    headers = {
        "User-Agent": "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/60.0.1"
    }
    request = urllib2.Request(url, headers=headers)
    request.add_header("Connection", "keep-alive")  # reuse the TCP connection
    response = urllib2.urlopen(request)
    data = response.read()
    # The description sits between <div class="tab-inner-cont"> and the
    # following <b> tag (non-greedy match).
    restr = "<div class=\"tab-inner-cont\">([\s\S]*?)<b>"
    regex = re.compile(restr, re.IGNORECASE)
    info = regex.findall(data)
    if not info:  # nothing matched -- return empty text instead of crashing
        return u""
    # Trim whitespace and drop the paragraph tags.
    return info[0].decode('utf-8').strip().replace("<p>", "").replace("</p>", "")
#写入文件
 
def wirtetxt(info):
    """Append unicode text *info* to info.txt as UTF-8 bytes.

    (The misspelled name is kept as-is -- the crawl loop calls ``wirtetxt``.)
    """
    # `with` guarantees the file handle is closed even if write() raises;
    # the original open()/close() pair leaked the handle on error.
    with open('info.txt', "ab+") as f:
        f.write(info.encode('utf-8'))
#file.write((info).encode('utf-8'))
    
#num=1 
#savefilepath = "workinfo.txt"
#savefile =open(savefilepath,"wb") 
zhao_list = getpagelist("python")#页码链接
for line in zhao_list:
    #print line  # 打印链接
    urllist= get_url_list(line)
    for line1 in urllist:
        #num+=1
        #print line1
        #print "共有%d"%num+"个链接"
        time.sleep(1)
        workstr= get_zhiwei(line1)
        print workstr
        print "正在写入...."
        wirtetxt(workstr)
        #savefile.write((workstr).encode("utf-8"))
 
#savefile.close()

zhaopin_wordcloud.py

#coding:utf-8
import jieba
import matplotlib
import matplotlib.pyplot as plt 
import wordcloud
from wordcloud import WordCloud,ImageColorGenerator,STOPWORDS#词云
import numpy as np #科学计算
from PIL import Image #处理图片
#打开文本
# Read the scraped job descriptions written by main.py; `with` ensures the
# file handle is closed (the original open().read() leaked it).
with open("info.txt") as f:
    testfile = f.read()

# Data cleaning: strip leftover HTML/CSS noise tokens and generic Chinese
# filler words so they do not dominate the word cloud.  Kept in the exact
# order of the original chained .replace() calls; the few duplicates
# ("margin", "熟悉", ...) are harmless no-ops on the second pass.
NOISE_WORDS = [
    "family", "span", "font", "color", "14px", "rgb",
    "size", "br", "0px", "宋体", "margin", "line",
    "style", "line", "height", "white",
    "熟悉", "nbsp", "background", "normal", "margin", "平台",
    "space", "padding", "bottom", "top",
    "技术", "工作", "text", "indent", "letter", "stretch",
    "25px", "应用", "simsun", "strong",
    "系统", "Yahei", "indent", "left", "data", "熟练",
    "Calii", "Microsoft", "Sans", "div", "serif", "19px",
    "设计", "公司", "开发", "了解",
    "熟悉", "进行", "仿宋", "负责", "border", "专业",
    "space", "padding", "优先", "top",
    "技术", "工作", "研发", "要求", "任职", "相关",
    "岗位职责", "计算", "上学", "学历",
]
for noise in NOISE_WORDS:
    testfile = testfile.replace(noise, "")

wordlist = jieba.cut(testfile, cut_all=True)  # full-mode segmentation
space_list = " ".join(wordlist)  # WordCloud expects space-separated tokens
backgroud = np.array(Image.open("1.jpg"))  # mask and colour-source image
mywordcloud = WordCloud(background_color="white",      # canvas colour
                        mask=backgroud,                # draw inside the image shape
                        stopwords=STOPWORDS,           # built-in English stopwords
                        font_path="msyh.ttf",          # CJK-capable font (Microsoft YaHei)
                        max_font_size=100,             # cap for the largest word
                        random_state=30,               # deterministic layout
                        scale=1).generate(space_list)  # build the cloud
# BUG FIX: the colour generator was created but never applied in the
# original -- recolor() actually maps the background image's colours
# onto the words before display.
image_color = ImageColorGenerator(backgroud)
plt.imshow(mywordcloud.recolor(color_func=image_color))
plt.axis("off")  # hide the axes around the image
plt.show()

main.py 运行效果:

zhaopin_wordcloud.py  运行生成词云图片:

posted @ 2019-12-10 15:28  PythonGirl  阅读(527)  评论(0)    收藏  举报