# Crawler: simulate a browser sending requests, download the page source, extract only the useful data, and store it in a database or a file.
# Plan: implement it with Python, using the requests library to send requests and parse the responses, and an Excel sheet to store the data.
# Crawler workflow (a minimal sketch follows this list; the full 51job script starts below):
# 1. Simulate a browser sending a request
# 2. Download the page source
# 3. Extract only the useful data
# 4. Store it in a database or a file
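# A rough illustration of those four steps against a hypothetical page (kept as
# comments so it does not run alongside the real script below):
#
#   import requests, re
#   resp = requests.get('https://example.com')                     # 1 + 2: send the request, download the page
#   titles = re.findall('<title>(.*?)</title>', resp.text, re.S)   # 3: extract with a regex
#   with open('titles.txt', 'w', encoding='utf-8') as f:           # 4: store in a file
#       f.write('\n'.join(titles))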

#--------------------------------------------------------------------------------
# Excel initialization: create a new Excel workbook and add a worksheet
# Import modules
import requests, re
import xlwt

workBook = xlwt.Workbook(encoding='utf-8')  # still only in memory at this point
# Create a worksheet
workSheet = workBook.add_sheet('51job')
# Build the header row
colName = ['岗位名称', '公司名称', '地点', '薪资', '发布时间']  # job title, company, location, salary, posting date
# Write the header row into the sheet
for one in range(len(colName)):  # 0, 1, 2, 3, 4
    # Write one cell
    workSheet.write(0, one, colName[one])  # row index, column index, content
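
# Note: xlwt writes the legacy .xls format (at most 65536 rows per sheet), and nothing
# touches the disk until workBook.save() is called at the end of the script.
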
#--------------------------------------------------------------------------------

# Get the number of result pages on the site; wrap it in a function
# The page count appears in the HTML as: <span class="td">共7页,到第</span>
def get_pagenum():
    web_url = 'https://search.51job.com/list/080200,000000,0000,00,9,07,%25E8%25BD%25AF%25E4%25BB%25B6%25E6%25B5%258B%25E8%25AF%2595,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    resp = requests.get(web_url)  # send the request
    resp.encoding = 'gbk'  # fix garbled characters (the page is GBK-encoded)
    pages = int(re.findall('<span class="td">共(.*?)页,到第</span>', resp.text, re.S)[0])
    return pages
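
# Optional, more defensive variant (a sketch, not used below): if 51job changes its
# markup, re.findall returns an empty list and indexing [0] raises IndexError, so this
# version falls back to a single page instead of crashing.
def get_pagenum_safe(url):
    resp = requests.get(url)
    resp.encoding = 'gbk'
    matches = re.findall('<span class="td">共(.*?)页,到第</span>', resp.text, re.S)
    return int(matches[0]) if matches else 1
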
#--------------------------------------------------------------------------------

# Goal: collect the job postings that match the search
row = 1  # the first data row is 1, because row 0 holds the header
# 1. Build the request
for one in range(1, get_pagenum() + 1):
    web_url = f'https://search.51job.com/list/080200,000000,0000,00,9,07,%25E8%25BD%25AF%25E4%25BB%25B6%25E6%25B5%258B%25E8%25AF%2595,2,{one}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    resp = requests.get(web_url)  # send the request
    resp.encoding = 'gbk'  # fix garbled characters
    # print(resp.request.headers)  # request headers
    # print(resp.request.body)  # request body
    # 2. Parse the response data
    # print(resp.text)  # debug: dump the raw HTML of the current page
    # 3. Extract the useful data
    # re.S makes '.' also match newlines, which the markup is full of
    info = re.findall('<div class="el">(.*?)</div>', resp.text, re.S)
    for line in info:
        # 1. Job title
        # Both the job link and the company link match this pattern, so temp holds two titles
        temp = re.findall('<a target="_blank" title="(.*?)" href', line, re.S)
        # re.findall returns a list, so index into it to get the value
        jobName = temp[0].strip()
        workSheet.write(row, 0, jobName)
        # 2. Company name (second match of the same pattern)
        company = temp[1].strip()
        workSheet.write(row, 1, company)
        # 3. Location
        address = re.findall('<span class="t3">(.*?)</span>', line, re.S)[0]
        workSheet.write(row, 2, address)
        # 4. Salary
        salary = re.findall('<span class="t4">(.*?)</span>', line, re.S)[0]
        workSheet.write(row, 3, salary)
        # 5. Posting date
        jobTime = re.findall('<span class="t5">(.*?)</span>', line, re.S)[0]
        workSheet.write(row, 4, jobTime)
        row += 1
        print(jobName, company, address, salary, jobTime)

#--------------------------------------------------------------------------------

# 4. Store the data
# Save the workbook to disk
workBook.save('D:\\51.job.xls')
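# The D:\ path assumes a Windows machine; a relative path such as '51job.xls' would
# work on any operating system.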

#--------------------------------------------------------------------------------

# Sample of one job listing block (<div class="el">) from the search results page,
# kept for reference; the regular expressions above are written against this markup:
'''
<div class="el">
    <p class="t1 ">
        <em class="check" name="delivery_em" onclick="checkboxClick(this)"></em>
        <input class="checkbox" type="checkbox" name="delivery_jobid" value="85295957" jt="0" style="display:none">
        <span>
            <a target="_blank" title="软件测试工程师" href="https://jobs.51job.com/hangzhou-yhq/85295957.html?s=01&t=0" onmousedown="">
                软件测试工程师 </a>
        </span>
    </p>
    <span class="t2"><a target="_blank" title="杭州老板电器股份有限公司" href="https://jobs.51job.com/all/co2322348.html">杭州老板电器股份有限公司</a></span>
    <span class="t3">杭州-余杭区</span>
    <span class="t4">10-15万/年</span>
    <span class="t5">03-21</span>
</div>
'''
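
# For reference, applying the extraction patterns to the sample block above (call the
# string `sample`) yields:
#   re.findall('<a target="_blank" title="(.*?)" href', sample, re.S)
#       -> ['软件测试工程师', '杭州老板电器股份有限公司']   # job title, then company name
#   re.findall('<span class="t3">(.*?)</span>', sample, re.S)  -> ['杭州-余杭区']
#   re.findall('<span class="t4">(.*?)</span>', sample, re.S)  -> ['10-15万/年']
#   re.findall('<span class="t5">(.*?)</span>', sample, re.S)  -> ['03-21']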