'''
@author:zl
@contact:
@site: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
'''
# _*_ coding:utf-8 _*_
import requests
from bs4 import BeautifulSoup
import re
import time
from pymongo import MongoClient
import xlwt
# HTTP request headers that mimic a desktop Chrome browser, so 51job serves
# the normal search page instead of rejecting the script as a bot.
# NOTE(review): 'Host' pins these headers to search.51job.com — reuse for
# other hosts would need that entry changed.
headers = {
'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36" ,
'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
'accept-encoding': "gzip, deflate, br",
'accept-language': "zh-CN,zh;q=0.9",
'cache-control': "max-age=0",
'upgrade-insecure-requests': "1",
'Connection': 'keep-alive',
'Host': "search.51job.com",
}
# Fetch the raw HTML source of one search-result page
def get_content(page, timeout=10):
    """Download page *page* of the 51job "python" search results.

    Args:
        page: 1-based result-page number spliced into the listing URL.
        timeout: seconds to wait for the server (new parameter, default 10;
            previously no timeout was set, so a stalled connection could
            hang the scraper forever).

    Returns:
        The page's HTML as a decoded ``str``.
    """
    url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,'
           + str(page) + '.html')
    req = requests.get(url, headers=headers, timeout=timeout)
    # 51job serves this page GBK-encoded, not UTF-8, so decode explicitly.
    html = req.content.decode('gbk')
    return html
# Extract the job fields from a listing page's HTML
def get(html):
    """Scrape (title, company, location, salary, date) tuples from *html*.

    Returns a list of 5-tuples, one per job posting matched; an empty
    list when the markup contains no postings.
    """
    pattern = re.compile(
        r'<p class="t1 ">.*?<a target="_blank" title="(.*?)" '
        r'.*?<span class="t2"><a target="_blank" title="(.*?)" '
        r'.*?<span class="t3">(.*?)</span>'
        r'.*?<span class="t4">(.*?)</span>'
        r'.*?<span class="t5">(.*?)</span>',
        re.S,  # DOTALL: each posting spans multiple source lines
    )
    return pattern.findall(html)
# Write the scraped postings into the worksheet, one row per job
def excel_write(items, index):
    """Write each 5-field job tuple into the module-level worksheet ``ws``.

    Args:
        items: iterable of 5-tuples (title, company, location, salary, date).
        index: worksheet row at which the first tuple is written.

    Each field is echoed to stdout and stored in columns 0-4 of its row.
    """
    row = index
    for item in items:
        for col, value in enumerate(item[:5]):
            print(value)
            ws.write(row, col, value)  # (row, column, cell value)
        row += 1
if __name__ == '__main__':
    newTable = "test.xls"  # output workbook filename
    wb = xlwt.Workbook(encoding='utf-8')  # workbook; utf-8 for Chinese cell text
    ws = wb.add_sheet('sheet1')  # single sheet all rows go into
    # Header row labels: job title, company, location, salary, posting date.
    headData = ['招聘职位', '公司', '地址', '薪资', '日期']
    # Build the style object once instead of once per column.
    bold = xlwt.easyxf('font: bold on')
    # enumerate() keeps the column count in sync with headData (the old
    # hard-coded range(0, 5) would silently drop or overrun columns if the
    # label list changed).
    for colnum, title in enumerate(headData):
        ws.write(0, colnum, title, bold)
    # Scrape result pages 1-9. Each 51job page holds 50 postings, so page
    # N's rows start at (N-1)*50 + 1 (row 0 is the header).
    for each in range(1, 10):
        index = (each - 1) * 50 + 1
        excel_write(get(get_content(each)), index)
        # Save after every page so a mid-run network failure still leaves
        # the pages fetched so far on disk; the final file is identical to
        # a single save at the end.
        wb.save(newTable)