'''
@author:zl
@contact:
@site: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
'''
# _*_ coding:utf-8 _*_
import requests
from bs4 import BeautifulSoup
import re
import time
from pymongo import MongoClient
import xlwt
# HTTP request headers that mimic a desktop Chrome browser, so 51job serves
# the normal search page instead of rejecting the script as a bot.
# NOTE(review): 'Host' pins these headers to search.51job.com — reuse for
# other hosts would need that entry changed.
headers = {
'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36" ,
'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
'accept-encoding': "gzip, deflate, br",
'accept-language': "zh-CN,zh;q=0.9",
'cache-control': "max-age=0",
'upgrade-insecure-requests': "1",
'Connection': 'keep-alive',
'Host': "search.51job.com",
}
# Fetch the raw HTML source of one search-result page
def get_content(page, timeout=10):
    """Download page *page* of the 51job "python" search results.

    Args:
        page: 1-based result-page number spliced into the listing URL.
        timeout: seconds to wait for the server (new parameter, default 10;
            previously no timeout was set, so a stalled connection could
            hang the scraper forever).

    Returns:
        The page's HTML as a decoded ``str``.
    """
    url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,'
           + str(page) + '.html')
    req = requests.get(url, headers=headers, timeout=timeout)
    # 51job serves this page GBK-encoded, not UTF-8, so decode explicitly.
    html = req.content.decode('gbk')
    return html
# Extract the job fields from a listing page's HTML
def get(html):
    """Scrape (title, company, location, salary, date) tuples from *html*.

    Returns a list of 5-tuples, one per job posting matched; an empty
    list when the markup contains no postings.
    """
    pattern = re.compile(
        r'<p class="t1 ">.*?<a target="_blank" title="(.*?)" '
        r'.*?<span class="t2"><a target="_blank" title="(.*?)" '
        r'.*?<span class="t3">(.*?)</span>'
        r'.*?<span class="t4">(.*?)</span>'
        r'.*?<span class="t5">(.*?)</span>',
        re.S,  # DOTALL: each posting spans multiple source lines
    )
    return pattern.findall(html)
# Write the scraped postings into the worksheet, one row per job
def excel_write(items, index):
    """Write each 5-field job tuple into the module-level worksheet ``ws``.

    Args:
        items: iterable of 5-tuples (title, company, location, salary, date).
        index: worksheet row at which the first tuple is written.

    Each field is echoed to stdout and stored in columns 0-4 of its row.
    """
    row = index
    for item in items:
        for col, value in enumerate(item[:5]):
            print(value)
            ws.write(row, col, value)  # (row, column, cell value)
        row += 1
if __name__ == '__main__':
    newTable = "test.xls"  # output workbook filename
    wb = xlwt.Workbook(encoding='utf-8')  # workbook; utf-8 for Chinese cell text
    ws = wb.add_sheet('sheet1')  # single sheet all rows go into
    # Header row labels: job title, company, location, salary, posting date.
    headData = ['招聘职位', '公司', '地址', '薪资', '日期']
    # Build the style object once instead of once per column.
    bold = xlwt.easyxf('font: bold on')
    # enumerate() keeps the column count in sync with headData (the old
    # hard-coded range(0, 5) would silently drop or overrun columns if the
    # label list changed).
    for colnum, title in enumerate(headData):
        ws.write(0, colnum, title, bold)
    # Scrape result pages 1-9. Each 51job page holds 50 postings, so page
    # N's rows start at (N-1)*50 + 1 (row 0 is the header).
    for each in range(1, 10):
        index = (each - 1) * 50 + 1
        excel_write(get(get_content(each)), index)
        # Save after every page so a mid-run network failure still leaves
        # the pages fetched so far on disk; the final file is identical to
        # a single save at the end.
        wb.save(newTable)