Scraping BOSS直聘 job listings (has a known bug)

The script below pulls recommended job listings from BOSS直聘's wapi endpoint, parses the JSON response, and writes the results to both a MySQL table and a CSV file. Known bug: the cookie expires after roughly five requests and has to be replaced by hand (see the comment in the __main__ block).

import csv
import json
import time

import pymysql
import requests
from urllib.parse import urlencode

# accumulates CSV rows across all pages
csv_data = []

# request one result page and collect its job records
def spider_boss(url, data):
    req = requests.post(url, headers=headers, data=data, timeout=5).text
    resp = json.loads(req)
    if 'jobList' not in (resp.get('zpData') or {}):
        print('Cookie expired, please replace it...')
        exit()
    # build a fresh list per page, so each page's records are returned (and
    # inserted into MySQL) only once instead of re-inserting earlier pages
    page_data = []
    try:
        for i in resp['zpData']['jobList']:
            title = i['jobName']  # job title
            jobDegree = i['jobDegree']  # required degree
            brandName = i['brandName']  # company name
            # district plus business area; businessDistrict may be empty
            areaDistrict = i['areaDistrict'] + (i.get('businessDistrict') or '')
            salaryDesc = i['salaryDesc']  # salary
            jobExperience = i['jobExperience']  # required experience
            urladdr = 'https://www.zhipin.com/job_detail/' + i['encryptJobId'] + '.html'  # detail page url
            skills = ','.join(i['skills'])  # skill keywords
            welfareList = ','.join(i['welfareList'])  # company benefits
            record = {
                'title': title,
                'jobDegree': jobDegree,
                'brandName': brandName,
                'areaDistrict': areaDistrict,
                'salaryDesc': salaryDesc,
                'jobExperience': jobExperience,
                'urladdr': urladdr,
                'skills': skills,
                'welfareList': welfareList,
            }
            page_data.append(record)
            print(record)  # print each record as it is collected
        return page_data
    except (KeyError, TypeError):
        print('Parse failed, check whether the cookies have expired...')
        time.sleep(2)
        return spider_boss(url, data)  # note: retries with no upper bound, as in the original
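
# For reference, the response shape the parser above assumes (inferred from the
# field accesses in the code; the values below are illustrative, not real data):
#
#   {
#     "zpData": {
#       "jobList": [
#         {"jobName": "...", "jobDegree": "...", "brandName": "...",
#          "areaDistrict": "...", "businessDistrict": "...",
#          "salaryDesc": "...", "jobExperience": "...",
#          "encryptJobId": "...", "skills": ["..."], "welfareList": ["..."]}
#       ]
#     }
#   }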


# store the records in the MySQL database
def server(data):
    # connect to the database
    con = pymysql.connect(host='127.0.0.1', user='root', password='root', db='boss', charset='utf8')
    cursor = con.cursor()
    sql = ("INSERT INTO heji(title, jobDegree, brandName, areaDistrict, salaryDesc, "
           "jobExperience, urladdr, skills, welfareList) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    for record in data:
        try:
            # parameterized query: pymysql escapes the values, so quotes inside
            # the data no longer break the statement
            cursor.execute(sql, (record['title'], record['jobDegree'], record['brandName'],
                                 record['areaDistrict'], record['salaryDesc'], record['jobExperience'],
                                 record['urladdr'], record['skills'], record['welfareList']))
            con.commit()
        except pymysql.MySQLError:
            con.rollback()
            print('Insert failed, please check the SQL statement...')
    print('Database write finished...')
    # close the connection when done
    con.close()
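
# The INSERT above assumes a table named `heji` already exists in the `boss`
# database. A minimal schema sketch that matches the inserted columns (the
# column names come from the code; the types are an assumption, not from the
# original post):
#
#   CREATE TABLE heji (
#       title         VARCHAR(255),
#       jobDegree     VARCHAR(64),
#       brandName     VARCHAR(255),
#       areaDistrict  VARCHAR(255),
#       salaryDesc    VARCHAR(64),
#       jobExperience VARCHAR(64),
#       urladdr       VARCHAR(512),
#       skills        TEXT,
#       welfareList   TEXT
#   ) DEFAULT CHARSET=utf8;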

def server_csv(data):
    # flatten each record dict into a row for the CSV writer
    for record in data:
        csv_row = [record['title'], record['jobDegree'], record['brandName'], record['areaDistrict'],
                   record['salaryDesc'], record['jobExperience'], record['urladdr'], record['skills'],
                   record['welfareList']]
        csv_data.append(csv_row)
    # rewrite the whole file on every call, so it always contains every page
    # collected so far (the filename '求职信息.csv' means "job listings.csv")
    with open('求职信息.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # header: title, degree, company, district, salary, experience, url, skills, benefits
        writer.writerow(['职业', '学历', '公司名称', '地区', '工资', '经验', 'url地址', '工作内容', '公司福利'])
        writer.writerows(csv_data)
    print('CSV file written successfully...')
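
# Note: if the CSV is meant to be opened directly in Excel, plain utf-8 often
# shows garbled Chinese text; writing with a byte-order mark avoids that. A
# sketch of the same open() call using 'utf-8-sig':
#
#   with open('求职信息.csv', 'w', encoding='utf-8-sig', newline='') as f:
#       ...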

def start(url, data):
    # crawl one page
    list_data = spider_boss(url, data)
    # store in the MySQL database
    server(list_data)
    # store in the CSV file
    server_csv(list_data)

if __name__ == '__main__':
    # target endpoint
    url = 'https://www.zhipin.com/wapi/zpgeek/recommend/job/list.json'
    # known bug: the cookie expires after about five requests and must be replaced manually
    cookie = 'lastCity=101280100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1620300454; wt2=DGYNTwimCoXqwtqG0OOhtIK-83huFIIq0ojwNVqE4S9zEJUvL9jIT_i-Bpp2Ywvg1uzq6xL9ISxkK3mWPiuo_wg~~; _bl_uid=OIkUzos5cUbttt2O0oLtxO6sOkh3; __fid=d60538005c9afb998b6d815b2f9a381b; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1620361396; __c=1620300454; __l=l=%2Fwww.zhipin.com%2Fweb%2Fgeek%2Frecommend%3Fka%3Dheader-username&s=3&friend_source=0&s=3&friend_source=0; __a=89029989.1620300454..1620300454.52.1.52.52; geek_zp_token=V1QNslFuz_21xqVtRvzR0bLyOw7j7TxS0~; __zp_stoken__=3ac4cMT5GbxZwTFwEO3t%2FMH9udkY6L2NQbWxKBllkYjB0HTBGc3NgeF5HHHdBIjZ5Wl0HUBF%2BBwYrLTU2IS9TBVItewtYLj5mezdvWQRBIwEfTklDWAB8FQ8uU11WRF5can4XbRstDmxcYAYW'
    # request headers
    headers = {
        "referer": "https://www.zhipin.com/web/geek/recommend?expectId=136314657&sortType=1&page=4&districtCode=0&cityCode=101280100",
        "cookie": cookie,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    }
    # crawl result pages 1 through 29
    for num in range(1, 30):
        data = {
            "expectId": "136314657",
            "sortType": 1,
            "page": num,
            "salary": "",
            "payType": "",
            "degree": "",
            "experience": "",
            "stage": "",
            "scale": "",
            "districtCode": 0,
            "businessCode": "",
        }
        # urlencode turns the dict into a query string,
        # e.g. "expectId=136314657&sortType=1&page=1&..."
        data = urlencode(data)
        start(url, data)
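
# A quick sanity check after a run (a separate sketch, not part of the script
# above): read the CSV back and print the rows.
#
#   import csv
#   with open('求职信息.csv', encoding='utf-8') as f:
#       for row in csv.reader(f):
#           print(row)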