[Web Scraper] Project: Boss直聘 (Boss Zhipin)
import requests, urllib, redis, pymongo, time, re, random, xlsxwriter, os, openpyxl, json, csv, pandas as pd
from urllib import request
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from Experiments.create_proxy_auth_extension import create_proxy_auth_extension
# Fields to extract: job title, work location, annual salary, experience requirement, education, skill tags, detail link, hiring company, and contact person
def get_proxy():
    # Call the proxy API endpoint to obtain one proxy IP
    api_url = "http://dps.kdlapi.com/api/getdps/?orderid=923923091789065&num=1&pt=1&sep=1&signature=fb2rysmvahtgud51wx36y3zy0guhpk30&dedup=1&whitelist=1"
    # The API returns the proxy IP as plain text
    proxy_ip = requests.get(api_url).text
    print(proxy_ip)
    # Username/password authentication for the proxy
    username = "13645"
    password = "bfnxkoxg"
    proxies = {
        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy_ip},
        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy_ip}
    }
    return proxies, proxy_ip
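# Hedged usage sketch (not called anywhere below; the main flow keeps the proxy lines
# commented out): how the proxies dict returned by get_proxy() could be passed to
# requests. The 10-second timeout is an illustrative assumption.
def fetch_via_proxy(url, headers):
    proxies, proxy_ip = get_proxy()
    try:
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        resp.encoding = 'utf-8'
        return resp
    except requests.RequestException as err:
        print("proxy %s failed: %s" % (proxy_ip, err))
        return None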
def get_cookies():
    # JS snippet that hides navigator.webdriver so the site is less likely to flag Selenium
    script = '''
    Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined
    })
    '''
    browser = webdriver.Chrome(options=options)
    browser.get('http://httpbin.org/get')
    time.sleep(2)
    browser.delete_all_cookies()
    # Inject the snippet before every new document is loaded
    browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": script})
    browser.get("https://www.zhipin.com/c101230100/?query=%E5%A4%A7%E6%95%B0%E6%8D%AE&page=7&ka=page-7")
    cookie = {}
    wait = WebDriverWait(browser, 200)
    # Wait for the search input to appear before reading the cookies
    wait.until(EC.presence_of_element_located((By.XPATH, '//p[@class="ipt-wrap"]/input')))
    for i in browser.get_cookies():
        cookie[i["name"]] = i["value"]
    # Serialize the cookies into a "name=value;name=value" header string
    cookie = ";".join(['%s=%s' % (i, cookie[i]) for i in cookie])
    browser.quit()
    return cookie
def use_requests(url):
    req = requests.get(url, headers=headers)
    req.encoding = 'utf-8'
    return req
def use_urllib(url):
    req = urllib.request.Request(url, headers=headers)
    source = urllib.request.urlopen(req)
    return source
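# Note: use_urllib() returns an http.client.HTTPResponse, which has no .text attribute,
# while the parse_* functions below read source.text. A minimal adapter sketch (an
# assumption, not part of the original flow) for anyone switching to use_urllib():
def urllib_response_to_text(resp):
    return resp.read().decode('utf-8')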
def parse_re(source):
    source = source.text
    txt = re.findall(r'<div class="job-list">(.*?)</ul>', source, re.S)[0]
    li = re.findall('<li>(.*?)</li>', txt, re.S)
    for i in li:
        # 1. Detail link
        href = "https://www.zhipin.com" + re.search(r'<span class="job-name"><a href="(.*?)"', i).group(1)
        # 2. Job title
        name = re.search(r'<span class="job-name">.*?title="(.*?)" target', i).group(1)
        # 3. Work location (read from the detail page via Selenium)
        # site = re.search('<span class="job-area">(.*?)</span>', i).group(1)
        drive.get(href)
        wait = WebDriverWait(drive, 100)
        wait.until(EC.presence_of_element_located((By.XPATH, '//div[@class="job-location-map js-open-map"]')))
        site = drive.find_element(By.XPATH, '//div[@class="job-location-map js-open-map"]').get_attribute('data-content')
        # 4. Salary
        salary = re.search('<span class="red">(.*?)</span>', i).group(1)
        # 5. Experience requirement
        experience = re.search(r'<div class="job-limit clearfix">.*?<p>(.*?)<em class="vline">', i, re.S).group(1)
        # 6. Education
        tmp = re.search(r'<span class="red">.*?</span>(.*?)<div class="info-publis">', i, re.S).group(1)
        tmp = re.sub(r"<.*?>", " ", tmp).strip().split(' ')
        if len(tmp) != 2:
            education = tmp[-1]
            experience = " ".join(tmp[0:2])
        else:
            education = "/".join(tmp[1:])
        # 7. Skill tags
        tags = re.findall(r'<span class="tag-item">(.*?)</span>', i)
        tags = " / ".join(tags)
        # 8. Hiring company
        try:
            company = re.search(r'<div class="company-text">.*?target="_blank">(.*?)</a></h3>', i, re.S).group(1)
        except:
            company = re.search(r'<div class="company-text">.*?custompage" >(.*?)</a></h3>', i, re.S).group(1)
        if "..." in company:
            # Company name is truncated on the list page; take the full name from the detail page
            company = drive.find_element(By.XPATH, '//div[@class="job-sec"]/div[@class="name"]').text
        # 9. Contact person
        contacts = re.search(r'<img class="icon-chat" src=".*?"/>(.*?)<em', i, re.S).group(1)
        data = [name, site, salary, experience, education, tags, company, contacts, href]
        print(data)
        datalist.append(data)
        time.sleep(random.randint(6, 13))
def parse_xpath(source):
    root_elem = etree.HTML(source.text)
    main_text = root_elem.xpath('//div[@id="main"]/div/div[3]/ul')
    name = main_text[0].xpath('li/div/div[1]/div[1]/div/div[1]/span[1]/a//text()')
    site = main_text[0].xpath('li/div/div[1]/div[1]/div/div[1]/span[2]/span//text()')
    salary = main_text[0].xpath('li/div/div[1]/div[1]/div/div[2]/span//text()')
    experience = main_text[0].xpath('li/div/div[1]/div[1]/div/div[2]/p//text()[1]')
    education = main_text[0].xpath('li/div/div[1]/div[1]/div/div[2]/p//text()[2]')
    contacts = main_text[0].xpath('li/div/div[1]/div[1]/div/div[2]/div/h3//text()[1]')
    company = main_text[0].xpath('li/div/div[1]/div[2]/div/h3/a//text()')
    tags = [' / '.join(node.itertext()).replace(" ", "").strip() for node in
            main_text[0].xpath('li/div/div[2]/div[1]')]
    href_tmp = main_text[0].xpath('li/div/div[1]/div[1]/div/div[1]/span/a/@href')
    href = ['https://www.zhipin.com' + href_tmp[i] for i in range(0, len(href_tmp))]
    for i in range(0, len(href)):
        if "..." in company[i]:
            # Company name is truncated on the list page; take the full name from the detail page
            drive.get(href[i])
            time.sleep(2)
            company[i] = drive.find_element(By.XPATH, '//div[@class="job-sec"]/div[@class="name"]').text
        data = [name[i], site[i], salary[i], experience[i], education[i], tags[i], company[i], contacts[i], href[i]]
        print(data)
        datalist.append(data)
def parse_bs4(source):
    soup = BeautifulSoup(source.text, 'html.parser')
    soup1 = soup.select('div.job-list>ul>li')
    for s in soup1:
        name = s.select('span.job-name')[0].text  # job title
        site = s.select('span.job-area')[0].text  # work location
        salary = s.select('span.red')[0].text  # salary
        tmp = str(s.find('span', {'class': 'red'}).find_next_sibling())  # experience requirement
        experience = re.findall('<p>(.*?)<', tmp, re.S)[0]
        education = re.findall('</em>(.*?)<', tmp, re.S)[0]  # education
        tags = s.select('div.tags')[0].text.strip().replace("\n", " / ")  # skill tags
        href = 'https://www.zhipin.com' + s.select('span.job-name')[0].a.get('href')  # detail link
        company = s.select('div.company-text>h3')[0].text  # hiring company
        contacts = s.select('div.info-publis>h3')[0].next_element  # contact person
        contacts = "".join([i for i in contacts][0:1])
        if "..." in company:
            # Company name is truncated on the list page; take the full name from the detail page
            drive.get(href)
            wait = WebDriverWait(drive, 100)
            wait.until(EC.presence_of_element_located((By.XPATH, '//div[@class="job-location-map js-open-map"]')))
            company = drive.find_element(By.XPATH, '//div[@class="job-sec"]/div[@class="name"]').text
        data = [name, site, salary, experience, education, tags, company, contacts, href]
        print(data)
        datalist.append(data)
        time.sleep(2)
def save_as_csv():
    file = "%s.csv" % filename
    # If the CSV does not exist yet, create it and write the header row first
    if os.path.exists(file) == False:
        with open(file, 'w+', encoding='UTF-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(list(colname))
            writer.writerows(datalist)
    else:
        # Otherwise append the new rows without repeating the header
        with open(file, 'a', encoding='UTF-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(datalist)
def save_as_excel():
    file = "%s.xlsx" % filename
    # Check whether the workbook already exists
    if os.path.exists(file) == False:  # if not, create a new workbook with xlsxwriter
        workbook = xlsxwriter.Workbook(file)
        worksheet = workbook.add_worksheet()
        col = colname
        # Write the header row
        for i in range(0, len(col)):
            worksheet.write(0, i, col[i])
        # Write the data rows, starting from row 2 (below the header)
        i = 2
        for data in datalist:
            j = 0
            for d in data:
                worksheet.write('{}{}'.format(chr(ord('A') + j), i), d)
                j += 1
            i += 1
        workbook.close()
    else:  # if it exists, load it with openpyxl and append the rows
        wb = openpyxl.load_workbook(file)
        ws = wb.active
        for data in datalist:
            ws.append(data)
        wb.save(file)
        wb.close()
def save_as_MongoDB():
    client = pymongo.MongoClient("mongodb+srv://HJY:hong12345@cluster0.nhhtz.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
    # Select (or create) the database
    mydb = client['works']
    # Select (or create) the collection
    mycollection = mydb.boss_zhipin
    for data in datalist:
        data = dict(zip(list(colname), data))
        # Skip records whose detail link is already stored
        if mycollection.find_one({'详情链接': data['详情链接']}) != None:
            pass
        else:
            mycollection.insert_one(data)
    client.close()
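# Alternative sketch (an assumption, not the original logic): the find_one / insert_one
# pair above could be collapsed into a single upsert keyed on the detail link, e.g.
#     mycollection.update_one({'详情链接': data['详情链接']}, {'$set': data}, upsert=True)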
def save_as_redis():
    client = redis.Redis(host="localhost", port=6379, decode_responses=True, db=0)
    x = 1
    for data in datalist:
        data = dict(zip(list(colname), data))
        # hmset() is deprecated in redis-py 3.x; hset() with mapping= stores the same hash
        client.hset(f'job:{x}', mapping=data)
        x = x + 1
    client.close()
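# Hedged verification sketch (an assumption, not called by the script): read one hash
# back from Redis to confirm what save_as_redis() wrote.
def check_redis_record(key='job:1'):
    client = redis.Redis(host="localhost", port=6379, decode_responses=True, db=0)
    record = client.hgetall(key)  # returns an empty dict if the key does not exist
    client.close()
    print(record)
    return record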
if __name__ == '__main__':
    # Holds every scraped record
    datalist = []
    # Output file name (without extension)
    filename = "BossRecruitCondition"
    # Column names
    colname = '职位名称', '工作地点', '薪资', '工作经验要求', '学历', '技能标签', '招聘单位', '联系人', '详情链接'
    for page in range(1, 7):
        # 1. Get the cookie with selenium (get_cookies), or open a browser manually and copy it
        # proxies, proxy = get_proxy()
        options = webdriver.ChromeOptions()
        # options.add_argument(('--proxy-server=' + proxy))
        options.add_argument('user-agent=' + UserAgent().Chrome)
        # Pass the ChromeOptions defined above to the driver used for the detail pages
        drive = webdriver.Chrome(options=options)
        url = "https://www.zhipin.com/c101230100/?query=%E5%A4%A7%E6%95%B0%E6%8D%AE" + "&page=" + str(
            page) + "&ka=page-" + str(page)
        print(url)
        # cookie = get_cookies()
        # print(cookie)
        # 2. Request the list page with requests or urllib
        # Cookie copied from a logged-in browser session
        cookie = "lastCity=101230100; wd_guid=9c5a6b15-0e51-4924-9bd6-3c482a2082bd; historyState=state; _bl_uid=ILk0Rweka3UitelXRmydtgzmy1jU; acw_tc=0bdd34c616411089551044925e0194cba6d7b0cf764ad31fd698e99bc6faf7; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1638947726,1639217413,1639230417,1641108957; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1641108957; __c=1641108957; __g=-; __a=96498494.1637336807.1639230418.1641108957.326.12.1.198; __zp_stoken__=4ddadKR1TC38haGsDHktwKRNcZyNXWAYNT0E3C2IOc0EPUEFFW10EZzUhOxR8RmFlPBx0Olcfd30xIX43FnllISIJEEtGYic1KD5aeChEPj5nezBePExsTVlKT3ZqNhgufiVGTgw/dgV4ZXo="
        headers = {
            'user-agent': UserAgent().Chrome,
            'cookie': cookie,
            "referer": "https://www.zhipin.com/c101230100/?query=%E5%A4%A7%E6%95%B0%E6%8D%AE&page=1&ka=page-1",
            'host': 'www.zhipin.com'
        }
        source = use_requests(url)
        # source = use_urllib(url)
        try:
            # 3. Check whether the request was blocked: a verification page is far shorter than a real list page
            if len(source.text) < 10000:
                print(source.url)
            else:
                # 4. If the page loaded normally, parse it with re (or xpath / bs4)
                parse_re(source)
                # parse_xpath(source)
                # parse_bs4(source)
        except Exception as e:
            print(e.args)
            continue
        time.sleep(random.randint(10, 20))
        # 5. Quit the driver used for the detail (second-level) pages
        drive.quit()
    print(datalist)
    # 6. Save the data
    save_as_excel()
    save_as_csv()
    save_as_MongoDB()
    save_as_redis()
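# Hedged post-run check (an assumption, not part of the original script): load the CSV
# written by save_as_csv() with pandas (imported above but otherwise unused) and preview
# the first few rows.
def preview_csv(path="BossRecruitCondition.csv"):
    df = pd.read_csv(path, encoding='UTF-8-sig')
    print(df.head())
    return df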
Crawl results:

