
[Web Scraper] Project: Boss直聘

import os, re, csv, json, time, random
import requests, urllib
import redis, pymongo
import xlsxwriter, openpyxl
import pandas as pd
from urllib import request
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from Experiments.create_proxy_auth_extension import create_proxy_auth_extension

# Extract: job title, work location, salary, experience requirement, education, skill tags, detail link, hiring company, and contact person


def get_proxy():
    # Proxy extraction API: returns one proxy IP per call
    api_url = "http://dps.kdlapi.com/api/getdps/?orderid=923923091789065&num=1&pt=1&sep=1&signature=fb2rysmvahtgud51wx36y3zy0guhpk30&dedup=1&whitelist=1"
    # The API returns the proxy IP as plain text
    proxy_ip = requests.get(api_url).text
    print(proxy_ip)
    # Username/password authentication for the proxy
    username = "13645"
    password = "bfnxkoxg"
    proxies = {
        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy_ip},
        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy_ip}
    }
    return proxies, proxy_ip
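
# Optional sanity check (a minimal sketch, not part of the original flow): route one
# request through the returned proxy to confirm it works. httpbin.org/ip is just an
# illustrative echo endpoint, and check_proxy is a hypothetical helper name.
def check_proxy():
    proxies, proxy_ip = get_proxy()
    resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)
    print(proxy_ip, "->", resp.text)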

def get_cookies():
    # JS snippet that hides the navigator.webdriver flag to reduce bot detection
    script = '''
    Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined
    })
    '''
    browser = webdriver.Chrome(options=options)
    browser.get('http://httpbin.org/get')
    time.sleep(2)
    browser.delete_all_cookies()
    # Register the stealth script so it runs before every new document loads
    browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": script})
    browser.get("https://www.zhipin.com/c101230100/?query=%E5%A4%A7%E6%95%B0%E6%8D%AE&page=7&ka=page-7")
    cookie = {}
    # Wait until the search box is present, i.e. the verification page has been passed
    wait = WebDriverWait(browser, 200)
    wait.until(EC.presence_of_element_located((By.XPATH, '//p[@class="ipt-wrap"]/input')))
    for i in browser.get_cookies():
        cookie[i["name"]] = i["value"]

    cookie = ";".join(['%s=%s' % (i, cookie[i]) for i in cookie])
    browser.quit()
    return cookie


def use_requests(url):
    req = requests.get(url, headers=headers)
    req.encoding = 'utf-8'
    return req

def use_urllib(url):
    req = urllib.request.Request(url, headers=headers)
    source = urllib.request.urlopen(req)
    return source


def parse_re(source):
    source = source.text
    txt = re.findall(r'<div class="job-list">(.*?)</ul>', source, re.S)[0]
    li = re.findall('<li>(.*?)</li>', txt, re.S)

    for i in li:
        # 1. Detail link
        href = "https://www.zhipin.com" + re.search(r'<span class="job-name"><a href="(.*?)"', i).group(1)

        # 2. Job title
        name = re.search(r'<span class="job-name">.*?title="(.*?)" target', i).group(1)

        # 3. Work location (pulled from the detail page, which carries the full address)
        # site = re.search('<span class="job-area">(.*?)</span>', i).group(1)
        drive.get(href)
        wait = WebDriverWait(drive, 100)
        wait.until(EC.presence_of_element_located((By.XPATH, '//div[@class="job-location-map js-open-map"]')))
        site = drive.find_element(By.XPATH, '//div[@class="job-location-map js-open-map"]').get_attribute('data-content')

        # 4. Salary
        salary = re.search('<span class="red">(.*?)</span>', i).group(1)
        # 5. Experience requirement
        experience = re.search(r'<div class="job-limit clearfix">.*?<p>(.*?)<em class="vline">', i, re.S).group(1)

        # 6. Education
        tmp = re.search(r'<span class="red">.*?</span>(.*?)<div class="info-publis">', i, re.S).group(1)
        tmp = re.sub(r"<.*?>", " ", tmp).strip().split('  ')

        if len(tmp) != 2:
            education = tmp[-1]
            experience = " ".join(tmp[0:2])
        else:
            education = "/".join(tmp[1:])

        # 7. Skill tags
        tags = re.findall(r'<span class="tag-item">(.*?)</span>', i)
        tags = " / ".join(tags)

        # 8. Hiring company
        try:
            company = re.search(r'<div class="company-text">.*?target="_blank">(.*?)</a></h3>', i, re.S).group(1)
        except:
            company = re.search(r'<div class="company-text">.*?custompage" >(.*?)</a></h3>', i, re.S).group(1)

        # The list page truncates long company names; fall back to the detail page
        if "..." in company:
            company = drive.find_element(By.XPATH, '//div[@class="job-sec"]/div[@class="name"]').text

        # 9. Contact person
        contacts = re.search(r'<img class="icon-chat" src=".*?"/>(.*?)<em', i, re.S).group(1)
        data = [name, site, salary, experience, education, tags, company, contacts, href]
        print(data)
        datalist.append(data)
        time.sleep(random.randint(6, 13))

def parse_xpath(source):
    # etree.HTML expects markup text, so parse the response body
    root_elem = etree.HTML(source.text)
    main_text = root_elem.xpath('//div[@id="main"]/div/div[3]/ul')
    name = main_text[0].xpath('li/div/div[1]/div[1]/div/div[1]/span[1]/a//text()')
    site = main_text[0].xpath('li/div/div[1]/div[1]/div/div[1]/span[2]/span//text()')
    salary = main_text[0].xpath('li/div/div[1]/div[1]/div/div[2]/span//text()')
    experience = main_text[0].xpath('li/div/div[1]/div[1]/div/div[2]/p//text()[1]')
    education = main_text[0].xpath('li/div/div[1]/div[1]/div/div[2]/p//text()[2]')
    contacts = main_text[0].xpath('li/div/div[1]/div[1]/div/div[2]/div/h3//text()[1]')
    company = main_text[0].xpath('li/div/div[1]/div[2]/div/h3/a//text()')
    tags = [' / '.join(node.itertext()).replace(" ", "").lstrip().rstrip() for node in
            main_text[0].xpath('li/div/div[2]/div[1]')]
    href_tmp = main_text[0].xpath('li/div/div[1]/div[1]/div/div[1]/span/a/@href')
    href = ['https://www.zhipin.com' + href_tmp[i] for i in range(0, len(href_tmp))]

    for i in range(0, len(href)):
        if "..." in company[i]:
            drive.get(href[i])
            time.sleep(2)
            company[i] = drive.find_element(By.XPATH, '//div[@class="job-sec"]/div[@class="name"]').text
        data = [name[i], site[i], salary[i], experience[i], education[i], tags[i], company[i], contacts[i], href[i]]
        print(data)
        datalist.append(data)

def parse_bs4(source):
    soup = BeautifulSoup(source.text, 'html.parser')
    soup1 = soup.select('div.job-list>ul>li')
    for s in soup1:
        name = s.select('span.job-name')[0].text       # job title

        site = s.select('span.job-area')[0].text       # work location

        salary = s.select('span.red')[0].text          # salary

        # Experience requirement and education sit in the sibling right after the salary span
        tmp = str(s.find('span', {'class': 'red'}).find_next_sibling())
        experience = re.findall('<p>(.*?)<', tmp, re.S)[0]
        education = re.findall('</em>(.*?)<', tmp, re.S)[0]

        tags = s.select('div.tags')[0].text.strip().replace("\n", " / ")   # skill tags

        href = 'https://www.zhipin.com' + s.select('span.job-name')[0].a.get('href')  # detail link

        company = s.select('div.company-text>h3')[0].text   # hiring company

        # Contact person: first text node inside the publisher block (skips the chat icon)
        contacts = s.select('div.info-publis>h3')[0].find(string=True)
        contacts = contacts.strip() if contacts else ""

        if "..." in company:
            drive.get(href)
            wait = WebDriverWait(drive, 100)
            wait.until(EC.presence_of_element_located((By.XPATH, '//div[@class="job-location-map js-open-map"]')))
            company = drive.find_element_by_xpath('//div[@class="job-sec"]/div[@class="name"]').text

        data = [name, site, salary, experience, education, tags, company, contacts, href]
        print(data)
        datalist.append(data)
        time.sleep(2)


def save_as_csv():
    file = "%s.csv" % filename
    if not os.path.exists(file):
        # First run: create the csv and write the header row
        with open(file, 'w+', encoding='UTF-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(list(colname))
            writer.writerows(datalist)
    else:
        # File already exists: append the new rows only
        with open(file, 'a', encoding='UTF-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(datalist)

def save_as_excel():
    file = "%s.xlsx" % filename
    # If the workbook does not exist yet, create it and write the header row
    if not os.path.exists(file):
        workbook = xlsxwriter.Workbook(file)
        worksheet = workbook.add_worksheet()
        col = colname
        # Write the column names into the first row
        for i in range(0, len(col)):
            worksheet.write(0, i, col[i])
        # Write the data rows, starting from row 2 in A1-style cell references
        i = 2
        for data in datalist:
            j = 0
            for d in data:
                worksheet.write('{}{}'.format(chr(ord('A') + j), i), d)
                j += 1
            i += 1
        workbook.close()
    else:
        # Otherwise load the existing workbook and append the rows
        wb = openpyxl.load_workbook(file)
        ws = wb.active
        for data in datalist:
            ws.append(data)
        wb.save(file)
        wb.close()
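
# Alternative sketch: pandas is imported above but never used; the same rows could be
# written in one call with DataFrame.to_excel (openpyxl serves as the engine). Note this
# overwrites the file instead of appending. Hypothetical helper, not called below.
def save_as_excel_pandas():
    df = pd.DataFrame(datalist, columns=list(colname))
    # index=False keeps the sheet layout identical to the xlsxwriter version
    df.to_excel("%s.xlsx" % filename, index=False)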

def save_as_MongoDB():
    client = pymongo.MongoClient("mongodb+srv://HJY:hong12345@cluster0.nhhtz.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
    # Select (lazily create) the database
    mydb = client['works']
    # Select (lazily create) the collection
    mycollection = mydb.boss_zhipin
    for data in datalist:
        data = dict(zip(list(colname), data))
        # Skip records whose detail link is already in the collection
        if mycollection.find_one({'详情链接': data['详情链接']}) is not None:
            pass
        else:
            mycollection.insert_one(data)
    client.close()
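
# Alternative sketch: instead of find_one + insert_one, a single upsert per record lets
# MongoDB deduplicate in one round trip. Hypothetical helper, not called by the main flow.
def save_as_mongodb_upsert():
    client = pymongo.MongoClient("mongodb+srv://HJY:hong12345@cluster0.nhhtz.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
    mycollection = client['works'].boss_zhipin
    for data in datalist:
        doc = dict(zip(list(colname), data))
        mycollection.update_one(
            {'详情链接': doc['详情链接']},   # match on the detail link
            {'$setOnInsert': doc},           # fields are only written when a new document is inserted
            upsert=True
        )
    client.close()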

def save_as_redis():
    client = redis.Redis(host="localhost", port=6379, decode_responses=True, db=0)
    x = 1
    for data in datalist:
        data = dict(zip(list(colname), data))
        # hset with mapping= stores the whole record as one hash (hmset is deprecated)
        client.hset(f'job:{x}', mapping=data)
        x = x + 1
    client.close()
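
# Quick check (sketch): read the stored hashes back with hgetall. Assumes the same
# local Redis instance and the job:<n> key pattern used above; helper name is illustrative.
def show_redis_jobs():
    client = redis.Redis(host="localhost", port=6379, decode_responses=True, db=0)
    for key in sorted(client.keys('job:*')):
        print(key, client.hgetall(key))
    client.close()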


if __name__ == '__main__':
    # Holds all scraped rows
    datalist = []
    # Output file name (used for the csv / xlsx files)
    filename = "BossRecruitCondition"
    # Column names (kept in Chinese; they are also used as field keys when saving)
    colname = '职位名称', '工作地点', '薪资', '工作经验要求', '学历', '技能标签', '招聘单位', '联系人', '详情链接'

    for page in range(1, 7):
        # 1. Get a cookie with selenium (get_cookies), or copy one manually from a logged-in browser
        #proxies, proxy = get_proxy()
        options = webdriver.ChromeOptions()
        #options.add_argument(('--proxy-server=' + proxy))
        options.add_argument('user-agent='+UserAgent().Chrome)
        # Pass the configured options so the user-agent (and optional proxy) take effect
        drive = webdriver.Chrome(options=options)
        url = "https://www.zhipin.com/c101230100/?query=%E5%A4%A7%E6%95%B0%E6%8D%AE" + "&page=" + str(
            page) + "&ka=page-" + str(page)
        print(url)
        #cookie = get_cookies()
        #print(cookie)
        # 2. Fetch the list page with requests (or urllib)
        cookie="lastCity=101230100; wd_guid=9c5a6b15-0e51-4924-9bd6-3c482a2082bd; historyState=state; _bl_uid=ILk0Rweka3UitelXRmydtgzmy1jU; acw_tc=0bdd34c616411089551044925e0194cba6d7b0cf764ad31fd698e99bc6faf7; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1638947726,1639217413,1639230417,1641108957; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1641108957; __c=1641108957; __g=-; __a=96498494.1637336807.1639230418.1641108957.326.12.1.198; __zp_stoken__=4ddadKR1TC38haGsDHktwKRNcZyNXWAYNT0E3C2IOc0EPUEFFW10EZzUhOxR8RmFlPBx0Olcfd30xIX43FnllISIJEEtGYic1KD5aeChEPj5nezBePExsTVlKT3ZqNhgufiVGTgw/dgV4ZXo="
        headers = {
            'user-agent': UserAgent().Chrome,
            'cookie': cookie,
            "referer": "https://www.zhipin.com/c101230100/?query=%E5%A4%A7%E6%95%B0%E6%8D%AE&page=1&ka=page-1",
            'host': 'www.zhipin.com'
        }
        source = use_requests(url)
        # source=use_urllib(url)

        try:
            # 3. Check whether we were blocked: a verification page is much shorter than a real list page
            if len(source.text) < 10000:
                print(source.url)
            else:
                # 4. If the page loaded normally, parse it with re (xpath and bs4 versions are available too)
                parse_re(source)
                #parse_xpath(source)
                #parse_bs4(source)
        except Exception as e:
            print(e.args)
            continue
        time.sleep(random.randint(10,20))
        # 5. Quit the browser used for the detail pages
        drive.quit()
    print(datalist)

    # 6. Save the data
    save_as_excel()
    save_as_csv()
    save_as_MongoDB()
    save_as_redis()

Scraping result:
(screenshot of the scraped data)

posted @ 2024-04-05 23:37  踩坑大王