国企、互联网大厂招聘平台的招聘信息爬取

国企

国资央企招聘平台集成了众多国企央企的招聘。写一个脚本实现一下各个岗位的招聘

接口

当抓包的时候发现,这些岗位信息都是通过接口发布的。因此可以通过获取这个接口的数据爬取。
image

编写代码:

'''
基于国资央企招聘平台的招聘信息。
https://cujiuye.iguopin.com/
'''


import requests
class get_guoqi_job():
    def __init__(self):
        self.url = "https://gp-api.iguopin.com/api/jobs/v1/list"

        self.headers = {
            "Content-Type": "application/json;charset=UTF-8",
            "Accept": "application/json, text/plain, */*",
            "Device": "pc",
            "Subsite": "cujiuye",
            "Version": "5.0.0",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
        }

    '''
    默认关键词 keyword = 网络安全 信息安全,工作数量爬取最大为200,页数为1,如果工作数量大于等于200,将page改为2再次爬取.
    直接调用 get_guoqi_job()
    关键词调用:
    get_guoqi_job('销售‘)
    '''

    def __call__(self,keyword='网络安全 信息安全',job_num=200 , page=1):
        data = {
            "page": page,
            "page_size": job_num,
            "keyword": keyword,
            "nature": ["115xW5oQ"]
        }

        response = requests.post(self.url, json=data, headers=self.headers)


        jobs = []
        # 打印响应的 JSON 内容
        res_json = response.json()

        data = res_json['data']
        # print(data)
        data_list = data['list']
        print(f'-------------有{len(data_list)}个相关{keyword}岗位正在校招-------------')
        num =1
        for company in data_list:
            job_id = company['job_id']
            job_url = 'https://www.iguopin.com/job/detail?id=' + str(job_id)
            job_name = company['job_name']
            company_name = company['company_name']
            salary = str(company['min_wage']) + '-' + str(company['max_wage'])
            education_cn = company['education_cn']
            contents = company['contents']
            area_cn = company['district_list'][0]['area_cn']
            start_end_time = company['start_time'] + '——' + company['end_time']
            company_info = company['company_info']['nature_cn']
            print('岗位:',job_name, '工作地点:', area_cn, '薪资:', salary, education_cn)
            print('公司:', company_name , company_info)
            print('岗位链接:', job_url)
            print('招聘时间:', start_end_time)
            print('职位要求:', contents)
            print(f'------------------------------------------------------{num}')
            num +=1
            job = {
                '岗位' : job_name,
                '工作地点': area_cn,
                '薪资': salary,
                '学历': education_cn,
                '公司信息': company_name + ' ' + company_info,
                '岗位链接:': job_url,
                '招聘时间':start_end_time,
                '职位要求': contents

            }
            jobs.append(job)
        return jobs

if __name__ == '__main__':

    get = get_guoqi_job()
    get()

效果:
image

阿里巴巴

import requests
import json
def get_alibaba_job():
    url = 'https://talent-holding.alibaba.com/position/search?_csrf=95fe7afd-a683-428d-94ea-4d8abad74ee9'
    headers = {
        'Host': 'talent-holding.alibaba.com',
        'Cookie': (
            'cna=FbAqHw6CgFYBASQOBFzhQPnx; '
            'xlly_s=1; '
            'XSRF-TOKEN=95fe7afd-a683-428d-94ea-4d8abad74ee9; '
            'SESSION=MTc4NUU4NzM2QzY2N0YyMUMzMzBFNkI0QjA2RjBCQkU=; '
            'prefered-lang=zh; '
            'tfstk=fu2mfimSNSlXio-Oo8Df1UOzqVC-GEMsAPptWA3Na4uWBiaOc5mZyruxBNNYsF0uSAEA1sGiI44skjHVlA2gfkvxuAHx7OqQoqhT0PBi7kMq1S3tkVDgfyjRJ9BLcoMs_wQdpXPe1UkB_IJwWLRrYsgg0hWLcotymFSLa9di-pihjVzZgbkrD4o23qzZ44uSfc8qbKSu44iZQKoq0Qorfc-27AljNsubQ8yPoHCMjS0vJ8nmiVvtrd0gUphj8suyBd3SXjVY04vwQ8V5DeFEu6t-5x3bfm41dKMqsSzm4AXMUx4b0RluBaBKikoa5xePbIuUMrHz3WxwQuDmufejGEbo8-ZzdYlRLMqzFrEbElKNQ0UKzowqI9SInxuqE0wOPKuga5yxGATVoArobgzpaB8judiPX8R61joSq21dDf0zvvBmjgjkTGGqVmgdqgA61joSVQslqB5qg0i-u; '
            'isg=BNDQjZ-KgvBCWF70uIVC5vNToR4imbTj_VG2LMqhfCvABXKvcqsWcRHz2c3l02y7'
        ),
        'Content-Length': '200',
        'Sec-Ch-Ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
        'Accept': 'application/json, text/plain, */*',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Bx-V': '2.5.11',
        'Sec-Ch-Ua-Mobile': '?0',
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/127.0.0.0 Safari/537.36'
        ),
        'Content-Type': 'application/json',
        'Origin': 'https://talent-holding.alibaba.com',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://talent-holding.alibaba.com/campus/position-list?campusType=internship&lang=zh',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Priority': 'u=1, i'
    }

    data = {
        "channel": "campus_group_official_site",
        "language": "zh",
        "pageSize": 100,
        "batchId": "",
        "subCategories": "",
        "regions": "",
        "customDeptCode": "",
        "corpCode": "",
        "pageIndex": 1,
        "key": "",
        "categoryType": "internship"
    }

    response = requests.post(url, headers=headers, json=data)

    res = response.json()

    datas = res['content']['datas']
    jobs=[]
    num =1
    print(f'------阿里巴巴招聘实习岗位有{len(datas)}个,可能是安全岗位的如下--------------')
    for data in datas:
        job_name = data['name']
        job_base = data['workLocations']
        job_requir = data['requirement']
        job_descrip = data['description']
        job_id = data['id']
        job_url = 'https://talent-holding.alibaba.com/campus/position-detail?lang=zh&positionId=' + str(job_id)

        job ={

            '岗位名称': job_name,
            '岗位链接': job_url,
            '工作地点': job_base,
            '岗位要求': job_requir,
            '岗位介绍': job_descrip,
        }
        if '安全' in job_name or '网络' in job_name:
            for k,v in job.items():
                print(k,v)
            print(f'------------------------------------------------{num}')
            num += 1
        jobs.append(job)


    return jobs

if __name__ == '__main__':
    get_alibaba_job()

效果:
image

字节跳动

接口爬取有token验证,会过期不稳定,采用selenium爬取。模拟浏览器打开网址,再抓取最终的源码进行爬取。

from selenium import webdriver
from bs4 import BeautifulSoup

def get_ziji_job():
    broser = webdriver.Chrome()
    keyword = '网络%20安全%20信息'
    broser.get(f'https://jobs.bytedance.com/campus/position/list?keywords={keyword}&category=&location=&project=&type=&job_hot_flag=&current=1&limit=100&functionCategory=&tag=')
    content = broser.page_source
    soup = BeautifulSoup(content,'html.parser')
    div = soup.find_all('div', class_ = 'listItems__1q9i5')[0]
    a_s = div.find_all('a')
    jobs = []
    num =1
    print(f'-------字节跳动共有{len(a_s)}个安全相关岗位正在招聘------------')
    for a in a_s:
        url_ = 'https://jobs.bytedance.com'+a['href']
        job_name = a.find_all('span', class_='content__3ZUKJ clamp-content')[0].string
        job_base = a.find_all('div', class_ = 'subTitle__3sRa3 positionItem-subTitle')[0].find_all('span')[0].string
        job_descrip = a.find_all('div', class_='jobDesc__3ZDgU positionItem-jobDesc')[0].string
        # print(url_,job_name, job_base, job_descrip)
        job = {
            '岗位名称': job_name,
            '岗位链接': url_,
            '岗位地点': job_base,
            '岗位描述': job_descrip
        }
        jobs.append(job)
        for k,v in job.items():
            print(k,v)
        print(f'----------------------------------{num}')
        num+=1
        return jobs
if __name__ == '__main__':
    jobs = get_ziji_job()
    print(jobs)

也有缺点就是需要打开浏览器。速度没requests快。

posted @ 2024-07-25 19:15  云岛夜川川  阅读(237)  评论(0)    收藏  举报