#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# 爬取拉勾网信息 — scrape company and job-posting info from lagou.com

import requests
import json
from random import randint, choice
import pymongo
from time import sleep
from multiprocessing import Process, JoinableQueue as Queue

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# 采集公司信息 (collect company info)

def company(q):
    """Worker process: scrape company pages from lagou.com.

    Consumes company ids from *q*, loads each company's page with a
    Selenium driver and stores the scraped fields in the ``company1``
    collection. Runs until the queue is observed empty.

    :param q: JoinableQueue of company ids
    """
    # br = get_chrome()
    br = login_lagou(20)
    br.set_window_rect(602, 0, 600, 800)
    db = get_mongodb()
    # Scraped records go into the company1 collection.
    company_table = db.company1

    while True:
        if q.empty():
            break
        company_id = None
        try:
            company_id = q.get()
            url = 'https://www.lagou.com/gongsi/' + str(company_id) + '.html'
            br.get(url)
            company_info = {}
            company_info['name'] = br.find_element_by_xpath(
                '/html/body/div[3]/div/div/div[1]/h1/a').text
            company_info['job_num'] = br.find_element_by_xpath(
                '/html/body/div[3]/div/div/div[2]/ul/li[1]/strong').text
            company_info['efficiency'] = br.find_element_by_xpath(
                '/html/body/div[3]/div/div/div[2]/ul/li[2]/strong').text
            company_info['time_consuming'] = br.find_element_by_xpath(
                '/html/body/div[3]/div/div/div[2]/ul/li[3]/strong').text
            company_info['last_login'] = br.find_element_by_xpath(
                '/html/body/div[3]/div/div/div[2]/ul/li[5]/strong').text
            company_info['introduction'] = br.find_element_by_xpath(
                '/html/body/div[6]/div[1]/div/div[2]/div[2]/div[2]/span[1]').text
            # NOTE(review): key looks like a typo for 'financing'; kept as-is
            # because existing documents in the collection already use it.
            company_info['inancing'] = br.find_element_by_xpath(
                '/html/body/div[6]/div[2]/div[1]/div[2]/ul/li[2]/span').text
            company_info['scale'] = br.find_element_by_xpath(
                '/html/body/div[6]/div[2]/div[1]/div[2]/ul/li[3]/span').text
            company_table.insert(company_info)

            # Scrape the company's reviews (disabled).
            # company_comment(company_id, db, br)
            print('%d公司信息已采集入库' % company_id)
            q.task_done()
        except Exception as e:
            print('遇到异常', e)
            if company_id is not None:
                # Re-queue the failed id, and balance the earlier get() with
                # task_done() — otherwise q.join() below would hang forever.
                q.put(company_id)
                q.task_done()
            sleep(10)
        sleep(1)
    br.close()
    q.join()

# 采集公司的评论信息 (collect company review info)

def company_comment(id, db, br):
    """Open the interview-experience page for one company.

    Currently only navigates the driver to the page; parsing/storage of
    the reviews is not implemented yet.

    :param id: 公司id (company id)
    :param db: database handle (reserved for future use, currently unused)
    :param br: Selenium browser driver
    :return: None
    """
    url = 'https://www.lagou.com/gongsi/interviewExperiences.html?companyId=' + str(id)
    br.get(url)

# 采集工作的具体信息 (collect job detail info)

def work(q):
    """Worker process: scrape job-posting detail pages from lagou.com.

    Consumes position ids from *q*, loads each posting and stores its
    description text in the ``job1`` collection. Runs until the queue is
    observed empty.

    :param q: JoinableQueue of job (position) ids
    """
    # br = get_chrome()
    br = login_lagou(20)
    br.set_window_rect(101, 0, 600, 600)
    db = get_mongodb()
    job_table = db.job1
    while True:
        if q.empty():
            break
        job_id = None
        try:
            job_id = q.get()
            url = 'https://www.lagou.com/jobs/' + str(job_id) + '.html'
            br.get(url)
            job = {'id': job_id}
            # Full text of the job-description block on the page.
            content = br.find_element_by_xpath('/html/body/div[5]/div[1]/dl[1]/dd[2]').text
            job['content'] = content

            job_table.insert(job)
            print('%d招聘启事具体内容已入库' % job_id)
            q.task_done()
        except Exception as e:
            print('遇到异常', e)
            if job_id is not None:
                # Re-queue the failed id and balance the earlier get() with
                # task_done() — otherwise q.join() below would hang forever.
                q.put(job_id)
                q.task_done()
            sleep(10)
        sleep(1)
    br.close()
    q.join()

# 获得一个无界面浏览器驱动 (get a browser driver; headless flags commented out)

def get_chrome():
    """Return a Chrome WebDriver instance.

    Headless flags are left commented out so the window is visible for
    manual captcha/login steps.
    """
    options = Options()
    # options.add_argument('--headless')
    # options.add_argument('--disable-gpu')
    # 'chrome_options=' is deprecated; 'options=' works on selenium >= 3.8.
    br = webdriver.Chrome(options=options)
    return br

# 获取一个mongodb连接对象 (get a MongoDB connection object)

def get_mongodb():
    """Connect to MongoDB and return an authenticated database handle."""
    # SECURITY(review): host and credentials are hard-coded — move them to
    # environment variables / a config file.
    cli = pymongo.MongoClient(host='192.168.12.244', port=27017)
    db = cli.xxx
    # NOTE(review): Database.authenticate() was removed in pymongo 4.x; when
    # upgrading, pass username/password to MongoClient() instead.
    db.authenticate('ss', '123456')
    return db

# 获取cookie信息 (get cookie info)

def get_cookie(br):
    """Visit the lagou homepage and return its cookies as a name->value dict.

    :param br: Selenium browser driver
    :return: dict mapping cookie name to cookie value
    """
    # br = get_chrome()
    br.get('https://www.lagou.com/')
    tmp_cookies = br.get_cookies()
    # Flatten Selenium's list-of-dicts cookie format into a plain mapping
    # usable by requests.
    return {i['name']: i['value'] for i in tmp_cookies}

def login_lagou(sec):
    """Open the lagou.com login form, pre-fill credentials, then wait.

    The sleep gives a human time to solve the captcha / confirm the login
    manually before the driver is handed back to the caller.

    :param sec: seconds to wait after filling in the form
    :return: the (presumably logged-in) browser driver
    """
    br = get_chrome()
    br.get('https://www.lagou.com/frontLogin.do')
    mobi = br.find_element_by_xpath('/html/body/section/div[2]/div[1]/div[2]/form/div[1]/input')
    pwd = br.find_element_by_xpath('/html/body/section/div[2]/div[1]/div[2]/form/div[2]/input')
    # SECURITY(review): account credentials are hard-coded — move them to
    # environment variables / a config file.
    mobi.send_keys('15324818121')
    pwd.send_keys('123456')
    sleep(sec)
    # Spare tabs / auto-submit, kept disabled:
    # br.execute_script('window.open("https://www.lagou.com/")')
    # br.execute_script('window.open("https://www.lagou.com/")')
    # br.find_element_by_xpath('/html/body/section/div[2]/div[1]/div[2]/form/div[5]/input').click()

    return br

# Entry point: spawn the two scraper workers, then page through the
# position-list AJAX endpoint feeding company/position ids into their queues.
if __name__ == '__main__':
    # Queue of company ids waiting to be scraped
    companies = Queue()
    # Queue of job (position) ids waiting to be scraped
    jobs = Queue()

    br = login_lagou(20)
    br.set_window_rect(0, 0, 200, 600)

    # Start one process to scrape company info
    c = Process(target=company, args=(companies,))
    c.start()
    sleep(20)
    # Start one process to scrape job details
    jo = Process(target=work, args=(jobs,))
    jo.start()

    # Request headers for the position-list AJAX endpoint
    header = {
        'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%85%A8%E5%9B%BD',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }

    # A few desktop user agents to rotate between, to look less like a bot
    UAs = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:65.0) Gecko/20100101 Firefox/65.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2 Safari/601.7.7'
    ]

    cookies = get_cookie(br)

    # Get a MongoDB connection object
    db = get_mongodb()

    # Scrape the position list, pages 1..30
    i = 1
    header['User-Agent'] = choice(UAs)

    while True:
        if i > 30:
            break
        data = {'first': 'false', 'pn': i, 'kd': 'Python'}
        resp = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false',
                             data=data, headers=header, cookies=cookies)
        content = json.loads(resp.text)

        try:
            if content['success']:
                # Store the raw position list in the work1 collection
                tab = db.work1
                resultData = content['content']['positionResult']['result']
                tab.insert(resultData)

                for j in resultData:
                    # Queue the company id for the company worker
                    companies.put(j['companyId'])
                    # Queue the position id for the job-detail worker
                    jobs.put(j['positionId'])
        except Exception as e:
            # On failure (typically anti-bot throttling) refresh identity and retry
            print('遇到异常', e, content)
            # Grab a fresh cookie set
            cookies = get_cookie(br)
            header['User-Agent'] = choice(UAs)
            print('更换身份,正在重试')
            # i is incremented below; step back so this page is re-fetched
            i -= 1

        sleep_time = randint(1, 3)
        print('列表第%d页已完成, 打算睡%d秒' % (i, sleep_time), )
        sleep(sleep_time)
        i += 1
    # companies.put(None)
    # jobs.put(None)
    br.close()

    c.join()
    jo.join()
# (blog footer removed: posted @ 2019-03-11 20:45 叶小川)