# Python - 51job crawler

"""
https://www.51job.com/
"""

import requests
import requests.adapters
import string
import urllib.parse
from lxml import etree
import redis
from pyquery import PyQuery as pq
import multiprocessing
import pymongo
import datetime
import xlsxwriter

retries = 5  # 最大尝试次数
redis_key_page = 'redis_key_page'
redis_key_detail = 'redis_key_detail'


def get_url_txt(url, headers, encoding, data=None):
    """Fetch *url* and return the decoded body text, or '' on any failure.

    headers:  dict of HTTP headers (e.g. User-Agent).
    encoding: charset forced onto the response before reading .text.
    data:     optional payload forwarded to the GET request (kept for
              backward compatibility with existing callers).
    """
    ret = ''
    try:
        # Reuse the module-level retry budget instead of a magic number.
        requests.adapters.DEFAULT_RETRIES = retries
        session = requests.session()
        session.keep_alive = False
        try:
            # timeout added so a dead server cannot hang a worker forever
            if data is None:
                response = session.get(url, headers=headers, timeout=30)
            else:
                response = session.get(url, headers=headers, data=data, timeout=30)
            if response.status_code == 200:
                response.encoding = encoding
                ret = response.text
            response.close()
        finally:
            session.close()  # always release the connection pool
    except Exception as e:
        print(e)
    return ret


def encode_url(keys):
    """Double-percent-encode *keys* for 51job search URLs.

    Non-ASCII characters are first UTF-8 percent-encoded, then every '%'
    is escaped again as '%25' (the site expects double-encoded keywords),
    e.g. '逆向' -> '%25E9%2580%2586%25E5%2590%2591'.  Printable ASCII
    passes through unchanged.  Returns '' on error.
    """
    try:
        # First pass: non-printable/non-ASCII -> '%XX' escapes; printable
        # ASCII is kept as-is because of safe=string.printable.
        keys_str = urllib.parse.quote(keys, safe=string.printable)
        # Second pass: escape the '%' itself.  The old split('%')/rejoin
        # loop wrongly prefixed '%25' onto a leading printable segment
        # (e.g. 'abc' -> '%25abc'); a plain replace keeps ASCII keywords
        # intact while producing identical output for non-ASCII input.
        return keys_str.replace('%', '%25')
    except Exception as e:
        print(e)
    return ''


def get_page_count(keys):
    """Return how many result pages the site reports for *keys* (0 on failure)."""
    try:
        base = 'https://search.51job.com/list/030000,000000,0000,00,9,99,'
        tail = '.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
        # page 1 is enough: the pager element carries the total page count
        url = base + encode_url(keys) + ',2,' + str(1) + tail
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        html = get_url_txt(url=url, headers=headers, encoding='gbk')
        tree = etree.HTML(html)
        # pager text looks like '1 / 23'; keep the part after '/'
        pager = ''.join(tree.xpath('//*[@id="resultList"]/div[2]/div[5]//text()'))
        return int(pager.split('/')[1].replace(' ', ''))
    except Exception as e:
        print(e)
    return 0


def flush_page_href():
    """Drop the pending page-URL queue (redis db 0)."""
    try:
        pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
        redis.StrictRedis(connection_pool=pool).delete(redis_key_page)
    except Exception as e:
        print(e)

def get_page_href(keys):
    """Queue every search-result page URL for *keys* into redis db 0.

    Each entry is '0|<url>' where the leading number is a retry counter.
    """
    try:
        flush_page_href()
        total = get_page_count(keys)
        encoded = encode_url(keys)
        prefix = 'https://search.51job.com/list/030000,000000,0000,00,9,99,'
        suffix = '.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
        conn = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
        for page in range(1, total + 1):
            page_url = prefix + encoded + ',2,' + str(page) + suffix
            conn.rpush(redis_key_page, '0|' + page_url)
    except Exception as e:
        print(e)


def flush_detail_href():
    """Drop the pending job-detail queue (redis db 1)."""
    try:
        pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=1)
        redis.StrictRedis(connection_pool=pool).delete(redis_key_detail)
    except Exception as e:
        print(e)


def get_detail_href_(redis_value):
    """Scrape one search-result page and queue each job-detail href (redis db 1).

    redis_value is '<retry_count>|<page_url>'.  When the fetch comes back
    empty the page URL is re-queued on db 0 with retry_count + 1, up to
    the module-level `retries` budget.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        encoding = 'gbk'
        # split once so a '|' inside the URL cannot truncate it
        count_str, page_url = redis_value.split('|', 1)
        conn_count = int(count_str)
        text = get_url_txt(url=page_url, headers=headers, encoding=encoding)
        if len(text):
            detail_conn = redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1))
            for cell in pq(text)('.t1'):
                href = pq(cell).find('a').attr('href')
                # BUG FIX: the old check skipped only when BOTH title and
                # href were None, so a row with a title but no href crashed
                # on '0|' + None.  Only href matters for queueing.
                if href is None:
                    continue
                detail_conn.rpush(redis_key_detail, '0|' + href)
        elif conn_count < retries:
            redis.StrictRedis(connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)).rpush(
                redis_key_page, str(conn_count + 1) + '|' + page_url)
    except Exception as e:
        print(e)


def get_detail_href():
    """Drain the page queue (redis db 0) with a process pool.

    Workers push discovered detail hrefs to db 1; failed pages get pushed
    back onto db 0, which is why the outer loop re-checks the queue length.
    """
    try:
        flush_detail_href()
        conn = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
        while conn.llen(redis_key_page) > 0:
            pool = multiprocessing.Pool()
            while True:
                raw = conn.lpop(redis_key_page)
                if raw is None:
                    break
                value = raw.decode(encoding='utf8', errors='ignore')
                pool.apply_async(get_detail_href_, (value,))
            pool.close()
            pool.join()
    except Exception as e:
        print(e)


def get_detail_info_(redis_value):
    """Scrape one job-detail page, store it in MongoDB, and return the fields.

    redis_value is '<retry_count>|<detail_url>'.  Empty fetches are
    re-queued on redis db 1 with an incremented retry counter, up to the
    module-level `retries` budget.  Returns the tuple
    (company, position, salary, require, contack, introduce) — all empty
    strings when nothing was scraped.
    """
    company = position = salary = require = contack = introduce = ''
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        parts = redis_value.split('|')
        conn_count = int(parts[0])
        detail_href = parts[1]
        text = get_url_txt(url=detail_href, headers=headers, encoding='gbk')
        if len(text):
            tree = etree.HTML(text)

            def _joined(path, collapse=False):
                # join all text nodes under *path*; optionally strip CRLFs
                value = ''.join(tree.xpath(path))
                if collapse:
                    value = value.replace('\r\n', '')
                return value.strip()

            company = _joined('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]//text()')
            position = _joined('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1//text()')
            salary = _joined('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong//text()')
            require = _joined('/html/body/div[3]/div[2]/div[3]/div[1]//text()', collapse=True)
            contack = _joined('/html/body/div[3]/div[2]/div[3]/div[2]//text()', collapse=True)
            introduce = _joined('/html/body/div[3]/div[2]/div[3]/div[3]//text()', collapse=True)
            pymongo.MongoClient('localhost:27017')['db']['table'].insert_one(
                {'company': company,
                 'position': position,
                 'salary': salary,
                 'require': require,
                 'contack': contack,
                 'introduce': introduce, })
        elif conn_count < retries:
            redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1)
            ).rpush(redis_key_detail, str(conn_count + 1) + '|' + detail_href)
    except Exception as e:
        print(e)
    return company, position, salary, require, contack, introduce


def get_detail_info():
    """Drain the detail queue (redis db 1) with a process pool.

    Workers persist each job posting to MongoDB; failed fetches get pushed
    back onto db 1, which is why the outer loop re-checks the queue length.
    """
    try:
        conn = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1))
        while conn.llen(redis_key_detail) > 0:
            pool = multiprocessing.Pool()
            while True:
                raw = conn.lpop(redis_key_detail)
                if raw is None:
                    break
                value = raw.decode(encoding='utf8', errors='ignore')
                pool.apply_async(get_detail_info_, (value,))
            pool.close()
            pool.join()
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Typical full crawl (requires network + redis + MongoDB):
    #   pymongo.MongoClient('localhost:27017')['db']['table'].drop()
    #   get_page_href('逆向')
    #   get_detail_href()
    #   get_detail_info()
    # Below: export every scraped record from MongoDB to query.xlsx.
    # Fixes vs. the original: the unused `cols = len(find_one())` line is
    # gone (it crashed with TypeError on an empty collection), a single
    # MongoClient is reused, and the workbook is closed even on error.
    table = pymongo.MongoClient('localhost:27017')['db']['table']
    book = xlsxwriter.Workbook('query.xlsx')
    try:
        sheet = book.add_worksheet('sheet1')
        # one row per document, one column per field
        for row, doc in enumerate(table.find({})):
            sheet.write(row, 0, doc['company'])
            sheet.write(row, 1, doc['position'])
            sheet.write(row, 2, doc['salary'])
            sheet.write(row, 3, doc['require'])
            sheet.write(row, 4, doc['contack'])
            sheet.write(row, 5, doc['introduce'])
    finally:
        book.close()  # always flush and close the workbook
# posted @ 2020-03-15 01:17  N.everever