"""
https://www.51job.com/
"""
import requests
import requests.adapters
import string
import urllib.parse
from lxml import etree
import redis
from pyquery import PyQuery as pq
import multiprocessing
import pymongo
import datetime
import xlsxwriter
retries = 5 # upper bound on the retry counter stored with each queued URL before it is dropped
# Redis key of the queue holding search-result page URLs ("<count>|<url>" entries).
redis_key_page = 'redis_key_page'
# Redis key of the queue holding job-detail page URLs ("<count>|<url>" entries).
redis_key_detail = 'redis_key_detail'
def get_url_txt(url, headers, encoding, data=None, timeout=10):
    """Download *url* with an HTTP GET and return the decoded body.

    Args:
        url: Address to fetch.
        headers: Request headers sent with the GET.
        encoding: Character encoding used to decode the response body.
        data: Optional request body; omitted from the request when ``None``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller forever.

    Returns:
        The response text when the status code is 200, otherwise ``''``.
        Any error is printed and also results in ``''`` so that callers can
        treat an empty string as "fetch failed".
    """
    ret = ''
    try:
        # A fresh session per call; the context manager guarantees it is
        # closed even when the request raises.
        with requests.Session() as session:
            # Retries only take effect when configured on a mounted adapter.
            adapter = requests.adapters.HTTPAdapter(max_retries=5)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            response = session.get(url, headers=headers, data=data, timeout=timeout)
            try:
                if response.status_code == 200:
                    response.encoding = encoding
                    ret = response.text
            finally:
                response.close()
    except Exception as e:
        # Best-effort fetch: report the problem and fall back to ''.
        print(e)
    return ret
def encode_url(keys):
    """Return *keys* double percent-encoded for use in a search URL path.

    Characters outside ``string.printable`` are percent-encoded as UTF-8, and
    every ``'%'`` in that result is then escaped as ``'%25'``. Printable ASCII
    characters are passed through unchanged, so plain ASCII keywords (or an
    ASCII prefix of a mixed keyword) no longer receive a spurious leading
    ``'%25'``.

    Args:
        keys: Search keyword string.

    Returns:
        The encoded keyword, or ``''`` when *keys* cannot be encoded
        (for example when it is not a string); the error is printed.
    """
    try:
        quoted = urllib.parse.quote(keys, safe=string.printable)
        # Escape the percent signs themselves to obtain the second encoding layer.
        return quoted.replace('%', '%25')
    except (TypeError, UnicodeError) as e:
        print(e)
        return ''
def get_page_count(keys):
    """Return the number of search-result pages reported for *keys*.

    The first result page is downloaded and the pager text under
    ``#resultList`` is parsed; the integer after the first ``'/'`` in that
    text is taken as the page count (presumably the text looks like
    "current/total" -- verify against the live page).

    Args:
        keys: Search keyword string.

    Returns:
        The page count, or ``0`` when the page cannot be fetched or parsed
        (parse errors are printed).
    """
    try:
        url = (
            'https://search.51job.com/list/030000,000000,0000,00,9,99,'
            + encode_url(keys)
            + ',2,'
            + str(1)
            # "&degreefrom" must be spelled out: the "&deg" prefix is easily
            # corrupted into a degree sign by HTML entity decoding.
            + '.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99'
              '&degreefrom=99&jobterm=99&companysize=99&providesalary=99'
              '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType='
              '&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
        )
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        text = get_url_txt(url=url, headers=headers, encoding='gbk')
        if not text:
            # Download failed; there is nothing to parse.
            return 0
        tree = etree.HTML(text)
        pager_text = ''.join(tree.xpath('//*[@id="resultList"]/div[2]/div[5]//text()'))
        return int(pager_text.split('/')[1].replace(' ', ''))
    except Exception as e:
        print(e)
        return 0
def flush_page_href():
    """Delete the ``redis_key_page`` key from Redis db 0.

    Any error is printed and otherwise ignored.
    """
    try:
        pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
        client = redis.StrictRedis(connection_pool=pool)
        client.delete(redis_key_page)
    except Exception as err:
        print(err)
def get_page_href(keys):
    """Queue the URL of every search-result page for *keys* in Redis.

    The page queue is cleared first, then one ``'0|<url>'`` entry per result
    page is appended to ``redis_key_page`` in Redis db 0. The leading ``0``
    is the retry counter of the entry.

    Args:
        keys: Search keyword string.

    Errors are printed and otherwise ignored.
    """
    try:
        flush_page_href()
        page_count = get_page_count(keys)
        prefix = ('https://search.51job.com/list/030000,000000,0000,00,9,99,'
                  + encode_url(keys)
                  + ',2,')
        # "&degreefrom" must be spelled out: the "&deg" prefix is easily
        # corrupted into a degree sign by HTML entity decoding.
        suffix = ('.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99'
                  '&degreefrom=99&jobterm=99&companysize=99&providesalary=99'
                  '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType='
                  '&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=')
        entries = ['0|' + prefix + str(page) + suffix
                   for page in range(1, page_count + 1)]
        if entries:
            # One client and one command instead of a new connection pool per page.
            client = redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
            client.rpush(redis_key_page, *entries)
    except Exception as e:
        print(e)
def flush_detail_href():
    """Delete the ``redis_key_detail`` key from Redis db 1.

    Any error is printed and otherwise ignored.
    """
    try:
        pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=1)
        client = redis.StrictRedis(connection_pool=pool)
        client.delete(redis_key_detail)
    except Exception as err:
        print(err)
def get_detail_href_(redis_value):
    """Extract job-detail links from one search-result page.

    Args:
        redis_value: Queue entry of the form ``'<retry count>|<page url>'``.

    When the page downloads, the ``href`` of the link inside every ``.t1``
    element is appended to ``redis_key_detail`` (Redis db 1) as
    ``'0|<href>'``. When the download yields nothing and the retry count is
    still below ``retries``, the page is re-queued on ``redis_key_page``
    (Redis db 0) with the count incremented.

    Errors are printed and otherwise ignored.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        encoding = 'gbk'
        # Split only on the first '|' so a '|' inside the URL is preserved.
        count_text, page_url = redis_value.split('|', 1)
        conn_count = int(count_text)
        text = get_url_txt(url=page_url, headers=headers, encoding=encoding)
        if text:
            # One client for the whole page instead of a new pool per link.
            detail_queue = redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1))
            for node in pq(text)('.t1'):
                href = pq(node).find('a').attr('href')
                if href is None:
                    # Entries without a link (e.g. header cells) cannot be
                    # queued; skipping them keeps the rest of the page.
                    continue
                detail_queue.rpush(redis_key_detail, '0|' + href)
        elif conn_count < retries:
            page_queue = redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
            page_queue.rpush(redis_key_page, str(conn_count + 1) + '|' + page_url)
    except Exception as e:
        print(e)
def get_detail_href():
    """Process every queued search-result page in a pool of worker processes.

    The detail queue is cleared first. Then, while ``redis_key_page``
    (Redis db 0) is non-empty, its entries are popped and handed to
    ``get_detail_href_`` through a ``multiprocessing.Pool``. The outer loop
    repeats because workers may re-queue pages that failed to download.

    Errors are printed and otherwise ignored.
    """
    try:
        flush_detail_href()
        # A single client for all queue operations in this process.
        page_queue = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
        while page_queue.llen(redis_key_page) > 0:
            pool = multiprocessing.Pool()
            try:
                while True:
                    raw = page_queue.lpop(redis_key_page)
                    if raw is None:
                        break
                    redis_value = raw.decode(encoding='utf8', errors='ignore')
                    pool.apply_async(get_detail_href_, (redis_value,))
            finally:
                # Always wait for the workers, even if draining the queue failed.
                pool.close()
                pool.join()
    except Exception as e:
        print(e)
def get_detail_info_(redis_value):
    """Scrape one job-detail page and store the result in MongoDB.

    Args:
        redis_value: Queue entry of the form ``'<retry count>|<detail url>'``.

    Returns:
        A tuple ``(company, position, salary, require, contack, introduce)``
        of strings; every field is ``''`` when the page could not be
        downloaded or an error occurred before it was extracted.

    When the page downloads, the extracted fields are inserted as one
    document into ``db.table`` on ``localhost:27017``. When the download
    yields nothing and the retry count is still below ``retries``, the entry
    is re-queued on ``redis_key_detail`` (Redis db 1) with the count
    incremented. Errors are printed and otherwise ignored.
    """
    company, position, salary, require, contack, introduce = '', '', '', '', '', ''
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        encoding = 'gbk'
        # Split only on the first '|' so a '|' inside the URL is preserved.
        count_text, detail_href = redis_value.split('|', 1)
        conn_count = int(count_text)
        text = get_url_txt(url=detail_href, headers=headers, encoding=encoding)
        if text:
            tree = etree.HTML(text)

            def joined(path):
                # Concatenate all text nodes selected by *path*.
                return ''.join(tree.xpath(path))

            company = joined('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]//text()').strip()
            position = joined('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1//text()').strip()
            salary = joined('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong//text()').strip()
            require = joined('/html/body/div[3]/div[2]/div[3]/div[1]//text()').replace('\r\n', '').strip()
            contack = joined('/html/body/div[3]/div[2]/div[3]/div[2]//text()').replace('\r\n', '').strip()
            introduce = joined('/html/body/div[3]/div[2]/div[3]/div[3]//text()').replace('\r\n', '').strip()
            # Close the client after the insert so connections do not pile up
            # across the many pages handled by one worker.
            with pymongo.MongoClient('localhost:27017') as client:
                client['db']['table'].insert_one(
                    {'company': company,
                     'position': position,
                     'salary': salary,
                     'require': require,
                     'contack': contack,
                     'introduce': introduce, })
        elif conn_count < retries:
            detail_queue = redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1))
            detail_queue.rpush(redis_key_detail, str(conn_count + 1) + '|' + detail_href)
    except Exception as e:
        print(e)
    return company, position, salary, require, contack, introduce
def get_detail_info():
    """Process every queued job-detail page in a pool of worker processes.

    While ``redis_key_detail`` (Redis db 1) is non-empty, its entries are
    popped and handed to ``get_detail_info_`` through a
    ``multiprocessing.Pool``. The outer loop repeats because workers may
    re-queue pages that failed to download.

    Errors are printed and otherwise ignored.
    """
    try:
        # A single client for all queue operations in this process.
        detail_queue = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1))
        while detail_queue.llen(redis_key_detail) > 0:
            pool = multiprocessing.Pool()
            try:
                while True:
                    raw = detail_queue.lpop(redis_key_detail)
                    if raw is None:
                        break
                    redis_value = raw.decode(encoding='utf8', errors='ignore')
                    pool.apply_async(get_detail_info_, (redis_value,))
            finally:
                # Always wait for the workers, even if draining the queue failed.
                pool.close()
                pool.join()
    except Exception as e:
        print(e)
if __name__ == '__main__':
    # Ad-hoc checks of the individual steps (uncomment as needed):
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    # print(get_url_txt('https://jobs.51job.com/guangzhou-thq/114903412.html?s=01&t=0', headers, 'gbk'))
    # print(encode_url('逆向'))
    # print(get_page_count('逆向'))
    # get_page_href('逆向')
    # get_detail_href()
    # print(get_detail_info_('0|https://jobs.51job.com/guangzhou-thq/119363494.html?s=01&t=0'))
    # get_detail_info()

    # Full crawl with timing (uncomment to refresh the MongoDB collection):
    # start = datetime.datetime.now()
    # print(start.strftime('%Y-%m-%d %H:%M:%S'))
    # pymongo.MongoClient('localhost:27017')['db']['table'].drop()
    # get_page_href('逆向')
    # get_detail_href()
    # get_detail_info()
    # end = datetime.datetime.now()
    # print(end.strftime('%Y-%m-%d %H:%M:%S'))
    # print('cost seconds : %d' % (end - start).seconds)

    # Export every stored document to query.xlsx, one row per document and
    # one column per field, in the order listed here.
    columns = ('company', 'position', 'salary', 'require', 'contack', 'introduce')
    client = pymongo.MongoClient('localhost:27017')
    book = xlsxwriter.Workbook('query.xlsx')
    try:
        sheet = book.add_worksheet('sheet1')
        for row, document in enumerate(client['db']['table'].find({})):
            for col, field in enumerate(columns):
                # Documents lacking a field get an empty cell instead of
                # aborting the export.
                sheet.write(row, col, document.get(field, ''))
    finally:
        # Close both resources even when the export fails part-way.
        book.close()
        client.close()