import time

import pymysql
import requests
from lxml import etree
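

# Spider for 51job (前程无忧): crawls the paginated "数据分析师" (data analyst)
# search results, collects each posting's detail-page link, parses every
# detail page and saves the extracted fields into MySQL.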
class my_spider:
    # Initialisation (step 1): URL template, request headers, page range and proxy.
    def __init__(self, num1, num2):
        # {} in the template is the list-page number; the URL-encoded segment
        # is the search keyword 数据分析师.
        self.base_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,{}.html"
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip,deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "search.51job.com",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }
        # First and last (exclusive) list-page numbers to crawl.
        self.page_num1 = num1
        self.page_num2 = num2
        # Abuyun dynamic proxy, shared by all requests.
        self.proxies = {
            "http": "http://H53AXE0994W90HAD:720EEA0408F81FA2@http-dyn.abuyun.com:9020",
            "https": "http://H53AXE0994W90HAD:720EEA0408F81FA2@http-dyn.abuyun.com:9020"
        }
        # Detail-page links, collected by run() and consumed by main().
        self.det_link = []

    # Build the list of search-result page URLs (step 1).
    def get_url(self):
        url_list = []
        for i in range(self.page_num1, self.page_num2):
            url_list.append(self.base_url.format(i))
        return url_list

    # Fetch one main list page through the proxy (step 1);
    # 51job serves GBK-encoded HTML.
    def get_pages(self, url):
        response = requests.get(url=url, headers=self.headers, proxies=self.proxies)
        return self.parse_pages(response.content.decode('gbk'))

    # Parse a list page and extract every posting's detail-page link (step 1).
    def parse_pages(self, text):
        html_51job = etree.HTML(text)
        # Each posting is a div.el row inside div#resultList.
        all_div = html_51job.xpath("//div[@id='resultList']//div[@class='el']")
        info_list = []
        for item in all_div:
            info = {}
            info['job_info_link'] = item.xpath("./p/span/a/@href")[0]
            info_list.append(info)
        return info_list
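
    # Note: the XPaths above target the server-rendered list markup
    # (div#resultList). Newer versions of the 51job search page embed the
    # results as JSON in a <script> tag instead, in which case this parser
    # would come back empty.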

    # Loop over every list page and accumulate the detail-page links.
    def run(self):
        index_url_list = self.get_url()
        for url in index_url_list:
            time.sleep(1)  # be polite between list-page requests
            page_info = self.get_pages(url)
            for job_info_link in page_info:
                self.det_link.append(job_info_link['job_info_link'])

    # Fetch a single detail page through the proxy.
    def get_page_info(self, url):
        print(url)
        response = requests.get(url=url, headers=self.headers, proxies=self.proxies)
        return self.parse_det_info(response.content.decode('gbk'))

    # Parse the detail page: job title, salary, company name and requirements.
    def parse_det_info(self, pages):
        item = etree.HTML(pages)

        def first_or_nan(xpath):
            # Postings sometimes omit a field (the salary <strong> in
            # particular), so fall back to 'NaN' instead of raising IndexError.
            result = item.xpath(xpath)
            return result[0] if result else 'NaN'

        info = {}
        info['job_name'] = first_or_nan("//div[@class='cn']/h1/@title")
        info['job_money'] = first_or_nan("//div[@class='cn']/strong/text()")
        info['company_name'] = first_or_nan("//div[@class='cn']/p[@class='cname']/a/@title")
        info['job_request'] = first_or_nan("//div[@class='cn']/p[@class='msg ltype']/@title")
        return info

    # Entry point: collect the links, then fetch, parse and store each posting.
    def main(self):
        self.run()
        print(self.det_link)
        for url in self.det_link:
            time.sleep(1)  # be polite between detail-page requests
            det_pageinfo = self.get_page_info(url)
            print(det_pageinfo)
            self.save_to_mysql(det_pageinfo)

    # Save one parsed posting into MySQL.
    def save_to_mysql(self, page_info):
        # Connect to the database.
        conn = pymysql.connect(host='localhost', user='root', passwd='root123', db='baidu', port=3306)
        cursor = conn.cursor()
        # Insert the row with a parameterised query so the driver escapes
        # quotes and other special characters in the job text.
        cursor.execute(
            "INSERT INTO det_job_info (job_name, company_name, job_money, job_request) "
            "VALUES (%s, %s, %s, %s)",
            (page_info['job_name'], page_info['company_name'],
             page_info['job_money'], page_info['job_request'])
        )
        conn.commit()
        # Close the cursor and the connection.
        cursor.close()
        conn.close()
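

# save_to_mysql() assumes a table roughly like the following already exists
# in the `baidu` database (the real DDL is not part of this script; column
# types and sizes here are guesses):
#
#   CREATE TABLE det_job_info (
#       id           INT AUTO_INCREMENT PRIMARY KEY,
#       job_name     VARCHAR(255),
#       company_name VARCHAR(255),
#       job_money    VARCHAR(64),
#       job_request  TEXT
#   );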

if __name__ == "__main__":
    # Crawl two list pages per spider instance; widen the range to cover more pages.
    for i in range(159, 161, 2):
        time.sleep(1)
        spider = my_spider(i, i + 2)
        print('Fetching data for pages {}-{}'.format(i, i + 2))
        spider.main()