# 有问题
from selenium import webdriver
import time
from lxml import etree
class LagouSpider(object):
driver_path = r"G:\Crawler and Data\chromedriver.exe"
def __init__(self):
self.driver = webdriver.Chrome(executable_path=self.driver_path)
self.url = "https://www.zhipin.com/job_detail/?query=python&city=101010100&industry=&position="
self.positions = []
self.position_dict = {}
self.detail_url_list = []
def run(self):
# 访问首页
self.driver.get(self.url)
# 获取页面信息
# page_source可以获取页面的所有数据,包括每个职位的链接
source= self.driver.page_source
self.parse_list_page(source)
def parse_list_page(self,source):
# 每个职位的链接
tree = etree.HTML(source)
# 获取职位的链接 ******
li_list = tree.xpath("//div[@class='job-box']/div[@class='job-list']/ul/li")
for li in li_list:
detail_url = li.xpath('.//div[@class="info-primary"]/h3/a/@href')[0]
detail_url = "https://www.zhipin.com"+detail_url
print(detail_url)
self.detail_url_list.append(detail_url)
title = li.xpath('.//div[@class="info-primary"]/h3/a/div[@class="job-title"]/text()')[0]
salary = li.xpath('.//div[@class="info-primary"]/h3/a/span[@class="red"]/text()')[0]
company = li.xpath('.//div[@class="info-company"]//h3/a/text()')[0]
self.position_dict["title"]=title
self.position_dict["salary"]=salary
self.position_dict["company"]=company
self.detail_page(detail_url)
# break
def detail_page(self,url):
for url in self.detail_url_list:
# self.driver.get(url) # 直接访问这个url
self.driver.execute_script('window.open("%s")'%url) # 新打开一个窗口
self.driver.switch_to.window(self.driver.window_handles[1]) # 切换到新窗口
source = self.driver.page_source
tree = etree.HTML(source)
desc = tree.xpath("//div[@id='main']/div[3]/div/div[2]/div[2]/div[1]/div")
# 获取一个标签(含有其他标签)下所有的文本
desc_text = desc[0].xpath('string()').strip()
self.position_dict['desc_text'] = desc_text
print(self.position_dict)
time.sleep(2)
self.driver.close() # 关闭页面
self.driver.switch_to.window(self.driver.window_handles[0]) # 切换到新窗口
if __name__ == '__main__':
spider = LagouSpider()
spider.run()