Crawler 22 - Scraping Information with Selenium
1. Scraping Lagou's AJAX data the normal way, with requests and cookies
import requests
from lxml import etree
import time
import re
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
    "Cookie": "user_trace_token=20200226133453-084540c1-9531-4fa8-873f-0dda32aa3ca4; _ga=GA1.2.836052667.1582695295; LGUID=20200226133454-167deda5-1930-4e79-8834-719427ac01be; index_location_city=%E5%85%A8%E5%9B%BD; lagou_utm_source=A; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221707ffdf39c2c3-0001957fd8ade1-3a614f0b-2073600-1707ffdf39de5f%22%2C%22%24device_id%22%3A%221707ffdf39c2c3-0001957fd8ade1-3a614f0b-2073600-1707ffdf39de5f%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; gate_login_token=5976db005818f45ed7756b1348563965e46f1400511d886af3d4d57dd9d9166a; LG_LOGIN_USER_ID=5b895ff2a4e23c48dc4c9110a6a1361bbf709630b5b17ac6756340fef1babfbf; LG_HAS_LOGIN=1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; privacyPolicyPopup=false; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1583857959,1583912708,1583912713; JSESSIONID=ABAAAECABGFABFF1412C84500FD39A23D7C1D5172179D66; WEBTJ-ID=20200315123348-170dc782d0e4cf-05e9fb23740e5e-3a614f0b-2073600-170dc782d0f63d; _gid=GA1.2.1720707822.1584246829; _putrc=387928C58CE0A7D1123F89F2B170EADC; login=true; unick=%E7%90%B3%E7%90%B3; TG-TRACK-CODE=index_search; X_MIDDLE_TOKEN=0a8830791829a77f99654a1bb3d568ae; LGSID=20200315140707-568ce08c-c655-44b2-9cd4-66632e1bb6f4; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist%5Fpython%2Fp-city%5F0%3F%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; _gat=1; SEARCH_ID=79abbbd66c2b4a59b7ca19ee8fb77e01; X_HTTP_TOKEN=9944cc335d13b0d30552524851b568c7665cd1a0ff; LGRID=20200315140911-acf5dfc4-1c8f-4943-a93f-983d364a96db",
    "Origin": "https://www.lagou.com",
    "X-Anit-Forge-Code": "0",
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest"
}
positions = []
def request_list_page():
    url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
    data = {
        "first": "false",
        "pn": "1",
        "kd": "python"
    }
    for x in range(1, 10):
        data['pn'] = x
        response = requests.post(url, data=data, headers=headers)
        result = response.json()  # a JSON response body is parsed straight into a dict
        positions = result['content']['positionResult']['result']
        for position in positions:
            positionId = position['positionId']  # this id identifies the detail page
            position_url = 'https://www.lagou.com/jobs/%s.html' % positionId
            parse_position_detail(position_url)
            break  # stop after the first position on the page (demo run)
        time.sleep(2)
        break  # stop after the first page (demo run)
def parse_position_detail(url):
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)
    name = html.xpath("//div[@class='job-name']/@title")[0]
    job_span = html.xpath("//dd[@class='job_request']//span")
    salary = job_span[0].xpath('.//text()')[0].strip()
    city = job_span[1].xpath(".//text()")[0].strip()
    city = re.sub(r"[\s/]", "", city)
    position = {
        'name': name,
        'salary': salary,
        'city': city
    }
    positions.append(position)
def main():
    request_list_page()
    print(positions)

if __name__ == '__main__':
    main()
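
The hard-coded Cookie header above was copied from a logged-in browser and goes stale quickly. A minimal sketch of an alternative, assuming Lagou still issues its anti-crawler cookies when the list page is visited (this is not part of the original code): let a requests.Session collect fresh cookies first, then POST to the AJAX endpoint with the same session.

import requests

list_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
ajax_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Referer": list_url,
})
# visiting the list page first lets the session pick up the cookies
# the AJAX endpoint checks, so nothing has to be pasted in by hand
session.get(list_url)
data = {"first": "false", "pn": "1", "kd": "python"}
response = session.post(ajax_url, data=data)
print(response.json())

If the endpoint still answers with an "operation too frequent" style message, session cookies alone are no longer enough and the Selenium approach in section 2 is the fallback.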
2. Scraping Lagou's AJAX data with Selenium
#encoding: utf-8
from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class LagouSpider(object):
    def __init__(self):
        self.driver = webdriver.Firefox()
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.positions = []

    def run(self):
        self.driver.get(self.url)
        while True:
            # wait until the pager has rendered before reading the page source
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break  # the "next page" button is disabled on the last page
                else:
                    next_btn.click()
            except:
                print(source)  # dump the page if the pager could not be found
            time.sleep(1)
    def parse_list_page(self, source):
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)
    def request_detail_page(self, url):
        # self.driver.get(url)
        self.driver.execute_script("window.open('%s')" % url)  # open the detail page in a new tab
        self.driver.switch_to.window(self.driver.window_handles[1])  # move the driver to the new tab
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        self.driver.close()  # close the detail tab
        self.driver.switch_to.window(self.driver.window_handles[0])  # switch back to the job list tab
    def parse_detail_page(self, source):
        html = etree.HTML(source)
        name = html.xpath("//div[@class='job-name']/@title")[0]
        job_span = html.xpath("//dd[@class='job_request']//span")
        salary = job_span[0].xpath('.//text()')[0].strip()
        city = job_span[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]", "", city)
        position = {
            'name': name,
            'salary': salary,
            'city': city
        }
        self.positions.append(position)
        print(position)
        print('=' * 40)
if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
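
As written, the scraped positions live only in memory and a visible Firefox window opens on every run. A short sketch of two optional tweaks (neither is in the original post; save_positions and positions.json are hypothetical names): dumping the results to JSON, and starting Firefox headless.

import json
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

def save_positions(positions, path='positions.json'):
    # write the scraped list of dicts to disk;
    # ensure_ascii=False keeps the Chinese fields human-readable
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(positions, f, ensure_ascii=False, indent=2)

# headless mode: the spider runs without opening a browser window;
# pass these options in __init__ via webdriver.Firefox(options=options)
options = Options()
options.add_argument('-headless')

After spider.run() returns (it breaks out of the loop on the last page), save_positions(spider.positions) would persist everything that was collected.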
