# Scrape data with selenium + lxml

from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait # 显示等待
from selenium.webdriver.common.by import By # 提取页面内容
from selenium.webdriver.support import expected_conditions as EC # as EC 重新起个名叫EC 唤醒的条件
from lxml import etree
# 用selenium+lxml完成数据抓取

def get_page_soure(url):
    """Load *url* in the global Chrome driver, wait for the job list to
    render, then return the page HTML and quit the browser.

    NOTE: the (misspelled) name is kept unchanged so existing callers
    continue to work. Relies on the module-level ``web`` driver created
    in the ``__main__`` block.

    :param url: page URL to open.
    :return: full rendered page source (str).
    """
    web.get(url)
    # Explicit wait: poll every 0.5 s, up to 10 s, until the first job
    # title span is present in the DOM (i.e. the JS-rendered list loaded).
    WebDriverWait(web, 10, 0.5).until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@id="wrap"]/div[2]/div[2]/div/div[1]/div[2]/ul/li[1]/div[1]/a/div[1]/span[1]')
        )
    )
    page_source = web.page_source
    web.quit()  # close the browser; the driver cannot be reused afterwards
    return page_source


def get_job_name(page_source):
    """Extract and print all job titles from the rendered page HTML.

    :param page_source: HTML string as returned by ``get_page_soure``.
    :return: list of job-title strings (returned as a backward-compatible
             addition so callers can also consume the data; the original
             behavior of printing the list is preserved).
    """
    tree = etree.HTML(page_source)
    # Same XPath as the wait condition, with li[1] generalized to li so
    # every job card matches, plus /text() to pull out the title text.
    job_names = tree.xpath('//*[@id="wrap"]/div[2]/div[2]/div/div[1]/div[2]/ul/li/div[1]/a/div[1]/span[1]/text()')
    print(job_names)
    return job_names


if __name__ == '__main__':
    # `web` is intentionally module-level: get_page_soure reads it as a global.
    web = Chrome()
    url = 'https://www.zhipin.com/web/geek/job?query=python&city=101270400'
    source = get_page_soure(url)
    get_job_name(source)



# posted @ 2023-08-11 22:55 by 严永富 (blog-platform footer, kept as a comment)