selenium
selenium:
-- 概念:一个基于浏览器自动化的模块
-- 基本使用流程:
-- pip install selenium
-- 下载对应驱动程序:http://chromedriver.storage.googleapis.com/index.html
-- 实例化一个浏览器对象,将浏览器的驱动程序加载到该对象中
1.简单示例
import time

from lxml import etree
from selenium import webdriver

# Minimal example: open a page in Chrome, read the rendered HTML and pull one
# value out of it with XPath.
# NOTE(review): `executable_path` is the Selenium 3 way of passing the driver
# location -- confirm against the installed Selenium version.

# Instantiate a browser object; executable_path is the path to chromedriver.exe.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Have the browser issue a request to the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # Grab the page source as currently rendered ("what you see is what you get").
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Because the source comes from the live browser, dynamically loaded data
    # can be extracted as well.
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser, even if the XPath lookup above finds nothing
    # and the [0] index raises IndexError.
    bro.quit()
2.相关行为定制
打开淘宝并搜索相关内容:
import time

from selenium import webdriver

# Open Taobao, type a query into the search box, scroll to the bottom of the
# page and click the search button.
# NOTE(review): `executable_path` and the `find_element_by_*` helpers are
# Selenium 3 APIs -- confirm against the installed Selenium version.

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.taobao.com')
    # Element location: the find_* family of methods.
    input_text = bro.find_element_by_id('q')
    input_text.send_keys('mac')
    time.sleep(2)
    # Execute JavaScript in the page: scroll to the bottom.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    btn = bro.find_element_by_css_selector('.btn-search')
    btn.click()
    time.sleep(3)
finally:
    # Always close the browser, even if an element lookup above fails.
    bro.quit()
常用方法:
get(url)
find系列函数进行标签定位
send_keys('key')
click()
execute_script('js_code')
page_source
switch_to.frame('iframe_ID')
quit()
save_screenshot()
a = ActionChains(bro)
a.click_and_hold(tag)
a.move_by_offset(x,y).perform()
3.规避检测
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver import ChromeOptions

# Same scrape as the simple example, but Chrome is started with the
# 'enable-automation' switch excluded so the site is less likely to detect
# that the browser is automated.
# NOTE(review): `executable_path` is the Selenium 3 way of passing the driver
# location -- confirm against the installed Selenium version.

# Options used to evade automation detection.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
try:
    # Have the browser issue a request to the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # Grab the page source as currently rendered ("what you see is what you get").
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Because the source comes from the live browser, dynamically loaded data
    # can be extracted as well.
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser, even if the XPath lookup above finds nothing
    # and the [0] index raises IndexError.
    bro.quit()
4.无头浏览器
设置为在浏览器不可见下进行爬取:
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Same scrape as the simple example, but Chrome runs headless (no visible
# browser window).
# NOTE(review): `executable_path` is the Selenium 3 way of passing the driver
# location -- confirm against the installed Selenium version.

chrome_options = Options()
# Run without a visible window.
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
try:
    # Have the browser issue a request to the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # Pause BEFORE reading the source so dynamically loaded content has time
    # to arrive; the original slept only after page_source was already read,
    # where the wait could not affect the captured HTML.
    time.sleep(2)
    # Grab the page source as currently rendered ("what you see is what you get").
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Because the source comes from the live browser, dynamically loaded data
    # can be extracted as well.
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser, even if the XPath lookup above finds nothing
    # and the [0] index raises IndexError.
    bro.quit()

浙公网安备 33010602011771号