selenium

selenium:

  -- 概念:一个基于浏览器自动化的模块

  -- 基本使用流程:

    -- pip install selenium

    -- 下载对应驱动程序:http://chromedriver.storage.googleapis.com/index.html

    -- 实例化一个浏览器对象,将浏览器的驱动程序加载到该对象中

  

1.简单示例

from selenium import webdriver
from lxml import etree
import time

# Instantiate a browser object; executable_path is the path to chromedriver.exe
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Have the browser request the target URL
    bro.get('http://125.35.6.84:81/xk/')
    # page_source is the page as the browser currently shows it
    # ("what you see is what you get")
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Dynamically loaded data can be read from the rendered source;
    # [0] raises IndexError when the XPath matches nothing
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser and driver, even if a step above raised,
    # so no orphaned chromedriver/Chrome process is left behind
    bro.quit()

 

2.相关行为定制

打开淘宝并搜索相关内容:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.taobao.com')
    # Element lookup: find_element(By.<strategy>, value) replaces the
    # deprecated find_element_by_* helpers and works on Selenium 3 and 4
    input_text = bro.find_element(By.ID, 'q')
    input_text.send_keys('mac')
    time.sleep(2)
    # Run JavaScript in the page: scroll to the bottom of the document
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    btn = bro.find_element(By.CSS_SELECTOR, '.btn-search')
    btn.click()
    time.sleep(3)
finally:
    # Always close the browser and driver, even if an element lookup fails
    bro.quit()

 常用方法:

    get(url)

    find系列函数进行标签定位

    send_keys('key')

    click()

    execute_script('js_code')

    page_source

    switch_to.frame('iframe_ID')

    quit()

    save_screenshot()

    a = ActionChains(bro)   a.click_and_hold(tag)

    a.move_by_offset(x,y).perform()

 

3.规避检测

from selenium import webdriver
from lxml import etree
from selenium.webdriver import ChromeOptions
import time

# Options used to evade automation detection: drop the
# 'enable-automation' switch that Chrome is otherwise started with
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
try:
    # Have the browser request the target URL
    bro.get('http://125.35.6.84:81/xk/')
    # page_source is the page as the browser currently shows it
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Dynamically loaded data can be read from the rendered source;
    # [0] raises IndexError when the XPath matches nothing
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the browser and driver, even if a step above raised
    bro.quit()

 

4.无头浏览器

设置为在浏览器不可见下进行爬取:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
import time

chrome_options = Options()
# Run Chrome without a visible window
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
try:
    # Have the browser request the target URL
    bro.get('http://125.35.6.84:81/xk/')
    # Wait BEFORE taking the snapshot: page_source reflects the page at the
    # moment it is read, so sleeping afterwards would not affect page_text
    time.sleep(2)
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Dynamically loaded data can be read from the rendered source;
    # [0] raises IndexError when the XPath matches nothing
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always close the headless browser and driver; with no window to
    # notice, a leaked process would otherwise linger silently
    bro.quit()

 

 

 

  

posted @ 2019-10-02 19:07  tianqibucuo  阅读(170)  评论(0)    收藏  举报