selenium - 焚音留香

公告

selenium

一、作用：解决了requests模块无法直接执行js代码的问题

二、安装：pip install selenium

三、浏览器驱动

　　1、使用selenium可以得到一个浏览器对象来操作浏览器

　　2、实例浏览器对象的时候需要传入合适的浏览器驱动

　　3、谷歌浏览器驱动大全：http://npm.taobao.org/mirrors/chromedriver/

　　4、下载完解压出exe文件

四、基本使用

from selenium import webdriver
import time

# Create the browser object by passing the path of the browser driver.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    time.sleep(2)
    # Equivalent to typing this URL into the address bar.
    bro.get('https://www.baidu.com/')
    time.sleep(2)
    # Print the source of the current page.
    print(bro.page_source)
    time.sleep(2)
finally:
    # Always release the browser when done, even if a step above failed.
    # quit() closes every window and ends the driver session, whereas
    # close() only closes the current window.
    bro.quit()

五、标签选择器

　　1、bro.find_element_by_id：通过id查找

　　2、bro.find_element_by_link_text：通过a标签的内容查找

　　3、bro.find_element_by_partial_link_text：通过a标签的内容模糊匹配

　　4、bro.find_element_by_tag_name：通过标签名查找

　　5、bro.find_element_by_class_name：通过类名查找

　　6、bro.find_element_by_name ：通过标签的name属性查找

　　7、bro.find_element_by_css_selector：通过css选择器表达式查找

　　8、bro.find_element_by_xpath：通过xpath选择器表达式查找

　　9、补充：< element >为查找单个，返回第一个匹配到的标签，若改为< elements >查找的结果是列表。

六、标签的操作

　　1、tag.get_attribute(属性名)：获取指定属性

　　2、tag.text：获取文本

　　3、tag.id：获取selenium内部使用的元素标识(并非HTML的id属性，HTML的id属性请用tag.get_attribute('id')获取)

　　4、tag.tag_name：获取标签名

　　5、tag.location：获取标签在当前页面的坐标

　　6、tag.size：获取标签的尺寸

七、标签交互

　　1、tag.send_keys(内容)：传入标签内容

　　　　①tag.send_keys(Keys.ENTER)：回车操作(需先导入：from selenium.webdriver.common.keys import Keys)

　　2、tag.clear()：清空标签内容

　　3、tag.click()：点击标签

八、其他方法

　　1、bro.execute_script('window.scrollTo(0,document.body.offsetHeight)')：执行js(将页面滚动到底部)

　　2、bro.implicitly_wait(秒数)：隐式等待，相当于全局的等待设置，当查找某个标签而该标签还未加载出来的时候，最多等待指定的秒数，标签一旦出现就立即继续，超时仍未找到才抛出异常

　　3、bro.back()：返回上一页

　　4、bro.forward()：前进下一页

　　5、bro.get_cookies()：获取所有cookie

　　6、bro.maximize_window()：窗口最大化(并非真正的全屏，全屏请使用bro.fullscreen_window())

　　7、bro.save_screenshot('./main.png')：对当前浏览器窗口中的页面截图并保存为png文件(不是截取整个电脑屏幕)

九、选项卡管理

from selenium import webdriver

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    bro.get('https://www.baidu.com')
    # Open two blank tabs with JavaScript.
    bro.execute_script('window.open()')
    bro.execute_script('window.open()')
    # Switch to each new tab and load a page in it.
    # switch_to.window() replaces the deprecated switch_to_window().
    bro.switch_to.window(bro.window_handles[1])
    bro.get('https://www.taobao.com')
    bro.switch_to.window(bro.window_handles[2])
    bro.get('https://www.sina.com.cn')
    # Show the handles of all open tabs.
    print(bro.window_handles)
finally:
    # Close every tab and end the driver session so nothing is left running.
    bro.quit()

十、异常处理

from selenium import webdriver

browser = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    browser.get('https://www.baidu.com')
except Exception as e:
    # Demo-level handling: report the failure instead of crashing.
    print(e)
finally:
    # Whether or not an exception occurred, release the browser at the end.
    # quit() closes all windows and ends the driver session; close() would
    # only close the current window.
    browser.quit()

十一、无界面浏览器：谷歌浏览器支持无界面(headless)模式，即不打开可视化窗口也能运行

from selenium.webdriver.chrome.options import Options
from selenium import webdriver

chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')  # set the browser resolution
chrome_options.add_argument('--disable-gpu')  # per the original note: Google's docs suggest this flag to avoid a bug
chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # do not load images, for speed
chrome_options.add_argument('--headless')  # no visible UI; per the original note, on Linux without a display startup fails without this
# `options=` is the current keyword; `chrome_options=` is deprecated.
bro = webdriver.Chrome(options=chrome_options, executable_path='./chromedriver.exe')
try:
    bro.get('https://www.baidu.com/')
    print(bro.page_source)
finally:
    # End the driver session even if loading the page failed.
    bro.quit()

十二、自动登录12306(演示动作链的使用)

import time
from PIL import Image
from chaojiying import Chaojiying_Client
from selenium import webdriver
# 导入动作链相关模块
from selenium.webdriver import ActionChains

def _parse_click_points(pic_str):
    """Turn a coordinate string into a list of [x, y] integer pairs.

    Points are separated by '|' and each point is written as 'x,y', e.g.
    '111,222|111,333|222,333' -> [[111, 222], [111, 333], [222, 333]].
    A string holding a single point (no '|') yields a one-element list.
    """
    points = []
    for pair in pic_str.split('|'):
        fields = pair.split(',')
        points.append([int(fields[0]), int(fields[1])])
    return points


bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.implicitly_wait(10)
try:
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    # Maximize the browser window.
    bro.maximize_window()
    # Find the login entry ('.login-hd-account a') and click it.
    button_z = bro.find_element_by_css_selector('.login-hd-account a')
    button_z.click()
    time.sleep(2)
    # Save a screenshot of the page.
    bro.save_screenshot('./main.png')
    # Locate the captcha image element.
    img_t = bro.find_element_by_id('J-loginImg')
    # Read the captcha element's position and size.
    location = img_t.location
    size = img_t.size
    # Crop box: top-left corner and the diagonally opposite corner.
    # NOTE(review): assumes screenshot pixels map 1:1 to page coordinates
    # (no display scaling) - confirm on the target machine.
    img_tu = (
        int(location['x']), int(location['y']),
        int(location['x'] + size['width']), int(location['y'] + size['height'])
    )
    # Cut the captcha out of the screenshot and save it; the context
    # manager closes the screenshot file afterwards.
    with Image.open('./main.png') as img:
        fram = img.crop(img_tu)
        fram.save('code.png')
    # Send the captcha to the Chaojiying recognition service.
    chaojiying = Chaojiying_Client('username', 'password', 'id')
    # Read the captcha bytes (closing the file) and submit them together
    # with the captcha type code.
    with open(r'code.png', 'rb') as captcha_file:
        im = captcha_file.read()
    res = chaojiying.PostPic(im, 9004)
    result = res['pic_str']
    # The result may hold several points, e.g. 111,222|111,333|222,333,
    # which become [[111,222],[111,333],[222,333]].
    all_list = _parse_click_points(result)
    # Use an action chain to click each point on the captcha image.
    for x, y in all_list:
        ActionChains(bro).move_to_element_with_offset(img_t, x, y).click().perform()
        time.sleep(1)
    # Enter the user name and password.
    username = bro.find_element_by_id('J-userName')
    username.send_keys('login_username')
    password = bro.find_element_by_id('J-password')
    password.send_keys('login_password')
    time.sleep(3)
    submit_login = bro.find_element_by_id('J-login')
    submit_login.click()
    time.sleep(3)
    # If the login succeeded, the returned cookies can be read from the browser.
    print(bro.get_cookies())
    time.sleep(10)
    # After logging in, further pages can be visited.
    bro.get('https://www.12306.cn/index/')
    time.sleep(5)
except Exception as e:
    # Demo-level handling: report the failure instead of crashing.
    print(e)
finally:
    # quit() closes all windows and ends the driver session; close() would
    # only close the current window.
    bro.quit()

十三、用selenium生成cookie池

　　1、用selenium写一套登录脚本

　　2、注册一些小号

　　3、使用脚本遍历小号登录获取cookie放入redis中

　　4、搭建服务随机从redis中返回一个cookie

　　5、用requests爬取数据时即可携带该cookie

　　6、估算cookie的过期时间，可以用celery执行定时任务，定期清除过期的cookies再重新获取存入　　

posted on 2020-07-01 05:01 焚音留香阅读(236) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部