selenium的使用
1 selenium的使用
1.0 基本使用
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Browser object driven through the local chromedriver binary.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Implicit wait: every element lookup keeps retrying for up to 10 s.
    bro.implicitly_wait(10)
    bro.get('https://www.baidu.com/')

    # Prefer locating by id whenever the element has one.
    # CSS alternative: bro.find_element(By.CSS_SELECTOR, '#s-top-loginbtn')
    sub_button = bro.find_element(By.ID, 's-top-loginbtn')
    sub_button.click()

    # Switch the login dialog to username/password mode.
    # Id alternative: bro.find_element(By.ID, 'TANGRAM__PSP_11__footerULoginBtn')
    user_btn = bro.find_element(By.XPATH, '//*[@id="TANGRAM__PSP_11__footerULoginBtn"]')
    user_btn.click()

    username = bro.find_element(By.ID, 'TANGRAM__PSP_11__userName')
    password = bro.find_element(By.ID, 'TANGRAM__PSP_11__password')
    # Type into the input boxes.
    username.send_keys('6666666@qq.com')
    password.send_keys('lqz12345')

    submit_btn = bro.find_element(By.ID, 'TANGRAM__PSP_11__submit')
    time.sleep(3)
    submit_btn.click()
    time.sleep(3)
finally:
    # quit() ends the whole session (all windows plus the driver process),
    # and the finally clause guarantees it runs even when a step above fails.
    bro.quit()
1.1 无头浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
# Browser resolution; "width,height" is the documented form of this switch.
chrome_options.add_argument('--window-size=1920,3000')
# Recommended by the Chrome documentation to work around a bug.
chrome_options.add_argument('--disable-gpu')
# Hide scrollbars, which helps on some unusual pages.
chrome_options.add_argument('--hide-scrollbars')
# Do not load images, to speed things up.
chrome_options.add_argument('--blink-settings=imagesEnabled=false')
# Run without a visible window. On Linux systems without a display the
# browser fails to start unless this flag is present.
chrome_options.add_argument('--headless')

driver = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=chrome_options)
try:
    driver.get('https://www.baidu.com')
    print(driver.page_source)
finally:
    # Always end the session, even if loading the page failed.
    driver.quit()
1.2 获取元素位置,属性,大小
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    driver.get('https://kyfw.12306.cn/otn/resources/login.html')
    driver.implicitly_wait(10)

    user_login = driver.find_element(By.CSS_SELECTOR, '.login-hd-account>a')
    user_login.click()
    time.sleep(2)

    img = driver.find_element(By.ID, 'J-loginImg')
    print(img)
    print(img.id)        # internal id assigned by selenium; can be ignored
    print(img.tag_name)  # tag name
    print('-----')
    print(img.location)  # position of the img element
    print(img.size)      # size of the img element
    # Reading attributes
    # print(img.get_attribute('src'))
    print(img.get_attribute('class'))
finally:
    # Always end the session, even if a lookup above failed.
    driver.quit()
1.3 等待元素被加载
from selenium import webdriver

driver = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    driver.get('https://www.baidu.com')

    # Two ways to wait for elements:
    #
    # 1. Explicit wait (not used here): wait for one specific condition.
    #        wait = WebDriverWait(driver, 10)
    #        wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    #        contents = driver.find_element(By.CSS_SELECTOR, '#content_left')
    #
    # 2. Implicit wait: a single call that applies to every element lookup
    #    made afterwards.
    #        driver.implicitly_wait(10)
    #        driver.find_element(By.CSS_SELECTOR, ...)
    driver.implicitly_wait(10)
    print(driver.page_source)
    # Any element lookup from here on waits until the element is loaded,
    # for at most 10 s.
finally:
    # Always end the session, even if loading the page failed.
    driver.quit()
1.4 元素操作
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    driver.get('https://www.baidu.com')
    driver.implicitly_wait(10)

    # Type, clear and click operations
    input_search = driver.find_element(By.ID, 'kw')
    input_search.send_keys('美女')  # type text
    time.sleep(3)
    input_search.clear()  # clear the box
    time.sleep(2)
    input_search.send_keys('性感美女')
    time.sleep(2)

    btn = driver.find_element(By.ID, 'su')
    btn.click()  # click
    time.sleep(10)
finally:
    # Always end the session, even if a step above failed.
    driver.quit()
1.5 执行js
import time

from selenium import webdriver

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('http://127.0.0.1:8000/')
driver.implicitly_wait(10)

# Each string is JavaScript source run inside the current page, in order.
for js_snippet in ("name='egon';", "alert(name)"):
    driver.execute_script(js_snippet)

time.sleep(5)
# driver.close()
1.6 切换选项卡
import time

from selenium import webdriver

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()')
    print(browser.window_handles)  # handles of all open tabs

    # switch_to.window(...) replaces the old browser.switch_to_window(...) call.
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(5)

    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')
finally:
    # close() would only close the current tab and leave the second tab and
    # the driver session alive; quit() closes every tab and ends the session.
    browser.quit()
1.7 模拟前进后退
import time

from selenium import webdriver

browser = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Visit three pages so the history has somewhere to go back to.
    browser.get('https://www.baidu.com')
    browser.get('https://www.taobao.com')
    browser.get('http://www.sina.com.cn/')

    browser.back()
    time.sleep(3)
    browser.forward()
finally:
    # Always end the session, even if navigation failed.
    browser.quit()
1.8 异常处理
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')
except Exception as e:
    # Top-level boundary of the script: report the error instead of crashing.
    print(e)
finally:
    # Runs on success and on failure, so the browser is always shut down.
    browser.quit()
1.9 selenium登录cnblogs获取cookie
#selenium登录cnblogs获取cookie
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
import time
import json
# Open Chrome through the local chromedriver and set a 10 s implicit wait.
# NOTE(review): both sections below are commented out, so as written the
# browser opened here is never closed.
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.implicitly_wait(10)
#### Section 1 (disabled): log in through the browser and save the cookies
# try:
#     browser.get('http://www.cnblogs.com')
#     submit_btn=browser.find_element_by_link_text('登录') # locate the <a> tag by its link text
#     submit_btn.click()
#
#     username=browser.find_element_by_id('mat-input-0')
#     password=browser.find_element_by_id('mat-input-1')
#     username.send_keys('616564099@qq.com')
#     password.send_keys('1111')
#     input('等会')  # pause until the operator presses Enter
#     sub_btn=browser.find_element_by_css_selector('body > app-root > mat-sidenav-container > mat-sidenav-content > div > div > app-sign-in > app-content-container > div > div > div > form > div > button > span.mat-button-wrapper')
#     sub_btn.click()
#
#     # Manual step: the operator solves the slider captcha
#     input('等会')
#
#     # Cookies of the logged-in session
#     print(browser.get_cookies())
#
#     with open('cookie.json','w') as f:
#         json.dump(browser.get_cookies(),f)
#
#
# except Exception as e:
#     print(e)
# finally:
#     browser.close()
### Section 2 (disabled): skip the login and load the saved cookies into the browser
# browser.get('http://www.cnblogs.com')
# with open('cookie.json','r') as f:
#     cookie=json.load(f)
# time.sleep(5)
# for item in cookie: # add_cookie takes one dict at a time; the JSON file holds a list, so add them in a loop
#     browser.add_cookie(item)
#
#
#
# browser.refresh() # reload the page
#
# time.sleep(5)
#
# browser.close()
1.10 抽屉半自动点赞
from selenium import webdriver
import json
import time
#### 登录过程
# bro=webdriver.Chrome(executable_path='chromedriver.exe')
# bro.implicitly_wait(10)
# bro.get('https://dig.chouti.com/')
# try:
# sub_btn=bro.find_element_by_id('login_btn')
# print(sub_btn)
#
# # sub_btn.click() # 报错
# bro.execute_script('arguments[0].click();',sub_btn)
#
# # username=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-body > div.form-item.login-item.clearfix.phone-item.mt24 > div.input-item.input-item-short.left.clearfix > input')
# username=bro.find_element_by_css_selector('div.input-item>input.login-phone')
# username.send_keys('18953675221')
# # password=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div.form-item.login-item.clearfix.mt24 > div')
# password = bro.find_element_by_css_selector('div.input-item>input.pwd-password-input')
# password.send_keys('lqz123')
#
# time.sleep(3)
# btn=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
#
# btn.click()
#
# input('等')
#
# with open('chouti.json','w') as f:
# json.dump(bro.get_cookies(),f)
#
#
#
#
# finally:
# bro.close()
import requests
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.implicitly_wait(10)
    bro.get('https://dig.chouti.com/')
    # Scroll to the very bottom of the page.
    bro.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    # bro.find_elements(By.CSS_SELECTOR, '.link-item')

    # Read the cookies saved by the login step and flatten selenium's list of
    # cookie dicts into the {name: value} mapping that requests accepts.
    with open('chouti.json', 'r', encoding='utf-8') as f:
        saved_cookies = json.load(f)
    cookie = {item['name']: item['value'] for item in saved_cookies}
    print(cookie)  # cookies in the form requests can use

    container = bro.find_element(By.CLASS_NAME, 'link-con')
    time.sleep(2)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }

    # Send one vote request per article found on the page.
    for link_item in container.find_elements(By.CLASS_NAME, 'link-item'):
        article_id = link_item.get_attribute('data-id')
        print(article_id)
        # Send the vote with requests, reusing the browser's login cookies.
        # The timeout keeps one stalled request from hanging the script.
        response = requests.post(
            'https://dig.chouti.com/link/vote',
            data={'linkId': article_id},
            cookies=cookie,
            headers=header,
            timeout=10,
        )
        print(response.text)
finally:
    # Always end the session, even if a lookup or request failed.
    bro.quit()
2 打码平台使用
# 人工破解
# 图像识别模块---》数字,字母组合
# 验证码破解平台---》云打码,超级鹰
-给它一张图片---》结果返回 (收费的)
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    Uses the module-level ``requests`` and ``hashlib.md5`` imports.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        """Store the credentials that are sent with every request.

        username: account name
        password: plain-text password; only its MD5 hex digest is kept and sent
        soft_id: software ID generated in the user center
        timeout: seconds to wait for the server on each request; without a
            timeout ``requests`` would wait indefinitely on a stalled connection
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Parameters merged into the form data of every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON response.

        im: image bytes
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha and return the decoded JSON response.

        im_id: image ID of the captcha being reported
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()
if __name__ == '__main__':
    # Account, password and software ID; generate your own software ID under
    # User Center >> Software ID and substitute it here.
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')
    # Path of the local captcha image; replace a.jpg with your own file.
    # The with-block closes the file handle once the bytes have been read.
    with open('a.jpg', 'rb') as image_file:
        im = image_file.read()
    # 1902 is the captcha type code; see the price list on the official site.
    print(chaojiying.PostPic(im, 1902))
3 xpath使用
1 一门在html中查找数据的语言
2 记住的语法:
/ 取当前路径下的xx
// 取所有路径下的xx
. 当前路径
.. 上一层
@ 取属性
4 lxml解析模块提供的xpath
doc='''
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html' name='sss'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
<a href='image2.html' name='lqz'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
<a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
<a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
<a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
</div>
</body>
</html>
'''
from lxml import etree
# 传入要解析的内容
# Parse the HTML text into an element tree.
html = etree.HTML(doc)
# res=html.xpath('//body')
# print(res)
# 1 All nodes
# a=html.xpath('//*')
# 2 A specific node (the result is a list)
# a=html.xpath('//head')
# 3 Children and descendants
# a=html.xpath('//div/a')
# a=html.xpath('//body/a')  # empty: <a> is not a direct child of <body>
# a=html.xpath('//body//a')
# 4 Parent node
# a=html.xpath('//body//a[@href="image1.html"]/..')
# a=html.xpath('//body//a')
# a=html.xpath('//body//a[@href="image1.html"]')
# a=html.xpath('//body//a[1]/..')
# The parent axis works as well
# a=html.xpath('//body//a[1]/parent::*')
# a=html.xpath('//body//a[1]/parent::p')
# 5 Matching on an attribute
# a=html.xpath('//a[@href="image1.html"]')
# a=html.xpath('//a[@name="sss"]')
# 6 Getting text with text()
# a=html.xpath('//a[@href="image1.html"]/text()')
# a=html.xpath('//a/text()')
# 7 Getting attribute values
# a=html.xpath('//a/@href')
# a=html.xpath('//a[1]/@name')
# # Note: positions start at 1, not 0
# a=html.xpath('//body//a[2]/@href')
# 8 Matching an attribute that holds several values
# When an <a> tag has several classes an exact match no longer works; use contains
# a=html.xpath('//a[@class="li"]')
# a=html.xpath('//a[contains(@class,"li")]')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 Matching on several attributes
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 Selecting by position
# a=html.xpath('//a[2]/text()')
# a=html.xpath('//a[2]/@href')
# a=html.xpath('//a[2]/@name')
# The last one
# a=html.xpath('//a[last()]/@href')
# Positions below 3
# a=html.xpath('//a[position()<3]/@href')
# Third from last (use last()-1 for second from last)
# a=html.xpath('//a[last()-2]/@href')
# 11 Selecting along node axes
# ancestor: ancestor nodes
# * selects every ancestor
# a=html.xpath('//a/ancestor::*')
# # Only the div among the ancestors
# a=html.xpath('//a/ancestor::div')
# attribute: attribute values
# a=html.xpath('//a[1]/attribute::*')
# child: direct children
# a=html.xpath('//a[1]/child::*')
# a=html.xpath('//a[1]/child::img/@src')
# descendant: all descendants
# a=html.xpath('//a[6]/descendant::*')
# following: every node after the current one
# a=html.xpath('//a[1]/following::*')
# a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: siblings after the current node
# a=html.xpath('//a[1]/following-sibling::*')
# a=html.xpath('//a[1]/following-sibling::a')
# a=html.xpath('//a[1]/following-sibling::*[2]/text()')
# Exactly one query has to stay active so that `a` is defined for the print
# below; swap in any of the commented examples above to try it instead.
a = html.xpath('//a[1]/following-sibling::*[2]/@href')
print(a)

浙公网安备 33010602011771号