python爬虫---selenium基本使用，无界面浏览器，selenium其它用法，selenium登录cnblogs获取cookie，抽屉半自动点赞，爬虫案例

selenium基本使用

由于requests不能执行js，有的页面内容，我们在浏览器中可以看到，但是请求下来没有。selenium模块：模拟操作浏览器，完成人的行为。

selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题。

selenium本质是通过驱动浏览器，完全模拟浏览器的操作，比如跳转、输入、点击、下拉等，来拿到网页渲染之后的结果，可支持多种浏览器

安装

pip install selenium

使用

-由于是驱动浏览器：需要确定好驱动哪个浏览器(ie,火狐，谷歌(推荐))

-下载相应的驱动

下载地址：https://registry.npmmirror.com/binary.html?path=chromedriver/

我这里下载的是chromedriver.exe

基本使用

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Download the chromedriver build that matches the installed Chrome version
# and place it in the project directory.
# Step 1: start Chrome (the same as double-clicking the browser icon).
# Selenium 4 receives the driver location through a Service object; the old
# `executable_path` keyword is deprecated and rejected by newer 4.x releases.
bro = webdriver.Chrome(service=Service('./chromedriver.exe'))
# bro = webdriver.Chrome()  # no path given: the driver must be in the project directory or on PATH
# Step 2: navigate to the address, as if it were typed into the address bar.
bro.get('http://www.baidu.com')
time.sleep(2)  # simulate a time-consuming operation
# Step 3: close the browser.
bro.close()  # closes the current tab
# bro.quit()  # closes the whole browser

模拟登录百度

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 receives the driver location through a Service object; the old
# `executable_path` keyword is deprecated and rejected by newer 4.x releases.
bro = webdriver.Chrome(service=Service('./chromedriver.exe'))
# Implicit wait: every element lookup keeps retrying for up to 10 seconds
# before it gives up.
bro.implicitly_wait(10)
bro.get('https://www.baidu.com/')
# 1. Locate the login entry by its element id.
btn = bro.find_element(by=By.ID, value='s-top-loginbtn')
# 2. Alternative: locate an <a> tag by its link text.
# btn = bro.find_element(by=By.LINK_TEXT, value='登录')
# Click the button to open the login dialog.
btn.click()
# User name and password input boxes.
username = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__userName')
password = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__password')
# Type the credentials.
# NOTE(review): the account and password are hard-coded here; consider
# reading them from the environment instead of committing them.
username.send_keys('616564199@qq.com')
time.sleep(1)
password.send_keys('lqz123')
# After filling in the form, click the submit button.
btn_login = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__submit')
btn_login.click()
time.sleep(8)
bro.close()  # closes the current tab

无界面浏览器

不显示浏览器的图形化界面，仍然可以正常打开页面并获取数据。

import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Launch flags for a Chrome session that runs without a visible window.
# They are added in the order listed.
opts = Options()
for flag in (
    'window-size=1920x3000',              # browser resolution
    '--disable-gpu',                      # the original author cites Google's docs: works around a bug
    '--hide-scrollbars',                  # hide scrollbars, for some special pages
    'blink-settings=imagesEnabled=false', # skip image loading to speed things up
    '--headless',                         # no visible UI; per the original note, needed on Linux without a display
):
    opts.add_argument(flag)

driver = webdriver.Chrome(options=opts)
driver.get('https://www.cnblogs.com/')
# page_source holds the HTML of the current page.
html = driver.page_source
print(html)
time.sleep(2)
driver.close()

selenium其它用法

3.1 获取位置属性大小，文本

import time
import base64

from selenium import webdriver
from selenium.webdriver.common.by import By
# Open the 12306 login page, switch to QR-code login and inspect the QR image.
browser = webdriver.Chrome()
browser.get('https://kyfw.12306.cn/otn/resources/login.html')
browser.implicitly_wait(10)
browser.find_element(By.LINK_TEXT, '扫码登录').click()
time.sleep(1)

qr_img = browser.find_element(By.ID, 'J-qrImg')
print(qr_img.location)  # position on the page, e.g. {'x': 836, 'y': 254}
print(qr_img.size)      # rendered size, e.g. {'height': 158, 'width': 158}
print(qr_img.id)        # Selenium's internal element id, not the HTML id attribute
print(qr_img.tag_name)  # the tag name of the element

# Everything after the last comma of the src attribute is base64-decoded
# and written out as the image file.
src = qr_img.get_attribute('src')
with open('code.png', 'wb') as out:
    out.write(base64.b64decode(src.rpartition(',')[2]))
browser.close()

3.2 等待元素被加载

# 程序操作页面非常快，所以在取每个标签的时候，标签可能还没有加载好，需要设置等待
	-显式等待：不需要了解
	-隐式等待：bro.implicitly_wait(10)

3.3 元素操作

搜索标签

find_element：找第一个
find_elements：找所有

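# 注意：Selenium 4 中 find_element_by_* 系列方法已弃用，4.3 起被移除，请改用 find_element(By.XXX, 值) 的写法
# 1、find_element_by_id   # 根据id，对应 find_element(By.ID, 值)
# 2、find_element_by_link_text # 根据a标签的文字，对应 find_element(By.LINK_TEXT, 值)
# 3、find_element_by_partial_link_text # 根据a标签的文字模糊匹配，对应 find_element(By.PARTIAL_LINK_TEXT, 值)
# 4、find_element_by_tag_name    # 根据标签名，对应 find_element(By.TAG_NAME, 值)
# 5、find_element_by_class_name  # 根据类名，对应 find_element(By.CLASS_NAME, 值)
# 6、find_element_by_name        # 根据name属性，对应 find_element(By.NAME, 值)
# 7、find_element_by_css_selector  # css选择器，对应 find_element(By.CSS_SELECTOR, 值)
# 8、find_element_by_xpath         # xpath，对应 find_element(By.XPATH, 值)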

点击

标签.click()

写入文字

标签.send_keys()

清空

标签.clear()

滑动屏幕到最底部（滑动加载）

bro.execute_script('scrollTo(0,document.body.scrollHeight)')

3.4 执行js代码

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
browser = webdriver.Chrome()
browser.get('https://www.pearvideo.com/category_8')
time.sleep(1)

# JavaScript is passed in as plain strings and executed inside the page.
for script in (
    'scrollTo(0,document.body.scrollHeight)',  # scroll to the bottom of the page
    "alert('asdasd')",                         # pop up an alert dialog
):
    browser.execute_script(script)

time.sleep(5)
browser.close()

# 执行js用途：1 普通滑屏，打开新标签   2 可以执行js代码，别人网站的变量，函数，都可以拿到并执行的

3.5 切换选项卡

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.pearvideo.com/category_8')

# Open an empty new tab through JavaScript.
browser.execute_script('window.open()')
# Move the driver's focus to the second window handle and load a page in it.
new_tab = browser.window_handles[1]
browser.switch_to.window(new_tab)
browser.get('http://www.baidu.com')
time.sleep(2)

# Switch focus back to the first window handle.
first_tab = browser.window_handles[0]
browser.switch_to.window(first_tab)
time.sleep(2)
browser.quit()

3.6 浏览器前进后退

import time
from selenium import webdriver

browser = webdriver.Chrome()

# Visit two pages in a row so that there is history to move through.
browser.get('https://www.pearvideo.com/category_8')
time.sleep(2)
browser.get('https://www.baidu.com')

# Step back in the history first, then forward again, pausing after each move.
for navigate in (browser.back, browser.forward):
    navigate()
    time.sleep(2)
browser.quit()

3.7 异常处理

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
# Demonstrates try/except/finally around browser automation.
# The driver is created before the try block, so the finally clause only runs once `bro` exists.
bro=webdriver.Chrome()

try:
    bro.get('https://www.pearvideo.com/category_8')
    time.sleep(1)
    bro.get('https://www.baidu.com')

    bro.back() # go back one page
    # Raised on purpose to demonstrate the handling below; the two statements after it never run.
    raise Exception('报错了')
    time.sleep(1)
    bro.forward()
except Exception as e:
    # Print the error message instead of letting the script crash.
    print(e)
finally:
    # Runs whether or not an exception occurred, so the browser is always shut down.
    bro.quit()

抽屉半自动点赞

import json

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Load the cookies that an earlier Selenium session saved to disk
# (a list of dicts, each with at least "name" and "value").
with open('cookies.json', 'r', encoding='utf-8') as f:
    saved_cookies = json.load(f)
# requests expects a {cookie_name: cookie_value} mapping. The previous code
# joined everything into one string and sent it as a single cookie named
# "cookies_are", so the site never received the real session cookies.
cookies = {item["name"]: item["value"] for item in saved_cookies}
header = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
    "Referer": "https://dig.chouti.com/",
}

chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')  # browser resolution
chrome_options.add_argument('--disable-gpu')  # the original author cites Google's docs: works around a bug
chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip image loading to speed things up
chrome_options.add_argument('--headless')  # no visible UI; per the original note, needed on Linux without a display

# Render the front page in headless Chrome so that script-generated markup is
# present, and always shut the browser down once the HTML has been captured.
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get('https://dig.chouti.com/')
    page_html = driver.page_source
finally:
    driver.quit()

soup = BeautifulSoup(page_html, 'html.parser')
vote_list = soup.find_all(class_="link-title link-statistics")
# Up-vote every link found on the page using the saved login cookies.
for item in vote_list:
    link_id = item.attrs['data-id']  # renamed from `id` so the builtin is not shadowed
    requests.post(
        'https://dig.chouti.com/link/vote',
        headers=header,
        cookies=cookies,
        data={"linkId": link_id},
    )

selenium登录cnblogs获取cookie

登录获取cookies到本地

# 提前登录获取cookies到本地
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By

bro = webdriver.Chrome()
bro.implicitly_wait(10)

# Log in to cnblogs through the browser and dump the session cookies to disk.
try:
    bro.get('https://www.cnblogs.com/')
    # Find the login link by its link text.
    btn_login = bro.find_element(By.LINK_TEXT, '登录')
    # Open the login page.
    btn_login.click()
    time.sleep(5)
    # Locate the user name and password inputs by id.
    username = bro.find_element(By.ID, 'mat-input-0')
    password = bro.find_element(By.ID, 'mat-input-1')
    time.sleep(2)
    # Type the credentials.
    # NOTE(review): the account and password are hard-coded here; consider
    # reading them from the environment instead of committing them.
    username.send_keys('早安_1207')
    password.send_keys('kuci519332+++')
    # Locate the submit button on the login page.
    btn = bro.find_element(By.CSS_SELECTOR, 'body > app-root > app-sign-in-layout > div > div > app-sign-in > app-content-container > div > div > div > form > div > button > span.mat-button-wrapper')
    # Submit the login form.
    btn.click()
    # Wait for the login round trip to finish BEFORE reading the cookies;
    # reading them right after the click can capture the pre-login cookies.
    # Lengthen this pause if the site asks for manual verification.
    time.sleep(2)
    cookies = bro.get_cookies()
    print(cookies)
    # Save the cookies locally so a later session can reuse them.
    with open('cookies.json', 'w', encoding='utf-8') as f:
        json.dump(cookies, f)
except Exception as e:
    # Tutorial-style handling: report the problem and fall through to cleanup.
    print(e)
finally:
    bro.quit()

打开页面，把本地的cookies写入到浏览器

import json
import time

from selenium import webdriver

# This snippet is meant to run on its own after cookies.json has been saved,
# so it imports what it needs and starts its own browser session (a driver
# that has already been quit cannot be reused).
bro = webdriver.Chrome()
try:
    # The target site must be open before cookies can be added to the browser.
    bro.get('https://www.cnblogs.com/')
    time.sleep(2)
    # Write the saved cookies into the browser.
    with open('cookies.json', 'r', encoding='utf-8') as f:
        res = json.load(f)
    for item in res:
        bro.add_cookie(item)

    # Reload the page so it is requested again with the injected cookies.
    bro.refresh()
    time.sleep(2)

except Exception as e:
    # Tutorial-style handling: report the problem and fall through to cleanup.
    print(e)

finally:
    bro.quit()

几个爬虫案例

# 2 爬红楼梦小说

#http://www.shicimingju.com/book/hongloumeng.html

# import requests
#
# from bs4 import BeautifulSoup
# ret=requests.get('https://www.shicimingju.com/book/hongloumeng.html')
# ret.encoding='utf-8'
#
# soup=BeautifulSoup(ret.text,'lxml')
# li_list=soup.find(class_='book-mulu').find('ul').find_all('li')
# with open('hlm.txt','w',encoding='utf-8') as f:
#     for li in li_list:
#         content=li.find('a').text
#         url='https://www.shicimingju.com'+li.find('a').get('href')
#         f.write(content)
#         f.write('\n')
#         res_content=requests.get(url)
#         res_content.encoding = 'utf-8'
#         res_content.encoding=res_content.apparent_encoding
#         soup2=BeautifulSoup(res_content.text,'lxml')
#         content_detail=soup2.find(class_='chapter_content').text
#         f.write(content_detail)
#         f.write('\n')
#         print(content,'写入了')


# 3 爬肯德基门店

# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
import requests

# Query the KFC store-list endpoint by keyword and print the decoded JSON reply.
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
header = dict([
    ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'),
])
# Form fields sent in the POST body.
data = dict(cname='', pid=20, keyword='浦东', pageIndex=1, pageSize=10)

ret = requests.post(url, data=data, headers=header)
stores = ret.json()
print(stores)

posted @ 2022-08-02 21:04 早安_1207 阅读(492) 评论(0) 收藏举报

刷新页面返回顶部

早安

不要让自己阻止自己过自己想过的生活

python爬虫---selenium基本使用，无界面浏览器，selenium其它用法，selenium登录cnblogs获取cookie，抽屉半自动点赞，爬虫案例

selenium基本使用

安装

使用

无界面浏览器

selenium其它用法

3.1 获取位置属性大小，文本

3.2 等待元素被加载

3.3 元素操作

搜索标签

点击

写入文字

清空

滑动屏幕到最底部（滑动加载）

3.4 执行js代码

3.5 切换选项卡

3.6 浏览器前进后退

3.7 异常处理

抽屉半自动点赞

selenium登录cnblogs获取cookie

几个爬虫案例

公告

早安

不要让自己阻止自己过自己想过的生活

python爬虫---selenium基本使用，无界面浏览器，selenium其它用法，selenium登录cnblogs获取cookie， 抽屉半自动点赞，爬虫案例

selenium基本使用

安装

使用

无界面浏览器

selenium其它用法

3.1 获取位置属性大小，文本

3.2 等待元素被加载

3.3 元素操作

搜索标签

点击

写入文字

清空

滑动屏幕到最底部（滑动加载）

3.4 执行js代码

3.5 切换选项卡

3.6 浏览器前进后退

3.7 异常处理

抽屉半自动点赞

selenium登录cnblogs获取cookie

几个爬虫案例

公告

python爬虫---selenium基本使用，无界面浏览器，selenium其它用法，selenium登录cnblogs获取cookie，抽屉半自动点赞，爬虫案例