day04

Contents

  • Logging in to cnblogs with selenium
  • Semi-automatic upvoting on Chouti
  • Using XPath
  • Selenium action chains
  • Auto-login to 12306
  • Using a captcha-solving platform
  • Auto-login with a captcha-solving platform
  • Scraping JD product info with selenium
  • Introduction to scrapy

Logging in to cnblogs with selenium

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

"""
r"D:\py3.8\Tools\chromedriver_win32\chromedriver.exe"
"""
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
s = Service(r"D:\py3.8\Tools\chromedriver_win32\chromedriver.exe")
bro = webdriver.Chrome(service=s, options=options)
bro.implicitly_wait(10)  # implicit wait: poll up to 10s for elements to appear
bro.get("https://www.cnblogs.com/")
bro.find_element(By.LINK_TEXT, "登录").click()
bro.find_element(By.CSS_SELECTOR,
                 "body > app-root > app-sign-in-layout > div > div > app-sign-in > app-content-container > div > div > div > div > app-external-sign-in-providers > div > button:nth-child(2) > span.mat-button-wrapper > img").click()
"""
<iframe frameborder="0" width="407" height="331" id="ptlogin_iframe" name="ptlogin_iframe" src="https://xui.ptlogin2.qq.com/cgi-bin/xlogin?appid=716027609&amp;daid=383&amp;style=33&amp;login_text=%E7%99%BB%E5%BD%95&amp;hide_title_bar=1&amp;hide_border=1&amp;target=self&amp;s_url=https%3A%2F%2Fgraph.qq.com%2Foauth2.0%2Flogin_jump&amp;pt_3rd_aid=101880508&amp;pt_feedback_link=https%3A%2F%2Fsupport.qq.com%2Fproducts%2F77942%3FcustomInfo%3D.appid101880508&amp;theme=2&amp;verify_theme="></iframe>
<span class="qrlogin_img_out" onmouseover="pt.plogin.showQrTips();" onmouseout="pt.plogin.hideQrTips();"></span>
<a class="link" hidefocus="true" id="switcher_plogin" href="javascript:void(0);" tabindex="8">密码登录</a>
"""
res1 = bro.current_window_handle  # handle of the current window
hels_list = bro.window_handles  # handles of all open windows
for i in hels_list:
    if i != res1:
        bro.switch_to.window(i)  # switch to the newly opened QQ login window
        bro.maximize_window()

bro.switch_to.frame("ptlogin_iframe")  # the QQ login form lives inside this iframe
# switch from the QR-code panel to account/password login
res = bro.find_element(By.ID, "switcher_plogin")
bro.execute_script("arguments[0].click()", res)
bro.find_element(By.ID, "u").send_keys("1548346849")
bro.find_element(By.ID, "p").send_keys("1548346849li")
bro.find_element(By.ID, "p").send_keys(Keys.ENTER)

time.sleep(3)
bro.close()
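
Once the login has gone through, it is common to persist the session cookies so that later requests-based code can reuse them; the Chouti section below relies on exactly this trick. A minimal sketch (place it before bro.close(), after the login has actually completed; the file name is just an example):

import json

# dump the logged-in session's cookies to disk for later reuse with requests
with open("cnblogs_cookies.json", "w", encoding="utf8") as f:
    json.dump(bro.get_cookies(), f)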

Semi-automatic upvoting on Chouti

  1. Use selenium to log in semi-automatically ---> grab the cookies
  2. Use the requests module, work out the upvote request URL ---> send the request yourself ---> carrying the cookies
import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

"""
r"D:\py3.8\Tools\chromedriver_win32\chromedriver.exe"
"""
# s = Service(r"D:\py3.8\Tools\chromedriver_win32\chromedriver.exe")
# bro = webdriver.Chrome(service=s)
# bro.implicitly_wait(10)
# bro.get("https://dig.chouti.com/")
#
# res = bro.find_element(By.LINK_TEXT, "登录")
# bro.execute_script("arguments[0].click()", res)
#
# bro.find_element(By.NAME, "phone").send_keys("")
# bro.find_element(By.NAME, "password").send_keys("")
#
# bro.find_element(By.XPATH, "/html/body/div[4]/div/div[4]/div[4]/button").click()
# input(":")
# print("继续了哦")
# bro.refresh()
# with open("cookies.json", "w", encoding="utf8") as f:
#     json.dump(bro.get_cookies(), f)
#
#
# bro.close()

import requests

# set a User-Agent header; without it the site returns no data (an anti-scraping measure)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
# the endpoint that returns the article list
url = 'https://dig.chouti.com/top/24hr?_=1679311149916'
# parse it as JSON: {code:200, data:[{id:xx, name:xx}]}
res = requests.get(url=url, headers=headers).json()

# the upvote endpoint
# payload format for an upvote
payload = {'linkId': '38116195'}
# cookies: we already grabbed them with selenium, but they come back as a list of dicts, which requests can't use directly
# so we take each cookie's name and value from the file and join them into a {name: value} mapping
with open("cookies.json", "r", encoding="utf8") as f:
    data = json.load(f)

    # 获取id 发送请求
cookies = {i["name"]: i["value"] for i in data}

request_cookies = {}
for cookie in data:
    request_cookies[cookie['name']] = cookie['value']
# print(type(cookies))
# for i in res.get('data'):
# print(i.get("id"))

# res1 = requests.post(url='https://dig.chouti.com/link/vote', headers=headers, data={"linkId": "38116629"},
#                      cookies=cookies)
# res1 = requests.post(url='https://dig.chouti.com/link/vote', headers=headers,cookies=cookies)
# print(res1.text)
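
Putting the two halves together: a minimal sketch that walks the articles returned by the top/24hr endpoint and sends an upvote for each one, carrying the cookies saved by selenium. The vote URL and the linkId payload field are taken from the commented-out requests above; the exact shape of the response is an assumption.

# sketch: upvote every article from the 24-hour top list, using the saved cookies
for article in res.get('data', []):
    payload = {'linkId': article.get('id')}  # the article id doubles as linkId
    res1 = requests.post(url='https://dig.chouti.com/link/vote', headers=headers,
                         data=payload, cookies=cookies)
    print(res1.text)  # inspect the response to confirm the vote landed
    time.sleep(1)     # throttle: one request per second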

Using XPath

Every parser has its own lookup methods

  • bs4: find and find_all
  • selenium: find_element and find_elements
  • lxml is also a parser; it supports xpath and css

Besides their own methods, these parsers basically all support the two universal selector languages: css and xpath

What is XPath

XPath (XML Path Language) is a language for locating parts of an XML document.

/       select from the current path
/div    find div directly under the current node
//      recursive search, through all descendants
//div   recursively find every div
@       take an attribute
.       the current node
..      the parent node (one level up)
# every parser has its own lookup methods
doc = '''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg'/></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
from lxml import etree

# html = etree.HTML(doc)
# html=etree.parse('search.html',etree.HTMLParser())
# 1 all nodes
# a=html.xpath('//*')
# 2 a specific node (the result is a list)
# a=html.xpath('//head')
# 3 children and descendants
# a=html.xpath('//div/a')
# a=html.xpath('//body/a') # no data: a is not a direct child of body
# a = html.xpath('//body//a')
# 4 parent node
# a=html.xpath('/html/body')
# a=html.xpath('//body//a[@href="image1.html"]/..')   # filter by attribute a[@href="image1.html"]; .. means go up one level
# a=html.xpath('//body//a[1]/..')   # indexing starts at 1

# can also be written with the parent axis
# a=html.xpath('//body//a[1]/parent::*')
# a=html.xpath('//body//a[1]/parent::p')
# a=html.xpath('//body//a[1]/parent::div')
# 5 attribute matching
# a=html.xpath('//body//a[@href="image1.html"]')

# 6 text extraction with /text()
# a=html.xpath('//body//a[@href="image1.html"]/text()')

# 7 attribute extraction
# a=html.xpath('//body//a/@href')
# # note: positions start at 1, not 0
# a=html.xpath('//body//a[1]/@href')
# 8 matching one value of a multi-valued attribute
#  when the a tag has several classes, an exact match no longer works; use contains
# a=html.xpath('//body//a[@class="li"]')  # this matches nothing
# a=html.xpath('//body//a[contains(@class,"li")]')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 matching on multiple attributes
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')

# 10 selecting by position
# a=html.xpath('//a[2]/text()')
# a=html.xpath('//a[2]/@href')
# the last one
# a=html.xpath('//a[last()]/@href')
# positions less than 3
# a=html.xpath('//a[position()<3]/@href')
# second to last
# a=html.xpath('//a[last()-1]/@href')


# 11 axis selection
# ancestor: ancestor nodes
# * selects every ancestor
# a=html.xpath('//a/ancestor::*')
# # only the div ancestors
# a=html.xpath('//a/ancestor::div')
# attribute: attribute values
# a=html.xpath('//a[1]/attribute::*')
# child: direct children
# a=html.xpath('//a[1]/child::*')
# descendant: all descendants
# a=html.xpath('//a[6]/descendant::*')
# following: every node after the current one
# a=html.xpath('//a[1]/following::*')
# a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: later siblings of the current node
# a=html.xpath('//a[1]/following-sibling::*')
# a=html.xpath('//a[1]/following-sibling::a')
# a=html.xpath('//a[1]/following-sibling::*[2]')
# a=html.xpath('//a[1]/following-sibling::*[2]/@href')

# print(a)




import requests

# apply XPath to a live page: grab the text of the second <h2> under #content
res = requests.get('https://www.runoob.com/xpath/xpath-syntax.html')
print(res.text)
html = etree.HTML(res.text)
a = html.xpath('//*[@id="content"]/h2[2]/text()')
print(a)

Selenium action chains

Action chains simulate a person pressing the mouse and dragging an element.

Some websites have press-and-drag interactions, for example:

  • slider captchas

Two ways to write it

  • Form 1

    actions = ActionChains(bro)  # get an action-chain object
    actions.drag_and_drop(sourse, target)  # queue the action on the chain, to be run in sequence
    actions.perform()  # execute the queued actions
    
  • Form 2

    ActionChains(bro).click_and_hold(sourse).perform()
    distance = target.location["x"] - sourse.location["x"]
    track = 0
    while track < distance:
        ActionChains(bro).move_by_offset(xoffset=2, yoffset=0).perform()
        track += 2
    

    Action chain examples

    Form 1

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.service import Service
    
    s = Service(r"D:\py3.8\Tools\chromedriver_win32\chromedriver.exe")
    bro = webdriver.Chrome(service=s)
    bro.implicitly_wait(10)
    bro.get("http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable")
    bro.switch_to.frame("iframeResult")
    sourse = bro.find_element(By.ID, "draggable")
    res = bro.find_element(By.ID, "droppable")
    res1 = ActionChains(bro)
    res1.drag_and_drop(sourse, res)
    res1.perform()
    input()
    
    bro.close()
    

    Form 2

    import time
    
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.service import Service
    
    s = Service(r"D:\py3.8\Tools\chromedriver_win32\chromedriver.exe")
    bro = webdriver.Chrome(service=s)
    bro.implicitly_wait(10)
    bro.get("http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable")
    bro.switch_to.frame("iframeResult")
    sourse = bro.find_element(By.ID, "draggable")
    res = bro.find_element(By.ID, "droppable")
    res1 = ActionChains(bro).click_and_hold(sourse).perform()  # at this point we have only pressed and held sourse
    res_cont = res.location["x"] - sourse.location["x"]  # distance between target and source
    track = 0
    while track < res_cont:
        ActionChains(bro).move_by_offset(xoffset=10, yoffset=0).perform()  # while holding, move along the x axis
        track += 10
    ActionChains(bro).release().perform()  # release the mouse to complete the drop
    input(":")
    
    bro.close()
    

Auto-login to 12306

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")  # anti-detection: keeps the site from flagging the browser as selenium-controlled

s = Service(r"D:\py3.8\Tools\chromedriver_win32\chromedriver.exe")
bro = webdriver.Chrome(service=s, options=options)
bro.maximize_window()
bro.implicitly_wait(10)
bro.get("https://kyfw.12306.cn/otn/resources/login.html")
bro.find_element(By.ID, "J-userName").send_keys("")
bro.find_element(By.ID, "J-password").send_keys("")
bro.find_element(By.LINK_TEXT, "立即登录").click()

sourse = bro.find_element(By.XPATH, '/html/body/div[1]/div[4]/div[2]/div[2]/div/div/div[2]/div/div[1]/span')
res = ActionChains(bro).click_and_hold(sourse).perform()
track = 0
while track < 300:
    ActionChains(bro).move_by_offset(xoffset=10, yoffset=0).perform()
    track += 10
ActionChains(bro).release().perform()  # release the mouse once the slider has been dragged across

time.sleep(10)

bro.close()
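
The loop above drags the slider a fixed 300px in identical 10px steps, which risk-control systems can flag as robotic. A sketch of a drop-in replacement for the slide portion (from click_and_hold onward): the 300px total is still the hard-coded guess used above, and the random step sizes and pauses are only there to make the motion look less uniform.

import random

ActionChains(bro).click_and_hold(sourse).perform()
moved = 0
while moved < 300:
    step = random.randint(5, 15)  # uneven step size
    ActionChains(bro).move_by_offset(xoffset=step, yoffset=0).perform()
    moved += step
    time.sleep(random.uniform(0.01, 0.05))  # brief, irregular pauses
ActionChains(bro).release().perform()  # drop the slider at the end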

Using a captcha-solving platform

When you log in to a website there is often a verification code. You can hand it to a third-party captcha-solving platform to crack; it just costs money.

Free options: for pure-digit or pure-letter captchas, Python has free modules, but their failure rate is high.

云打码 (Yundama), 超级鹰 (Chaojiying)


# 云打码:https://zhuce.jfbym.com/price/


# pricing: how much it costs depends on which kind of captcha you want cracked
	http://www.chaojiying.com/price.html
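
These platforms are all driven over HTTP: you upload the captcha image (usually base64-encoded) along with your account credentials and a captcha-type code, and get the recognized text back. The sketch below shows that round trip only in outline; the endpoint URL, field names and response shape are placeholders, not a real API, so swap in the platform's documented interface (Chaojiying, for example, ships its own Python demo client).

import base64
import requests

def solve_captcha(image_path, username, password, captcha_type):
    """Hypothetical captcha-platform call: upload an image, get the text back."""
    with open(image_path, "rb") as f:
        img_b64 = base64.b64encode(f.read()).decode()
    resp = requests.post(
        "https://api.example-captcha.com/solve",  # placeholder endpoint
        data={
            "user": username,
            "pass": password,
            "type": captcha_type,  # platforms price by captcha type
            "image": img_b64,
        },
    )
    return resp.json().get("result")  # assumed response shape

# usage: text = solve_captcha("code.png", "user", "pass", 1902)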

Cropping out the captcha

Take a screenshot with selenium, then crop the captcha image out with Pillow.

import time

from selenium import webdriver
from PIL import Image
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

from selenium.webdriver.chrome.options import Options

s = Service(r"D:\py3.8\Tools\chromedriver_win32\chromedriver.exe")
bor = webdriver.Chrome(service=s)
bor.get("http://www.chaojiying.com/apiuser/login/")
bor.implicitly_wait(10)
bor.maximize_window()
bor.save_screenshot("main.png")  # screenshot of the full page
res = bor.find_element(By.XPATH, "/html/body/div[3]/div/div[3]/div[1]/form/div/img")  # the captcha <img>
res_locat = res.location  # top-left corner of the element
size = res.size  # width and height of the element
print(res_locat)
print(size)
# crop box (left, top, right, bottom) in page coordinates
img_tu = (int(res_locat["x"]),
          int(res_locat["y"]),
          int(res_locat["x"])+int(size["width"]),
          int(res_locat["y"])+int(size["height"]))
print(img_tu)
img = Image.open("./main.png")
fram = img.crop(img_tu)
fram.save("code.png")
time.sleep(2)
bor.close()

Note: watch out for your computer's display scaling.


If it is not 100%, you have to compensate for the scaling yourself, otherwise the crop box misses the captcha: the screenshot was captured at 125% while Pillow crops using 100% coordinates.
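
A minimal sketch of compensating for that: multiply the element's location and size by the display scale factor before cropping. The 1.25 below assumes a 125% display setting; read your own value from the system display settings.

scale = 1.25  # assumed display scaling (125%)

img_tu = (int(res_locat["x"] * scale),
          int(res_locat["y"] * scale),
          int((res_locat["x"] + size["width"]) * scale),
          int((res_locat["y"] + size["height"]) * scale))
img = Image.open("./main.png")
img.crop(img_tu).save("code.png")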

Scraping JD products

"""
    商品图片地址:%s
    商品地址:%s
    商品名字:%s
    商品价格:%s
    商品评论数:%s

"""
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options


def get_goods(bro):
    bro.execute_script('scrollTo(0,5000)')  # scroll down so the lazy-loaded items render
    li_list = bro.find_elements(By.CLASS_NAME, 'gl-item')
    for li in li_list:
        try:
            # XPaths relative to each li; the original absolute //*[@id="J_goodsList"]/ul/li[1]/... paths
            # would return the first item on every iteration
            img_url = li.find_element(By.XPATH, './div/div[1]/a/img').get_attribute("src")
            res = li.find_element(By.XPATH, './div/div[1]/a')
            goods_url = res.get_attribute("href")
            goods_name = li.find_element(By.XPATH, './div/div[3]/a/em').text
            goods_price = li.find_element(By.XPATH, './div/div[2]/strong/i').text
            goods_commit = li.find_element(By.CSS_SELECTOR, ".p-commit a").text
        except Exception as e:
            continue

        print("""
            商品图片地址:%s
            商品地址:%s
            商品名字:%s
            商品价格:%s
            商品评论数:%s
    
        """ % (img_url, goods_url, goods_name, goods_price, goods_commit))
    bro.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]').click()
    get_goods(bro)


try:
    s = Service(r"D:\py3.8\Tools\chromedriver_win32\chromedriver.exe")
    bro = webdriver.Chrome(service=s)
    bro.implicitly_wait(10)
    bro.get("https://www.jd.com/")
    int_put = bro.find_element(By.ID, "key")
    int_put.send_keys("茅台")
    int_put.send_keys(Keys.ENTER)
    get_goods(bro)
except Exception as e:
    print(e)
finally:
    bro.close()
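
get_goods above recurses onto the next page forever and only stops when clicking the next-page link finally raises, which is also what ends the whole run. A sketch of a page-limited variant, using the same locators; max_pages is an illustrative parameter, not something the original code has:

def get_goods_limited(bro, max_pages=3):
    # same scraping loop as get_goods, but stop after max_pages pages
    for _ in range(max_pages):
        bro.execute_script('scrollTo(0,5000)')
        for li in bro.find_elements(By.CLASS_NAME, 'gl-item'):
            try:
                name = li.find_element(By.XPATH, './div/div[3]/a/em').text
                price = li.find_element(By.XPATH, './div/div[2]/strong/i').text
            except Exception:
                continue
            print(name, price)
        bro.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[1]/a[9]').click()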

Introduction to scrapy

requests, bs4 and selenium are modules.

Frameworks: django, scrapy. scrapy is a framework built specifically for crawling, the django of the crawler world: big and complete, everything a crawler needs comes built in.

Installation (on Windows it depends on your luck; on Linux and Mac there are no problems at all)
-pip3.8 install  scrapy

-If it won't install, it is almost always because twisted won't install; install it separately:
    1. pip3 install wheel  # after this you can install packages from .whl files; wheel files: https://www.lfd.uci.edu/~gohlke/pythonlibs
    2. pip3 install lxml
    3. pip3 install pyopenssl
    4. Download and install pywin32: https://sourceforge.net/projects/pywin32/files/pywin32/
    5. Download the twisted wheel file: http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    6. pip3 install <download dir>\Twisted-17.9.0-cp36-cp36m-win_amd64.whl
    7. pip3 install scrapy


Architecture

Spiders: the crawlers you define yourself (there can be many); they declare which URLs to crawl and how to parse the responses.

Engine: controls the flow of data through the whole framework; the overall coordinator.

Scheduler: the request objects waiting to be crawled queue up here.

Downloader middleware (DownloaderMiddleware): processes request objects on the way out and response objects on the way back.

Downloader: does the actual downloading; very efficient, built on twisted's high-concurrency model.

Spider middleware (SpiderMiddleware): sits between the engine and the spiders (rarely used).

Pipelines: responsible for storing the scraped data.

Creating a scrapy project

scrapy startproject firstscrapy  # create the project
scrapy genspider <name> <url>    # create a spider; the equivalent of creating an app

# open the project in PyCharm

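What genspider generates is roughly a class like the one below: a minimal spider sketch assuming cnblogs.com as the target; the post-item-title CSS selector is an assumption about that page's markup, not something scrapy provides.

import scrapy

class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"  # the name used with: scrapy crawl cnblogs
    allowed_domains = ["cnblogs.com"]
    start_urls = ["https://www.cnblogs.com/"]

    def parse(self, response):
        # response supports css()/xpath() selectors, same syntax as in the XPath section above
        for title in response.css('a.post-item-title::text').getall():
            yield {"title": title}  # yielded items flow into the pipelines

Run it from inside the project directory with: scrapy crawl cnblogs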
