39、反爬之验证码
图片验证码
1. 介绍
验证码(CAPTCHA)是“Completely Automated Public Turing test to tell Computers and Humans Apart”(全自动区分计算机和人类的图灵测试)的缩写,是一种区分用户是计算机还是人的公共全自动程序。
防止恶意破解密码、刷票、论坛灌水、刷页。有效防止某个黑客对某一个特定注册用户用特定程序暴力破解方式进行不断的登录尝试,实际上使用验证码是现在很多网站通行的方式
2. 使用场景
- 注册
- 登录
- 频繁发送请求时,服务器弹出验证码进行验证
3. 处理方案
-
手动输入(input)
这种方法仅限于登录一次就可持续使用的情况 -
图像识别引擎
-
Tesseract
- Tesseract,一款由HP实验室开发由Google维护的开源OCR引擎,特点是开源,免费,支持多语言,多平台。
-
TensorFlow
- TensorFlow是一个开源软件库,用于各种感知和语言理解任务的机器学习,目前被50个团队用于研究和生产许多Google商业产品,如语音识别、Gmail、Google 相册和搜索
-
PyTorch
-
-
打码平台
爬虫常用的验证码解决方案
4. 实战操作
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5
class Chaojiying_Client(object):
def __init__(self, username, password, soft_id):
self.username = username
password = password.encode('utf8')
self.password = md5(password).hexdigest()
self.soft_id = soft_id
self.base_params = {
'user': self.username,
'pass2': self.password,
'softid': self.soft_id,
}
self.headers = {
'Connection': 'Keep-Alive',
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
}
def PostPic(self, im, codetype):
"""
im: 图片字节
codetype: 题目类型 参考 http://www.chaojiying.com/price.html
"""
params = {
'codetype': codetype,
}
params.update(self.base_params)
files = {'userfile': ('ccc.jpg', im)}
r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
return r.json()
def ReportError(self, im_id):
"""
im_id:报错题目的图片ID
"""
params = {
'id': im_id,
}
params.update(self.base_params)
r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
return r.json()
if __name__ == '__main__':
chaojiying = Chaojiying_Client('超级鹰用户名', '超级鹰用户名的密码', '96001') #用户中心>>软件ID 生成一个替换 96001
im = open('a.jpg', 'rb').read() #本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
print chaojiying.PostPic(im, 1902) #1902 验证码类型 官方网站>>价格体系 3.4+版 print 后要加()
点选验证码
滑动验证码
# 保存图片
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from urllib.request import urlretrieve
import requests
from fake_useragent import UserAgent
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
def test1():
url = 'https://www.sf-express.com/cn/sc/dynamic_function/waybill/#search/bill-number/SF1045585271138'
browser = webdriver.Chrome(executable_path='./chromedriver')
wait = WebDriverWait(browser,30)
browser.get(url)
try:
wait.until(EC.presence_of_element_located((By.ID,'tcaptcha_popup')))
browser.switch_to.frame('tcaptcha_popup')
img = browser.find_element_by_id('slideBkg')
img_src = img.get_attribute('src')[:-1]
print(img_src)
save_img(f'{img_src}1','cpt1.png')
save_img(f'{img_src}2','cpt2.png')
browser.close()
except Exception as e:
browser.close()
def save_img(img,file_name):
resp = requests.get(f'{img}',headers =headers)
with open(f'./imgs/{file_name}','wb') as f:
f.write(resp.content)
if __name__ == '__main__':
test1()
# 获取长度
import cv2
#b pip install streamlit
import streamlit as st
import numpy as np
def get_long():
# x = st.slider('Select a value',max_value=500)
# y = st.slider('Select a value2',max_value=1000)
# 最小阈值100,最大阈值500
img1 = cv2.imread('./imgs/cpt1.png', 0)
canny1 = cv2.Canny(img1, 300, 800)
cv2.imwrite('./imgs/tcpt1.png',canny1)
# st.button('save',on_click=lambda : cv2.imwrite('./imgs/tcpt1.png',canny1))
'''
1. Canny 使用了滞后阈值,滞后阈值需要两个阈值(高阈值和低阈值)。
2. 如果某一像素位置的幅值超过 高 阈值, 该像素被保留为边缘像素。
3.如果某一像素位置的幅值小于 低 阈值, 该像素被排除。
4.如果某一像素位置的幅值在两个阈值之间,该像素仅仅在连接到一个高于 高 阈值的像素时被保留。
'''
# st.image(canny1)
img2 = cv2.imread('./imgs/cpt2.png', 0)
canny2 = cv2.Canny(img2, 300, 800)
cv2.imwrite('./imgs/tcpt2.png',canny2)
small = cv2.imread('./imgs/tcpt2.png')
big = cv2.imread('./imgs/tcpt1.png')
# 调整显示大小
target_temp = cv2.resize(big, (340, 195))
target_temp = cv2.copyMakeBorder(target_temp, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=[255, 255, 255])
template_temp = cv2.resize(small, (68, 68))
template_temp = cv2.copyMakeBorder(template_temp, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=[255, 255, 255])
st.image(template_temp)
st.image(target_temp)
imgs = []
imgs.append(target_temp)
imgs.append(template_temp)
theight, twidth = small.shape[:2]
# 匹配
result = cv2.matchTemplate(big,small,cv2.TM_CCOEFF_NORMED)
st.write(result)
st.write('-------------------------------------')
st.write(small)
# 归一化
# cv2.normalize( result ,result, 0, 1, cv2.NORM_MINMAX, -1 )
# st.write(result)
min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
st.write(min_val,max_val,min_loc,max_loc)
return max_loc[0]/big.shape[:2][1]*340-12*2
# 匹配后结果画圈
tmp = cv2.rectangle(big,max_loc,(max_loc[0]+twidth,max_loc[1]+theight),(0,0,255),2)
st.image(tmp)
target_temp_n = cv2.resize(big, (340, 195))
target_temp_n = cv2.copyMakeBorder(target_temp_n, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=[0, 0, 255])
st.image(target_temp_n)
# imgs.append(target_temp_n)
# import numpy as np
# imstack = np.hstack(imgs)
# cv2.namedWindow('stack'+str(max_loc))
# cv2.imshow('stack'+str(max_loc), imstack)
# cv2.waitKey(0)
# cv2.destroyAllWindows()
if __name__ == '__main__':
get_long()
# 编辑移到速度
def get_track(distance):
'''
根据偏移量获取移动轨迹
:param distance:偏移量
return 移动轨迹
'''
# 移动轨迹
track = []
# 当前位移
current = 0
# 减速阀值
mid = distance*4/5
# 计算间隔
t = 0.2
# 初始速度
v = 0
while current < distance:
if current <mid:
# 加速度为正2
a = 2
else:
# 加速度为负3
a = -3
# 初始速度
v0 = v
# 当前速度v = v0 + at
v = v0 + a * t
# 移动距离 x = v0t + 1/2 *a *t^2
move = v0 + 1/2 * a * t * t
# 当前位移
current += move
# 加入轨迹
track.append(round(move))
return track
if __name__ == '__main__':
print(get_track(246))
# 移动
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
from track import get_track
from get_long import get_long
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
def test1():
url = 'https://www.sf-express.com/cn/sc/dynamic_function/waybill/#search/bill-number/SF1045585271138'
browser = webdriver.Chrome(executable_path='./chromedriver')
wait = WebDriverWait(browser,30)
browser.get(url)
try:
wait.until(EC.presence_of_element_located((By.ID,'tcaptcha_popup')))
browser.switch_to.frame('tcaptcha_popup')
img = browser.find_element_by_id('slideBkg')
img_src = img.get_attribute('src')[:-1]
print(img_src)
save_img(f'{img_src}1','cpt1.png')
save_img(f'{img_src}2','cpt2.png')
num = get_long()
ta = get_track(num)
# wait.until(EC.presence_of_element_located((By.ID,'tcaptcha_popup')))
block = browser.find_element_by_id('tcaptcha_drag_button')
# 鼠标点按钮
webdriver.ActionChains(browser).click_and_hold(block).perform()
# 拖动
for t in ta:
webdriver.ActionChains(browser).move_by_offset(xoffset=t, yoffset=0).perform()
# 释放鼠标
webdriver.ActionChains(browser).release().perform()
browser.close()
except Exception as e:
browser.close()
def save_img(img,file_name):
resp = requests.get(f'{img}',headers =headers)
with open(f'./imgs/{file_name}','wb') as f:
f.write(resp.content)
if __name__ == '__main__':
test1()