验证码识别
为了学习验证码的识别,需要图形识别技术,于是安装了tesserocr。安装的过程中有时没把文档看完,却以为自己已经安装好了,结果没配环境变量TESSDATA_PREFIX,呵呵,白白花了好多时间去搜问题,不专心的后果可见一斑。
识别测试一个image:
- import tesserocr
- from PIL import Image# PIL python image library
- image=Image.open("code.jpg")
- result=tesserocr.image_to_text(image)
- print(result)
也可以用tesserocr.file_to_text("image.png")不过识别效果较差。
当验证码中有许多干扰线条的时候,可以用转灰度和二值化来处理图像,然后再识别。
- 转灰度:
- image=image.convert("L")#参数L选择灰度
- image.show()
- 二值化:
- image.convert("1")#参数1选择二值化
- image.show()
我们还可以指定二值化的阈值。但我们不能直接转化原图,要先转化为灰度图像,然后再指定二值化阈值。
# Binarise with an explicit threshold. The image is converted to greyscale
# first because the lookup table below is indexed by 8-bit grey level.
image = image.convert("L")
threshold = 120
# Grey levels below the threshold become black (0), all others white.
# White is written as 255 rather than 1: Pillow keeps mode "1" pixels one per
# byte as 0 or 255, so a LUT value of 1 would read back as 1 from the
# in-memory image instead of 255.
table = [0 if level < threshold else 255 for level in range(256)]
image = image.point(table, "1")
image.show()
第二节极验滑动验证码:
from selenium import webdriver
from selenium.webdriver import ActionChains  # builds a chain of user actions
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
url = "https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
browser.get(url)
# The draggable/droppable widgets are looked up after switching into the
# frame identified by 'iframeResult'.
browser.switch_to.frame('iframeResult')
# find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector,
# which was removed in Selenium 4.3.
source = browser.find_element(By.CSS_SELECTOR, "#draggable")
target = browser.find_element(By.CSS_SELECTOR, "#droppable")
actions = ActionChains(browser)
actions.drag_and_drop(source, target)
# Nothing happens until perform() replays the queued actions.
actions.perform()
下载安装selenium和chromedriver。以上代码完成了一个图片拖动的验证过程。其中看browser.switch_to.frame()方法时有点困难。查阅资料https://huilansame.github.io/huilansame.github.io/archivers/switch-to-frame
明白了这个方法是定位到页面元素,传入字符串就按id或name切换到frame中。传入数字就按0序切换到第一个frame中。
老实了解了灰度化的原理:
灰度数字图像是每个像素只有一个采样颜色的图像。这类图像通常显示为从最暗黑色到最亮的白色的灰度。
可以通过下面几种方法,将图像转换为灰度:
- 1.浮点算法:Gray=R*0.3+G*0.59+B*0.11
- 2.整数方法:Gray=(R*30+G*59+B*11)/100
- 3.移位方法:Gray =(R*76+G*151+B*28)>>8;
- 4.平均值法:Gray=(R+G+B)/3;
- 5.仅取绿色:Gray=G;
通过上述任一种方法求得Gray后,将原来的RGB(R,G,B)中的R,G,B统一用Gray替换,形成新的颜色RGB(Gray,Gray,Gray),用它替换原来的RGB(R,G,B)就是灰度图了。即是,R,G,B三色值相等成为灰度图。
学了验证码的识别,想拿自己学校的教务系统试试手。本以为学校的教务系统做的很差,但是验证码还是让我头痛了一阵,按普通的灰度化,然后二值化后,得到的图片噪点居然还是有很多,使用tesserocr还是很难识别,仔细分析了一下图片,发现验证码里的字符的RGB颜色很有规律。R=0,G=0,B=153。
从上到下三张图分别是:官网验证码、灰度二值化、针对图片特点根据R,G,B值提取。
![]()
![]()
![]()
明显第三张有很好的效果。但是tesserocr的识别率还是很低,不过好在已经有了良好的数据。下一步可以考虑用深度学习算法进行训练建模来识别这类验证码。
对图像的处理代码如下:
import tesserocr
from PIL import Image
# OCR the untouched captcha first so the result can be compared with the
# cleaned-up version printed by book_clear().
image = Image.open("check.png")
result = tesserocr.image_to_text(image)
print('之前:' + result)
threshold = 40


def book_clear(image, threshold):
    """Greyscale + binarise ``image``, save/show it and print the OCR result.

    :param image: Pillow image to clean up.
    :param threshold: grey level (0-255); darker pixels become black, the
        rest become white.
    :return: None. The binarised image is written to ``img1.jpg``.
    """
    grey = image.convert("L")
    # White is 255, not 1: Pillow keeps mode "1" pixels one per byte as
    # 0 or 255, so a LUT value of 1 would leave near-black values in memory.
    table = [0 if level < threshold else 255 for level in range(256)]
    img = grey.point(table, "1")
    img.save("img1.jpg")
    img.show()
    result = tesserocr.image_to_text(img)
    print('之后:' + result)


book_clear(image, threshold)
# Work on an RGB copy so every pixel can be compared channel by channel.
img2 = image.convert("RGB")
pix = img2.load()


def rgb_clear():
    """Keep only pixels whose colour is exactly R=0, G=0, B=153.

    Matching pixels are painted black and every other pixel white, directly
    in the module-level ``img2`` through its pixel-access object ``pix``.
    The result is saved as ``img2.jpg``, passed to tesserocr, and the
    recognised text is printed.
    """
    width, height = img2.size
    for row in range(height):
        for col in range(width):
            pixel = pix[col, row]
            is_ink = (pixel[0], pixel[1], pixel[2]) == (0, 0, 153)
            pix[col, row] = (0, 0, 0, 255) if is_ink else (255, 255, 255, 255)
    img2.save("img2.jpg")
    result = tesserocr.image_to_text(img2)
    print(result)


rgb_clear()
爬取SUSE的5000条验证码代码如下(用自动化测试软件selenium完成网页的打开,用户名,密码的填写,下一步搞定验证码就开始爆破):
from selenium import webdriver
from io import BytesIO
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from PIL import Image
# Values typed into the login form's `txtUserName` and `TextBox2` inputs by
# CrackSUSE.open().
TXTUSERNAME = "123"
TEXTBOX2 = "123"
# txtSecretCode = ""
class CrackSUSE():
    """Collect captcha images from the login page with Selenium.

    The browser opens the login page, fills in the user name and password,
    then repeatedly screenshots the page, crops out the captcha element and
    saves it to disk.
    """

    def __init__(self):
        self.url = "http://61.139.105.138/default2.aspx"
        self.browser = webdriver.Chrome()
        # Explicit wait used for every element lookup (20 s timeout).
        self.wait = WebDriverWait(self.browser, 20)
        self.txtUserName = TXTUSERNAME
        self.TextBox2 = TEXTBOX2

    def get_position(self):
        """
        Locate the captcha element on the page.

        :return: tuple ``(top, bottom, left, right)`` of page coordinates
        """
        img = self.wait.until(EC.presence_of_element_located((By.ID, 'icode')))
        # NOTE(review): presumably gives a freshly requested captcha time to
        # render before it is measured - confirm.
        time.sleep(2)
        location = img.location
        size = img.size
        top = location['y']
        bottom = location['y'] + size['height']
        left = location['x']
        right = location['x'] + size['width']
        return (top, bottom, left, right)

    def get_check_bt(self):
        """Return the captcha element once it is clickable."""
        button = self.wait.until(EC.element_to_be_clickable((By.ID, 'icode')))
        return button

    def get_geetest_image(self, name='check.png', i=0):
        """
        Crop the captcha out of a page screenshot and save it.

        :param name: file-name suffix of the saved image
        :param i: index prepended to ``name`` so each sample gets its own file
        :return: the cropped captcha image
        """
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()
        check = screenshot.crop((left, top, right, bottom))
        # Backslashes are escaped explicitly; the original "E:\check\\"
        # relied on "\c" being an invalid (and deprecated) escape sequence.
        check.save("E:\\check\\" + str(i) + name)
        # NOTE(review): clicking the captcha presumably requests a new one
        # for the next iteration - confirm against the page.
        self.get_check_bt().click()
        return check

    def get_screenshot(self):
        """
        Take a screenshot of the current page.

        :return: the screenshot as a Pillow image
        """
        screenshot = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        return screenshot

    def open(self):
        """
        Open the login page and type in the user name and password.

        :return: None
        """
        self.browser.get(self.url)
        txtUserName = self.wait.until(EC.presence_of_element_located((By.ID, 'txtUserName')))
        TextBox2 = self.wait.until(EC.presence_of_element_located((By.ID, 'TextBox2')))
        txtUserName.send_keys(self.txtUserName)
        TextBox2.send_keys(self.TextBox2)

    def crack(self):
        """Fill in the login form, then start collecting captcha samples."""
        self.open()
        self.mloop()

    def mloop(self):
        """Save 5000 captcha samples, numbered 0..4999."""
        for i in range(5000):
            self.get_geetest_image("check.png", i)
if __name__ == "__main__":
    # Script entry point: open the login page and harvest captcha samples.
    crack = CrackSUSE()
    crack.crack()
这里用了自己刚学的selenium自动化软件,打开网页,填充用户名,密码,获取验证码,获取验证码的方式是截取屏幕特定位置的图像,保存为图片。
tesserocr原有的识别度并不能在灰度化、二值化、八邻域降噪后得到可以接受的效果,所以需要自己训练自己的tesserocr。参考网上的方法,在准备数据中遇到了一个特别恼人的问题,就是在我灰度化+二值化后返回的image如果不存入磁盘而直接提供给八邻域降噪的话,居然是无效的,我想应该是在二值化save()那有问题,如果不save那么返回的image可能是没二值化的image。最后自己验证,发现居然打开的是原图,灰度化都不算。可怕,浪费时间。
批量降噪和得到较为干净的数据:
# Batch-process the downloaded images: greyscale + binarise each one and store
# it in E:\after_gray, then load that result again, denoise it and store it in
# E:\after_noise.
from PIL import Image
import tesserocr
get_path = "E:\\check\\"  # directory the source images are read from
save_path = "E:\\after_gray\\"  # output directory of the binarised images
save_ending = "E:\\after_noise\\"  # output directory of the denoised images
name = "check.png"  # file-name suffix; the loop index is prepended to it
def book_clear(image, threshold, save_info):
    """Greyscale and binarise ``image``, save the result and return it.

    :param image: Pillow image to convert.
    :param threshold: grey level (0-255); darker pixels become black, the
        rest become white.
    :param save_info: path the binarised image is written to.
    :return: the binarised mode "1" image.
    """
    grey = image.convert("L")
    # White must be 255, not 1: Pillow keeps mode "1" pixels one per byte as
    # 0 or 255. With a LUT value of 1 the returned in-memory image holds 1
    # for white, so a later "pixel > 245" test only works after the image
    # has been saved and reopened.
    table = [0 if level < threshold else 255 for level in range(256)]
    img = grey.point(table, "1")
    img.save(save_info)
    return img
def depoint(image1, saveinfo, white_level=245, neighbour_limit=6):
    """Remove isolated specks from a binarised image (8-neighbour filter).

    Every non-border pixel is examined; if more than ``neighbour_limit`` of
    its eight neighbours are brighter than ``white_level`` the pixel itself
    is set to white (255). The image is modified in place, so pixels cleaned
    earlier in the scan count as white for later ones.

    :param image1: binarised image whose pixels are single integers.
    :param saveinfo: path the denoised image is written to.
    :param white_level: a neighbour counts as white when its value is
        greater than this.
    :param neighbour_limit: a pixel is whitened when strictly more than this
        many neighbours are white.
    :return: ``image1`` after in-place denoising.
    """
    pixdata = image1.load()
    w, h = image1.size
    print(w, h)
    # Offsets of the eight surrounding pixels.
    offsets = [
        (dx, dy)
        for dx in (-1, 0, 1)
        for dy in (-1, 0, 1)
        if (dx, dy) != (0, 0)
    ]
    # The outermost rows/columns are skipped: they lack a full neighbourhood.
    for y in range(1, h - 1):
        for x in range(1, w - 1):
            count = sum(
                1 for dx, dy in offsets if pixdata[x + dx, y + dy] > white_level
            )
            if count > neighbour_limit:
                pixdata[x, y] = 255
    image1.save(saveinfo)
    return image1
def getImage(imgpath):
    """Open the image file at ``imgpath`` with Pillow and return it."""
    return Image.open(imgpath)
def mloop(number):
    """Binarise and then denoise the first ``number`` downloaded images.

    For each index the source file ``<get_path><index><name>`` is binarised
    with threshold 60 into ``save_path``; that file is then opened again and
    its denoised version is written to ``save_ending``.
    """
    for index in range(number):
        filename = str(index) + name
        gray_file = save_path + filename
        clean_file = save_ending + filename
        book_clear(getImage(get_path + filename), 60, gray_file)
        # The binarised result is read back from disk before denoising.
        depoint(Image.open(gray_file), clean_file)
if __name__ == "__main__":
    # Script entry point: process images 0..1999.
    mloop(2000)
关于CNN的学习还有一段路,所以还是采用了书本上介绍的打码平台chaojiying,用了他们的接口完成了自动化登录程序。如下代码:
from selenium import webdriver
from io import BytesIO
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from PIL import Image
import requests
from hashlib import md5
# Placeholders for the values typed into the login form's `txtUserName` and
# `TextBox2` inputs by CrackSUSE.open(); replace them before running.
TXTUSERNAME = "自己的账号"
TEXTBOX2 = "自己的密码!"
# txtSecretCode = ""
class CrackSUSE():
    """Log in through Selenium, solving the captcha with a coding service.

    The captcha is cropped out of a page screenshot, sent to the recogniser
    object passed to :meth:`crack`, and the returned text is typed into the
    captcha input.
    """

    def __init__(self):
        self.url = "http://61.139.105.138/default2.aspx"
        self.browser = webdriver.Chrome()
        # Explicit wait used for every element lookup (20 s timeout).
        self.wait = WebDriverWait(self.browser, 20)
        self.txtUserName = TXTUSERNAME
        self.TextBox2 = TEXTBOX2

    def get_position(self):
        """Return the captcha element's ``(top, bottom, left, right)`` box."""
        img = self.wait.until(EC.presence_of_element_located((By.ID, 'icode')))
        location = img.location
        size = img.size
        top = location['y']
        bottom = location['y'] + size['height']
        left = location['x']
        right = location['x'] + size['width']
        return (top, bottom, left, right)

    def get_check_bt(self):
        """Return the element with id ``Button1`` once it is clickable."""
        button = self.wait.until(EC.element_to_be_clickable((By.ID, 'Button1')))
        return button

    def get_geetest_image(self, name='check.png'):
        """Crop the captcha out of a page screenshot and return it.

        ``name`` is accepted for compatibility; the image is not saved here.
        """
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()
        check = screenshot.crop((left, top, right, bottom))
        return check

    def get_screenshot(self):
        """Return a screenshot of the current page as a Pillow image."""
        screenshot = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        return screenshot

    def open(self):
        """Open the login page and type in the user name and password."""
        self.browser.get(self.url)
        txtUserName = self.wait.until(EC.presence_of_element_located((By.ID, 'txtUserName')))
        TextBox2 = self.wait.until(EC.presence_of_element_located((By.ID, 'TextBox2')))
        txtUserName.send_keys(self.txtUserName)
        TextBox2.send_keys(self.TextBox2)

    def crack(self, cjy):
        """Fill in the form, recognise the captcha and type it in.

        :param cjy: recogniser exposing ``post_pic(image_bytes, codetype)``
            that returns a dict containing ``"pic_str"``.
        :return: the recognised captcha text (previously nothing was
            returned even though the caller stores the result).
        """
        self.open()
        image = self.get_geetest_image("check.png")
        # Serialise the cropped captcha to PNG bytes for the upload.
        bytes_array = BytesIO()
        image.save(bytes_array, format="PNG")
        result = cjy.post_pic(bytes_array.getvalue(), 1902)
        print(result)
        value = result["pic_str"]
        txtSecretCode = self.wait.until(EC.presence_of_element_located((By.ID, 'txtSecretCode')))
        txtSecretCode.send_keys(value)
        return value
class Chaojiying(object):
    """Minimal client for the Chaojiying captcha-recognition HTTP API."""

    def __init__(self, username, password, soft_id, timeout=30):
        """
        :param username: account name sent as ``user``.
        :param password: plain-text password; only its MD5 hex digest is
            stored and sent (as ``pass2``).
        :param soft_id: software id sent as ``softid``.
        :param timeout: seconds to wait for each HTTP request. Without a
            timeout ``requests`` can block indefinitely.
        """
        self.username = username
        self.password = md5(password.encode('utf-8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Parameters included in every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def _post(self, url, params, **kwargs):
        """POST ``params`` plus the base parameters and return the JSON body."""
        params.update(self.base_params)
        r = requests.post(url, data=params, headers=self.headers,
                          timeout=self.timeout, **kwargs)
        return r.json()

    def post_pic(self, im, codetype):
        """
        Upload a captcha image for recognition.

        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        :return: the decoded JSON response
        """
        files = {'userfile': ('ccc.jpg', im)}
        return self._post('http://upload.chaojiying.net/Upload/Processing.php',
                          {'codetype': codetype}, files=files)

    def report_error(self, im_id):
        """
        Report a wrongly recognised captcha.

        im_id: image id of the captcha being reported
        :return: the decoded JSON response
        """
        return self._post('http://upload.chaojiying.net/Upload/ReportError.php',
                          {'id': im_id})
if __name__ == '__main__':
    import os

    # Account credentials are read from the environment instead of being
    # hard-coded in the source. Set CHAOJIYING_USER and CHAOJIYING_PASSWORD
    # before running; they are read first so a missing variable fails
    # before a browser window is opened.
    cjy = Chaojiying(os.environ["CHAOJIYING_USER"],
                     os.environ["CHAOJIYING_PASSWORD"], 123456789)
    crack = CrackSUSE()
    answer = crack.crack(cjy)
    # Submit the form once the captcha text has been typed in.
    crack.get_check_bt().click()

浙公网安备 33010602011771号