A Complete CAPTCHA Recognition Implementation with Deep Learning
Below is a complete CAPTCHA recognition system, with end-to-end code for data generation, model construction, training, and prediction. The solution uses a CNN + BiLSTM + CTC architecture, which can in principle handle variable-length CAPTCHAs (the example here uses a fixed length of 6).
1. Environment Setup
```python
# Required libraries (scikit-learn and imgaug are used in later sections)
!pip install tensorflow==2.8.0 opencv-python pillow numpy matplotlib captcha scikit-learn imgaug
```
2. Generating a CAPTCHA Dataset
Since real CAPTCHA data is hard to obtain, we first generate synthetic CAPTCHAs with Python:
```python
from captcha.image import ImageCaptcha
import random
import string
import os
# Character set: digits + uppercase letters
CHAR_SET = string.digits + string.ascii_uppercase
CHAR_SET_LEN = len(CHAR_SET)
CAPTCHA_LEN = 6  # CAPTCHA length

# Image dimensions
IMAGE_HEIGHT = 60
IMAGE_WIDTH = 160

# Generate a single CAPTCHA image
def generate_captcha(captcha_text=None):
    if captcha_text is None:
        # random.sample draws without replacement, so characters never repeat
        captcha_text = ''.join(random.sample(CHAR_SET, CAPTCHA_LEN))
    image = ImageCaptcha(width=IMAGE_WIDTH, height=IMAGE_HEIGHT)
    captcha = image.generate(captcha_text)
    return captcha_text, captcha

# Generate the dataset
def generate_dataset(size=10000, output_dir='./captchas'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    labels = []
    for i in range(size):
        text, image = generate_captcha()
        filename = os.path.join(output_dir, f"{i}_{text}.png")
        with open(filename, 'wb') as f:
            f.write(image.getvalue())
        labels.append(f"{filename}\t{text}\n")
    # Save the label file
    with open(os.path.join(output_dir, 'labels.txt'), 'w') as f:
        f.writelines(labels)

# Generate 10,000 CAPTCHA images
generate_dataset(10000)
```
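Before moving on, it is worth eyeballing a few generated samples. Below is a minimal sanity-check sketch (assuming the dataset was generated into ./captchas as above) that displays the first image together with its label:

```python
import matplotlib.pyplot as plt
from PIL import Image

# Read the first entry from the label file and display the corresponding image
with open('./captchas/labels.txt', 'r') as f:
    path, text = f.readline().strip().split('\t')

plt.imshow(Image.open(path))
plt.title(f"label: {text}")
plt.axis('off')
plt.show()
```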
3. Data Preprocessing and Loading
```python
import cv2
import numpy as np
import tensorflow as tf

# Character <-> index mappings
char_to_num = {c: i for i, c in enumerate(CHAR_SET)}
num_to_char = {i: c for i, c in enumerate(CHAR_SET)}

def encode_label(text):
    return [char_to_num[c] for c in text]

def decode_label(nums):
    return ''.join([num_to_char[int(n)] for n in nums])

def preprocess_image(image_path, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT):
    # Read the image as grayscale
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # Binarize (inverted: characters become white on black)
    _, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
    # Resize
    img = cv2.resize(img, (img_width, img_height))
    # Normalize to [0, 1]
    img = img.astype(np.float32) / 255.0
    # Add the channel dimension
    img = np.expand_dims(img, axis=-1)
    return img

# Data generator (subclasses keras.utils.Sequence so it can be passed to model.fit)
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, labels_file, batch_size=32):
        with open(labels_file, 'r') as f:
            self.samples = f.readlines()
        self.batch_size = batch_size
        self.indices = np.arange(len(self.samples))
        np.random.shuffle(self.indices)

    def __len__(self):
        return len(self.samples) // self.batch_size

    def __getitem__(self, idx):
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_samples = [self.samples[i] for i in batch_indices]
        images = []
        labels = []
        for sample in batch_samples:
            image_path, text = sample.strip().split('\t')
            # Preprocess the image
            images.append(preprocess_image(image_path))
            # Encode the label
            labels.append(encode_label(text))
        # Because every label has the fixed length CAPTCHA_LEN, the CTC loss defined
        # below derives the input and label lengths from the tensor shapes, so the
        # generator only needs to return (images, labels).
        return np.array(images), np.array(labels)

    def on_epoch_end(self):
        np.random.shuffle(self.indices)
```
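A quick smoke test of the generator confirms the shapes the model will receive. This is a small sketch that assumes the labels file from step 2 exists:

```python
gen = DataGenerator('./captchas/labels.txt', batch_size=4)
images, labels = gen[0]
print(images.shape)             # expected: (4, 60, 160, 1)
print(labels.shape)             # expected: (4, 6)
print(decode_label(labels[0]))  # e.g. 'A3F9KQ'
```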
4. Model Architecture (CRNN)
```python
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Permute, Reshape, Bidirectional, LSTM, Dense
import tensorflow as tf

def build_crnn_model():
    # Input layer
    input_img = Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 1), name='input')
    # CNN part
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)
    # Prepare the RNN input: move the width axis first so each time step
    # corresponds to one column of the feature map, then flatten height x channels
    x = Permute((2, 1, 3))(x)
    x = Reshape((IMAGE_WIDTH // 8, (IMAGE_HEIGHT // 8) * 128))(x)
    # RNN part
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    # Output layer (+1 for the CTC blank token)
    x = Dense(CHAR_SET_LEN + 1, activation='softmax')(x)
    # Build the model
    return Model(inputs=input_img, outputs=x)

# CTC loss: per-sample lengths are derived from the tensor shapes, which is valid
# here because every label has the fixed length CAPTCHA_LEN
def ctc_loss(y_true, y_pred):
    batch_size = tf.shape(y_pred)[0]
    # ctc_batch_cost expects length tensors of shape (batch, 1)
    input_length = tf.ones((batch_size, 1)) * tf.cast(tf.shape(y_pred)[1], tf.float32)
    label_length = tf.ones((batch_size, 1)) * tf.cast(tf.shape(y_true)[1], tf.float32)
    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

# Compile the model
model = build_crnn_model()
model.compile(optimizer='adam', loss=ctc_loss)
```
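Before training, a quick shape check makes the CTC constraint explicit: the number of RNN time steps (IMAGE_WIDTH // 8 = 20) must be at least the label length (CAPTCHA_LEN = 6), otherwise ctc_batch_cost cannot align predictions with labels. A minimal sketch:

```python
model.summary()
# The output should have shape (None, 20, 37): 20 time steps,
# 36 characters + 1 CTC blank
print(model.output_shape)
assert model.output_shape[1] >= CAPTCHA_LEN
```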
5. Model Training
```python
from sklearn.model_selection import train_test_split

# Split into training and validation sets
with open('./captchas/labels.txt', 'r') as f:
    samples = f.readlines()
train_samples, val_samples = train_test_split(samples, test_size=0.2, random_state=42)

# Save the split
with open('./captchas/train_labels.txt', 'w') as f:
    f.writelines(train_samples)
with open('./captchas/val_labels.txt', 'w') as f:
    f.writelines(val_samples)

# Create the data generators
train_gen = DataGenerator('./captchas/train_labels.txt', batch_size=32)
val_gen = DataGenerator('./captchas/val_labels.txt', batch_size=32)

# Decode predictions using greedy CTC search
def decode_predictions(preds):
    input_len = np.ones(preds.shape[0]) * preds.shape[1]
    results = tf.keras.backend.ctc_decode(preds, input_length=input_len, greedy=True)[0][0].numpy()
    texts = []
    for res in results:
        res = [r for r in res if r != -1]  # drop the padding value
        texts.append(decode_label(res))
    return texts

# Custom callback that decodes one validation batch after every epoch
class DecodeCallback(tf.keras.callbacks.Callback):
    def __init__(self, val_gen):
        super().__init__()
        self.val_gen = val_gen

    def on_epoch_end(self, epoch, logs=None):
        # Take one batch of validation data
        images, labels = self.val_gen[0]
        true_texts = [decode_label(l) for l in labels]
        # Predict and decode
        preds = self.model.predict(images)
        pred_texts = decode_predictions(preds)
        # Print a few examples
        print("\nValidation samples:")
        for i in range(5):
            print(f"true: {true_texts[i]} \t predicted: {pred_texts[i]}")
        # Accuracy on this batch
        correct = sum(1 for t, p in zip(true_texts, pred_texts) if t == p)
        accuracy = correct / len(true_texts)
        print(f"Batch validation accuracy: {accuracy:.2%}\n")

# Train the model
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=30,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint('crnn_best.h5', save_best_only=True),
        tf.keras.callbacks.EarlyStopping(patience=5),
        DecodeCallback(val_gen)
    ]
)
```
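Since the callback above only reports accuracy on a single batch, whole-sequence accuracy over the entire validation set can be estimated with a short loop. This is a sketch that reuses the generator and decoder defined above:

```python
def evaluate_generator(model, gen):
    # Count exact whole-sequence matches across all batches of the generator
    correct, total = 0, 0
    for i in range(len(gen)):
        images, labels = gen[i]
        pred_texts = decode_predictions(model.predict(images, verbose=0))
        true_texts = [decode_label(l) for l in labels]
        correct += sum(1 for t, p in zip(true_texts, pred_texts) if t == p)
        total += len(true_texts)
    return correct / total

print(f"Full validation accuracy: {evaluate_generator(model, val_gen):.2%}")
```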
6. Model Evaluation and Prediction
```python
import matplotlib.pyplot as plt

# Plot the training curves
def plot_history(history):
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='training loss')
    plt.plot(history.history['val_loss'], label='validation loss')
    plt.title('Loss')
    plt.legend()
    # The model is compiled with the CTC loss only, so accuracy curves
    # exist only if an accuracy metric was added at compile time
    if 'accuracy' in history.history:
        plt.subplot(1, 2, 2)
        plt.plot(history.history['accuracy'], label='training accuracy')
        plt.plot(history.history['val_accuracy'], label='validation accuracy')
        plt.title('Accuracy')
        plt.legend()
    plt.show()

plot_history(history)

# Test the model on a directory of CAPTCHA images named like "<id>_<text>.png"
def test_model(model, test_dir='./test_captchas', num_samples=20):
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
        print(f"Please place test CAPTCHA images in {test_dir}")
        return
    test_files = [f for f in os.listdir(test_dir) if f.endswith('.png')][:num_samples]
    if not test_files:
        print(f"No .png files found in {test_dir}")
        return
    correct = 0
    for filename in test_files:
        image_path = os.path.join(test_dir, filename)
        true_text = filename.split('_')[1].split('.')[0]
        # Preprocess the image and add a batch dimension
        img = preprocess_image(image_path)
        img = np.expand_dims(img, axis=0)
        # Predict and decode
        pred = model.predict(img)
        pred_text = decode_predictions(pred)[0]
        # Show the result
        print(f"file: {filename}")
        print(f"true: {true_text} \t predicted: {pred_text}")
        if true_text == pred_text:
            correct += 1
    accuracy = correct / len(test_files)
    print(f"\nTest accuracy: {accuracy:.2%}")

# Load the best checkpoint
best_model = tf.keras.models.load_model('crnn_best.h5', custom_objects={'ctc_loss': ctc_loss})

# Test the model
test_model(best_model)
```
7. Complete Prediction API
```python
from PIL import Image
import io

class CaptchaRecognizer:
    def __init__(self, model_path='crnn_best.h5'):
        self.model = tf.keras.models.load_model(model_path, custom_objects={'ctc_loss': ctc_loss})
        self.char_set = CHAR_SET
        self.char_to_num = char_to_num
        self.num_to_char = num_to_char

    def preprocess(self, image):
        if isinstance(image, str):       # file path
            img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
        elif isinstance(image, bytes):   # raw bytes
            img = np.array(Image.open(io.BytesIO(image)).convert('L'))
        else:                            # assume it is already a grayscale numpy array
            img = image
        # Binarize
        _, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
        # Resize
        img = cv2.resize(img, (IMAGE_WIDTH, IMAGE_HEIGHT))
        # Normalize and add channel and batch dimensions
        img = img.astype(np.float32) / 255.0
        img = np.expand_dims(img, axis=-1)
        img = np.expand_dims(img, axis=0)
        return img

    def decode_predictions(self, preds):
        input_len = np.ones(preds.shape[0]) * preds.shape[1]
        results = tf.keras.backend.ctc_decode(preds, input_length=input_len, greedy=True)[0][0].numpy()
        texts = []
        for res in results:
            res = [r for r in res if r != -1]
            texts.append(''.join([self.num_to_char.get(int(r), '') for r in res]))
        return texts

    def predict(self, image):
        # Preprocess, predict, decode
        processed = self.preprocess(image)
        pred = self.model.predict(processed)
        return self.decode_predictions(pred)[0]

# Usage example
recognizer = CaptchaRecognizer()

# Predict from a file path
result = recognizer.predict('test.png')
print(f"Prediction: {result}")

# Predict from raw bytes
with open('test.png', 'rb') as f:
    image_bytes = f.read()
result = recognizer.predict(image_bytes)
print(f"Prediction: {result}")
```
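If the recognizer needs to be exposed over HTTP, it can be wrapped in a small web service. The sketch below uses Flask (an extra dependency not listed in step 1); the /recognize route and the 'image' form field are illustrative choices, not part of the original code:

```python
from flask import Flask, request, jsonify

app = Flask(__name__)
recognizer = CaptchaRecognizer()  # load crnn_best.h5 once at startup

@app.route('/recognize', methods=['POST'])
def recognize():
    # Expect the CAPTCHA as a multipart file upload under the field name 'image'
    image_bytes = request.files['image'].read()
    text = recognizer.predict(image_bytes)
    return jsonify({'text': text})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
```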
8. Model Optimization Suggestions
Data augmentation: increase the diversity of the training data, for example:
```python
from imgaug import augmenters as iaa

# Augmentation pipeline: blur, mild affine distortions, and additive noise
augmenter = iaa.Sequential([
    iaa.GaussianBlur(sigma=(0, 1.0)),
    iaa.Affine(
        scale={"x": (0.9, 1.1), "y": (0.9, 1.1)},
        translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},
        rotate=(-10, 10)
    ),
    iaa.AdditiveGaussianNoise(scale=(0, 0.05 * 255))
])
```
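One way to apply the augmenter is on the fly inside the generator. The sketch below subclasses the DataGenerator from step 3 (an assumption of this example); imgaug expects uint8 images, so the batch is scaled back to [0, 255] before augmentation:

```python
class AugmentedDataGenerator(DataGenerator):
    def __getitem__(self, idx):
        images, labels = super().__getitem__(idx)
        # imgaug works on uint8 (N, H, W, C) batches, so temporarily undo the normalization
        batch_uint8 = (images * 255).astype(np.uint8)
        augmented = augmenter(images=batch_uint8)
        return augmented.astype(np.float32) / 255.0, labels

# Use the augmented generator for training only; keep validation data unaugmented
train_gen = AugmentedDataGenerator('./captchas/train_labels.txt', batch_size=32)
```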