A Complete Deep-Learning Captcha Recognition Implementation

Below is a complete captcha recognition system, covering the full pipeline: data generation, model construction, training, and prediction. The solution uses a CNN+BiLSTM+CTC architecture, which can handle captchas of variable length.

1. Environment Setup
python

# Required libraries
!pip install tensorflow==2.8.0 opencv-python pillow numpy matplotlib captcha
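
As a quick check that the intended TensorFlow version is active (an optional sketch, not part of the original post):

python
import tensorflow as tf
print(tf.__version__)  # expected: 2.8.0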
2. Captcha Dataset Generation
Since real captcha data is hard to obtain, we first generate synthetic captchas with Python:

python
from captcha.image import ImageCaptcha
import random
import string
import os

# Character set definition
CHAR_SET = string.digits + string.ascii_uppercase  # digits + uppercase letters
CHAR_SET_LEN = len(CHAR_SET)
CAPTCHA_LEN = 6  # captcha length

# Image dimensions
IMAGE_HEIGHT = 60
IMAGE_WIDTH = 160

# Generate a single captcha image
def generate_captcha(captcha_text=None):
    if captcha_text is None:
        # random.choices allows repeated characters (random.sample would not)
        captcha_text = ''.join(random.choices(CHAR_SET, k=CAPTCHA_LEN))

    image = ImageCaptcha(width=IMAGE_WIDTH, height=IMAGE_HEIGHT)
    captcha = image.generate(captcha_text)
    return captcha_text, captcha

# Generate the dataset
def generate_dataset(size=10000, output_dir='./captchas'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    labels = []
    for i in range(size):
        text, image = generate_captcha()
        filename = os.path.join(output_dir, f"{i}_{text}.png")
        with open(filename, 'wb') as f:
            f.write(image.getvalue())
        labels.append(f"{filename}\t{text}\n")

    # Save the label file
    with open(os.path.join(output_dir, 'labels.txt'), 'w') as f:
        f.writelines(labels)

# Generate 10,000 captcha images
generate_dataset(10000)
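
For a quick sanity check, the snippet below (an optional sketch, not part of the original pipeline) renders one freshly generated captcha with matplotlib:

python
import matplotlib.pyplot as plt
from PIL import Image

text, data = generate_captcha()
plt.imshow(Image.open(data))  # `data` is the BytesIO object returned by ImageCaptcha
plt.title(text)
plt.axis('off')
plt.show()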
3. Data Preprocessing and Loading
python
import cv2
import numpy as np
import tensorflow as tf

# Character <-> index mappings
char_to_num = {c: i for i, c in enumerate(CHAR_SET)}
num_to_char = {i: c for i, c in enumerate(CHAR_SET)}

def encode_label(text):
    return [char_to_num[c] for c in text]

def decode_label(nums):
    return ''.join([num_to_char[n] for n in nums])

def preprocess_image(image_path, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT):
    # Read the image as grayscale
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # Binarize (inverted: characters become white on a black background)
    _, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)

    # Resize
    img = cv2.resize(img, (img_width, img_height))

    # Normalize to [0, 1]
    img = img.astype(np.float32) / 255.0

    # Add a channel dimension
    img = np.expand_dims(img, axis=-1)

    return img
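
A quick check of the preprocessing output (an optional sketch; it simply picks whichever generated .png comes first):

python
sample_file = next(f for f in os.listdir('./captchas') if f.endswith('.png'))
sample = preprocess_image(os.path.join('./captchas', sample_file))
print(sample.shape)  # expected: (60, 160, 1)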

# Data generator
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, labels_file, batch_size=32):
        with open(labels_file, 'r') as f:
            self.samples = f.readlines()
        self.batch_size = batch_size
        self.indices = np.arange(len(self.samples))
        np.random.shuffle(self.indices)

    def __len__(self):
        return len(self.samples) // self.batch_size

    def __getitem__(self, idx):
        batch_indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]
        batch_samples = [self.samples[i] for i in batch_indices]

        images = []
        labels = []

        for sample in batch_samples:
            image_path, text = sample.strip().split('\t')

            # Preprocess the image
            images.append(preprocess_image(image_path))

            # Encode the label (fixed length CAPTCHA_LEN, so no padding is needed)
            labels.append(encode_label(text))

        # The CTC loss defined below derives the input and label lengths from the
        # tensor shapes, so the generator only returns images and dense labels.
        return np.array(images), np.array(labels)

    def on_epoch_end(self):
        np.random.shuffle(self.indices)
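
A quick shape check on one batch (an optional sketch; it assumes the labels file from step 2 exists):

python
gen = DataGenerator('./captchas/labels.txt', batch_size=4)
batch_images, batch_labels = gen[0]
print(batch_images.shape)  # expected: (4, 60, 160, 1)
print(batch_labels.shape)  # expected: (4, 6)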
4. Model Construction (CRNN Architecture)
python
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Permute, Reshape, Bidirectional, LSTM, Dense
import tensorflow as tf

def build_crnn_model():
    # Input layer
    input_img = Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 1), name='input')

    # CNN part
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2))(x)

    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)

    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)

    # Prepare the RNN input: make the width axis the time axis,
    # then flatten height and channels into the feature axis
    x = Permute((2, 1, 3))(x)
    x = Reshape((IMAGE_WIDTH // 8, (IMAGE_HEIGHT // 8) * 128))(x)

    # RNN part
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)

    # Output layer
    x = Dense(CHAR_SET_LEN + 1, activation='softmax')(x)  # +1 for the CTC blank

    # Define the model
    model = Model(inputs=input_img, outputs=x)

    return model

# CTC loss function
def ctc_loss(y_true, y_pred):
    batch_size = tf.shape(y_pred)[0]
    # ctc_batch_cost expects (batch, 1) length tensors
    input_length = tf.ones((batch_size, 1), dtype=tf.int32) * tf.shape(y_pred)[1]
    label_length = tf.ones((batch_size, 1), dtype=tf.int32) * tf.shape(y_true)[1]

    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

# Compile the model
model = build_crnn_model()
model.compile(optimizer='adam', loss=ctc_loss)
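
It is worth verifying the output shape before training (a small sketch, not from the original post): the RNN sees IMAGE_WIDTH // 8 = 20 time steps, each producing a distribution over CHAR_SET_LEN + 1 = 37 classes (36 characters plus the CTC blank).

python
model.summary()
print(model.output_shape)  # expected: (None, 20, 37)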
5. Model Training
python
from sklearn.model_selection import train_test_split

# Split into training and validation sets
with open('./captchas/labels.txt', 'r') as f:
    samples = f.readlines()

train_samples, val_samples = train_test_split(samples, test_size=0.2, random_state=42)

# Save the split
with open('./captchas/train_labels.txt', 'w') as f:
    f.writelines(train_samples)
with open('./captchas/val_labels.txt', 'w') as f:
    f.writelines(val_samples)

# Create the data generators
train_gen = DataGenerator('./captchas/train_labels.txt', batch_size=32)
val_gen = DataGenerator('./captchas/val_labels.txt', batch_size=32)

# Custom callback that decodes predictions on validation data
class DecodeCallback(tf.keras.callbacks.Callback):
    def __init__(self, val_gen):
        super().__init__()
        self.val_gen = val_gen

    def on_epoch_end(self, epoch, logs=None):
        # Take one batch of validation data
        images, labels = self.val_gen[0]
        true_texts = [decode_label(l) for l in labels]

        # Predict
        preds = self.model.predict(images)
        pred_texts = decode_predictions(preds)

        # Print a few examples
        print("\nValidation samples:")
        for i in range(5):
            print(f"true: {true_texts[i]} \t predicted: {pred_texts[i]}")

        # Accuracy on this batch
        correct = sum(1 for t, p in zip(true_texts, pred_texts) if t == p)
        accuracy = correct / len(true_texts)
        print(f"Validation accuracy on this batch: {accuracy:.2%}\n")

# Decode the network output into text
def decode_predictions(preds):
    input_len = np.ones(preds.shape[0]) * preds.shape[1]

    # Greedy CTC decoding
    results = tf.keras.backend.ctc_decode(preds, input_length=input_len, greedy=True)[0][0]

    # Convert indices to text
    texts = []
    for res in results.numpy():
        res = [int(r) for r in res if r != -1]  # drop the -1 padding
        texts.append(decode_label(res))

    return texts
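
To see what greedy CTC decoding does, here is a tiny handcrafted example (an illustration only, not part of the pipeline): a 4-step output over a 3-class alphabet {A, B, blank}, where repeated predictions are collapsed and blanks are removed.

python
toy = np.array([[[0.9, 0.05, 0.05],    # step 1: A
                 [0.9, 0.05, 0.05],    # step 2: A (repeat, collapsed)
                 [0.05, 0.05, 0.9],    # step 3: blank (removed)
                 [0.05, 0.9, 0.05]]])  # step 4: B
decoded = tf.keras.backend.ctc_decode(toy, input_length=np.array([4]), greedy=True)[0][0]
print(decoded.numpy())  # expected: [[0 1]], i.e. "AB"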

# Train the model
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=30,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint('crnn_best.h5', save_best_only=True),
        tf.keras.callbacks.EarlyStopping(patience=5),
        DecodeCallback(val_gen)
    ]
)
6. Model Evaluation and Prediction
python
import matplotlib.pyplot as plt

# Plot the training curves
def plot_history(history):
    # The model is compiled with the CTC loss only, so only loss curves are available
    plt.figure(figsize=(6, 4))
    plt.plot(history.history['loss'], label='training loss')
    plt.plot(history.history['val_loss'], label='validation loss')
    plt.title('Loss curves')
    plt.legend()
    plt.show()

plot_history(history)

# Test the model
def test_model(model, test_dir='./test_captchas', num_samples=20):
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
        print(f"Please place test captcha images in the {test_dir} directory")
        return

    test_files = [f for f in os.listdir(test_dir) if f.endswith('.png')][:num_samples]

    correct = 0
    for filename in test_files:
        image_path = os.path.join(test_dir, filename)
        # File names follow the "{index}_{text}.png" pattern from generate_dataset
        true_text = filename.split('_')[1].split('.')[0]

        # Preprocess the image
        img = preprocess_image(image_path)
        img = np.expand_dims(img, axis=0)

        # Predict
        pred = model.predict(img)
        pred_text = decode_predictions(pred)[0]

        # Show the result
        print(f"File: {filename}")
        print(f"true: {true_text} \t predicted: {pred_text}")

        if true_text == pred_text:
            correct += 1

    accuracy = correct / len(test_files)
    print(f"\nTest accuracy: {accuracy:.2%}")

# Load the best model
best_model = tf.keras.models.load_model('crnn_best.h5', custom_objects={'ctc_loss': ctc_loss})

# Test the model
test_model(best_model)
7. Complete Prediction API
python
from PIL import Image
import io

class CaptchaRecognizer:
    def __init__(self, model_path='crnn_best.h5'):
        self.model = tf.keras.models.load_model(model_path, custom_objects={'ctc_loss': ctc_loss})
        self.char_set = CHAR_SET
        self.char_to_num = char_to_num
        self.num_to_char = num_to_char

    def preprocess(self, image):
        if isinstance(image, str):  # file path
            img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
        elif isinstance(image, bytes):  # raw bytes
            img = np.array(Image.open(io.BytesIO(image)).convert('L'))
        else:  # assume a numpy array
            img = image

        # Binarize
        _, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)

        # Resize
        img = cv2.resize(img, (IMAGE_WIDTH, IMAGE_HEIGHT))

        # Normalize and add channel and batch dimensions
        img = img.astype(np.float32) / 255.0
        img = np.expand_dims(img, axis=-1)
        img = np.expand_dims(img, axis=0)

        return img

    def decode_predictions(self, preds):
        input_len = np.ones(preds.shape[0]) * preds.shape[1]
        results = tf.keras.backend.ctc_decode(preds, input_length=input_len, greedy=True)[0][0]
        texts = []
        for res in results.numpy():
            res = [r for r in res if r != -1]
            texts.append(''.join([self.num_to_char.get(int(r), '') for r in res]))
        return texts

    def predict(self, image):
        # Preprocess
        processed = self.preprocess(image)

        # Predict
        pred = self.model.predict(processed)

        # Decode
        text = self.decode_predictions(pred)[0]

        return text

# Usage example
recognizer = CaptchaRecognizer()

# Predict from a file
result = recognizer.predict('test.png')
print(f"Prediction: {result}")

# Predict from raw bytes
with open('test.png', 'rb') as f:
    image_bytes = f.read()
result = recognizer.predict(image_bytes)
print(f"Prediction: {result}")
8. Model Optimization Suggestions
Data augmentation: increase the diversity of the training data, for example with imgaug (a usage sketch follows the code block below):

python
from imgaug import augmenters as iaa

augmenter = iaa.Sequential([
    iaa.GaussianBlur(sigma=(0, 1.0)),
    iaa.Affine(
        scale={"x": (0.9, 1.1), "y": (0.9, 1.1)},
        translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},
        rotate=(-10, 10)
    ),
    iaa.AdditiveGaussianNoise(scale=(0, 0.05*255))
])
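
One way to apply the augmenter (a sketch under the assumption that it runs on the float images produced by the generator; imgaug works on uint8 arrays, so they are converted back and forth):

python
def augment_batch(images):
    # images: float32 array in [0, 1] of shape (batch, H, W, 1)
    imgs_uint8 = (images * 255).astype(np.uint8)
    augmented = augmenter(images=imgs_uint8)
    return np.asarray(augmented).astype(np.float32) / 255.0

# This could be called inside DataGenerator.__getitem__ on the training batches.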
