深度学习验证码识别实战:从零构建端到端识别系统

  1. 项目概述与准备工作
    本教程将带你完整实现一个基于深度学习的验证码识别系统,使用PyTorch框架从零开始构建。我们会涵盖数据生成、模型设计、训练优化和部署应用全流程。

1.1 环境配置
首先确保安装以下Python库:

bash
pip install torch torchvision pillow matplotlib numpy captcha flask
1.2 验证码生成器
我们首先生成训练数据。创建一个captcha_generator.py:

python
from captcha.image import ImageCaptcha
import random
import string
import os
# Contact note carried over from the original post:
# website www.tmocr.com, or contact q1092685548

# Configuration parameters

CHAR_SET = string.digits + string.ascii_uppercase  # digits plus uppercase letters
CAPTCHA_LENGTH = 5  # number of characters per captcha
IMAGE_WIDTH, IMAGE_HEIGHT = 180, 60  # generated image size in pixels
DATA_DIR = "captcha_data"  # root directory for the generated images
TRAIN_RATIO = 0.8  # probability that a sample goes to the training split

def generate_captcha_dataset(num_samples=20000):
    """Generate random captcha images and write them into train/test folders.

    Every image is saved as ``<text>_<index>.png`` under ``DATA_DIR/train`` or
    ``DATA_DIR/test``.  Each sample is assigned to the training split
    independently with probability ``TRAIN_RATIO``, so the final split only
    approximates that ratio.

    Args:
        num_samples: Number of captcha images to generate.
    """
    # exist_ok=True makes the call safe whether or not DATA_DIR already
    # exists; creating the split folders only when DATA_DIR is missing would
    # leave them absent (and image.save failing) for a pre-existing DATA_DIR.
    for split in ("train", "test"):
        os.makedirs(os.path.join(DATA_DIR, split), exist_ok=True)

    generator = ImageCaptcha(width=IMAGE_WIDTH, height=IMAGE_HEIGHT)

    for i in range(num_samples):
        captcha_text = "".join(random.choices(CHAR_SET, k=CAPTCHA_LENGTH))
        image = generator.generate_image(captcha_text)

        # Randomly route the sample to the training or the test split.
        split = "train" if random.random() < TRAIN_RATIO else "test"
        save_path = os.path.join(DATA_DIR, split, f"{captcha_text}_{i}.png")
        image.save(save_path)

        if (i + 1) % 1000 == 0:
            print(f"已生成 {i+1} 张验证码")

# Generate the dataset only when this file is executed as a script,
# not when it is imported by another module.
if __name__ == "__main__":
    generate_captcha_dataset(20000)
    print("验证码数据集生成完成!")
运行此脚本将生成2万张验证码图片,并按约8:2的比例随机分为训练集和测试集(每张图片以80%的概率独立分入训练集,因此实际比例会略有浮动)。

  2. 数据加载与预处理
    创建data_loader.py处理数据:

python
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import numpy as np
from torchvision import transforms

class CaptchaDataset(Dataset):
    """Dataset of captcha images whose label is encoded in the file name.

    Files are expected to be named ``<label>_<suffix>.png``; everything
    before the first underscore is taken as the captcha text.
    """

    def __init__(self, data_dir, char_set, transform=None):
        """Index the ``.png`` files in *data_dir* and build the label maps.

        Args:
            data_dir: Directory that contains the captcha images.
            char_set: Iterable of characters; a character's position is its
                class index.
            transform: Optional callable applied to each loaded PIL image.
        """
        self.data_dir = data_dir
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order reproducible between runs.
        self.image_files = sorted(
            f for f in os.listdir(data_dir) if f.endswith(".png")
        )
        self.transform = transform
        self.char_to_idx = {c: i for i, c in enumerate(char_set)}
        self.idx_to_char = {i: c for i, c in enumerate(char_set)}

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            A tuple ``(image, label, label_str)``: the (optionally
            transformed) RGB image, a ``torch.long`` tensor of class indices
            and the captcha text itself.

        Raises:
            KeyError: If the file name contains a character that is not in
                the character set.
        """
        file_name = self.image_files[idx]
        img_path = os.path.join(self.data_dir, file_name)
        # The context manager releases the file handle as soon as the pixel
        # data has been copied by convert().
        with Image.open(img_path) as raw:
            image = raw.convert("RGB")

        label_str = file_name.split("_")[0]
        # Convert the label text to class indices.
        label = [self.char_to_idx[c] for c in label_str]

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.long), label_str

def get_data_loaders(batch_size=64, char_set=None, data_dir="captcha_data"):
    """Build the training and test ``DataLoader`` objects.

    Args:
        batch_size: Number of samples per batch for both loaders.
        char_set: Characters that may appear in a captcha.  Defaults to the
            digits followed by the uppercase ASCII letters, which is the set
            the generator script uses; it must match the generated images.
        data_dir: Root directory containing the ``train`` and ``test``
            sub-directories.

    Returns:
        A tuple ``(train_loader, test_loader)``.
    """
    import string  # local import: only needed for the default character set

    if char_set is None:
        char_set = string.digits + string.ascii_uppercase

    to_normalised_tensor = [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
    # Random colour jitter is a training-time augmentation only; the test
    # split gets the deterministic pipeline so evaluation is repeatable.
    train_transform = transforms.Compose(
        [transforms.ColorJitter(brightness=0.2, contrast=0.2)]
        + to_normalised_tensor
    )
    eval_transform = transforms.Compose(to_normalised_tensor)

    # Load the datasets.
    train_dataset = CaptchaDataset(
        os.path.join(data_dir, "train"),
        char_set,
        train_transform,
    )
    test_dataset = CaptchaDataset(
        os.path.join(data_dir, "test"),
        char_set,
        eval_transform,
    )

    # Create the DataLoaders; only the training data is shuffled.
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
    )

    return train_loader, test_loader
  3. 模型构建
    创建model.py定义我们的深度学习模型:

python
import torch.nn as nn
import torch.nn.functional as F

class CaptchaModel(nn.Module):
    """CNN + bidirectional GRU captcha recogniser intended for CTC training.

    Three conv/batch-norm/max-pool stages shrink the image by a factor of 8
    in each spatial dimension.  Every remaining image column becomes one time
    step of a 2-layer bidirectional GRU, and a linear layer scores each step
    over ``num_chars + 1`` classes; the extra last class (index
    ``num_chars``) is the CTC blank.
    """

    def __init__(self, num_chars, image_height=60, image_width=180):
        """Create the layers.

        Args:
            num_chars: Number of real characters (the blank is added on top).
            image_height: Height of the input images in pixels.
            image_width: Width of the input images in pixels.
        """
        super().__init__()
        self.image_height = image_height
        self.image_width = image_width

        # CNN feature extractor.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(2)

        # Size of the feature vector of one image column (channels * height).
        self.conv_output_size = self._get_conv_output_size()

        # RNN sequence model.  batch_first=True because forward() feeds the
        # GRU a [B, T, F] tensor; without it the batch axis would be treated
        # as time and samples of one batch would leak into each other.
        self.gru = nn.GRU(
            input_size=self.conv_output_size,
            hidden_size=128,
            num_layers=2,
            bidirectional=True,
            dropout=0.2,
            batch_first=True,
        )

        # Output layer: 2 * 128 GRU features -> characters + CTC blank.
        self.fc = nn.Linear(256, num_chars + 1)

    def _get_conv_output_size(self):
        """Return channels * height of the feature map for the input size."""
        import torch  # local import: the file only imports torch.nn / F

        with torch.no_grad():
            dummy_input = torch.zeros(
                1, 3, self.image_height, self.image_width
            )
            # Batch norm and ReLU do not change shapes, so they are skipped;
            # this also keeps the probe from touching BN running statistics.
            output = self.pool1(self.conv1(dummy_input))
            output = self.pool2(self.conv2(output))
            output = self.pool3(self.conv3(output))
        return output.size(1) * output.size(2)

    def forward(self, x):
        """Map images ``[B, 3, H, W]`` to log-probabilities ``[T, B, C]``.

        ``T`` is the width of the final feature map and ``C`` equals
        ``num_chars + 1``.  The output is log-softmax normalised over the
        class axis, which is the input format ``nn.CTCLoss`` expects.
        """
        # CNN part.
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))

        # Turn every image column into one time step for the RNN.
        batch_size = x.size(0)
        x = x.permute(0, 3, 1, 2)  # [B, W, C, H]
        x = x.reshape(batch_size, -1, self.conv_output_size)  # [B, T, C*H]

        # RNN part.
        x, _ = self.gru(x)  # [B, T, 256]

        # Output layer.
        x = self.fc(x)
        x = x.permute(1, 0, 2)  # layout required by CTC loss: [T, B, C]

        return F.log_softmax(x, dim=2)
  4. 训练与验证
    创建train.py实现训练流程:

python
import torch
from torch import nn
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from data_loader import get_data_loaders
from model import CaptchaModel
import time
import os

import string

# Training hyper-parameters

BATCH_SIZE = 64
EPOCHS = 30
LEARNING_RATE = 0.001
GRAD_CLIP = 5
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MODEL_SAVE_PATH = "captcha_model.pth"

# Label alphabet.  These values must match the ones the generator script
# used when it produced the images.
CHAR_SET = string.digits + string.ascii_uppercase
CAPTCHA_LENGTH = 5
IDX_TO_CHAR = dict(enumerate(CHAR_SET))

# Initialisation

model = CaptchaModel(len(CHAR_SET)).to(DEVICE)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
# CTC loss handles unsegmented sequences.  The blank label takes the index
# right after the last real character, so the network has to emit
# len(CHAR_SET) + 1 classes per time step.
criterion = nn.CTCLoss(blank=len(CHAR_SET))
train_loader, test_loader = get_data_loaders(BATCH_SIZE)

def _ctc_greedy_decode(log_probs, idx_to_char, blank):
    """Greedy CTC decoding of a ``[T, B, C]`` score tensor.

    Takes the best class at every time step, merges consecutive repeats and
    drops the blank label.

    Returns:
        A list with one decoded string per batch element.
    """
    best_paths = log_probs.argmax(dim=2).transpose(0, 1).tolist()  # [B][T]
    decoded = []
    for path in best_paths:
        chars = []
        previous = blank
        for idx in path:
            if idx != blank and idx != previous:
                chars.append(idx_to_char[idx])
            previous = idx
        decoded.append("".join(chars))
    return decoded


def _run_epoch(loader, training):
    """Run one pass over *loader*; update the weights when *training*.

    Returns:
        A tuple ``(average_loss, accuracy)`` where accuracy is the fraction
        of samples whose whole captcha was decoded correctly.  Both are 0.0
        for an empty loader.
    """
    model.train(training)
    idx_to_char = loader.dataset.idx_to_char
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.set_grad_enabled(training):
        for batch_idx, (images, targets, target_strs) in enumerate(loader):
            images = images.to(DEVICE)
            targets = targets.to(DEVICE)

            if training:
                optimizer.zero_grad()

            # CTCLoss expects log-probabilities.  log_softmax is idempotent,
            # so this is harmless if the model already normalises its output.
            log_probs = model(images).log_softmax(2)  # [T, B, C]

            # Every sample uses all T time steps and a full-length target.
            input_lengths = torch.full(
                size=(log_probs.size(1),),
                fill_value=log_probs.size(0),
                dtype=torch.long,
            )
            target_lengths = torch.full(
                size=(targets.size(0),),
                fill_value=targets.size(1),
                dtype=torch.long,
            )

            loss = criterion(log_probs, targets, input_lengths, target_lengths)

            if training:
                loss.backward()
                clip_grad_norm_(model.parameters(), GRAD_CLIP)
                optimizer.step()

            total_loss += loss.item()

            # Accuracy: a sample counts only if the whole string matches.
            predictions = _ctc_greedy_decode(
                log_probs.detach(), idx_to_char, criterion.blank
            )
            correct += sum(
                pred == truth for pred, truth in zip(predictions, target_strs)
            )
            total += len(target_strs)

            if training and (batch_idx + 1) % 100 == 0:
                print(f"Batch {batch_idx+1}/{len(loader)}, Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(loader) if len(loader) else 0.0
    accuracy = correct / total if total else 0.0
    return avg_loss, accuracy


def train():
    """Train the model for one epoch on ``train_loader``.

    Returns:
        A tuple ``(average_loss, accuracy)`` over the training set.
    """
    return _run_epoch(train_loader, training=True)


def evaluate():
    """Evaluate the model on ``test_loader`` without updating the weights.

    Returns:
        A tuple ``(average_loss, accuracy)`` over the test set.
    """
    return _run_epoch(test_loader, training=False)

def main():
    """Run the full training loop and keep the best checkpoint.

    Trains for ``EPOCHS`` epochs, evaluates after each one, prints the
    metrics and saves the model's ``state_dict`` to ``MODEL_SAVE_PATH``
    whenever the validation accuracy exceeds the best value seen so far.
    """
    best_accuracy = 0

    for epoch in range(1, EPOCHS + 1):
        start_time = time.time()
        train_loss, train_acc = train()
        val_loss, val_acc = evaluate()
        end_time = time.time()

        print(f"Epoch {epoch}/{EPOCHS} | Time: {end_time-start_time:.2f}s")
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

        # Save the best model so far.
        if val_acc > best_accuracy:
            best_accuracy = val_acc
            torch.save(model.state_dict(), MODEL_SAVE_PATH)
            print(f"New best model saved with accuracy {val_acc:.4f}")

# Start training only when this file is executed as a script.
if __name__ == "__main__":
    main()
5. 模型部署与应用
创建app.py实现一个简单的Web服务:

python
from flask import Flask, request, jsonify
from PIL import Image
import io
import torch
from model import CaptchaModel
from torchvision import transforms

import string

app = Flask(__name__)

# Label alphabet.  These values must match the ones used for training,
# otherwise the checkpoint will not fit the model or will decode wrongly.
CHAR_SET = string.digits + string.ascii_uppercase
CAPTCHA_LENGTH = 5
IDX_TO_CHAR = dict(enumerate(CHAR_SET))

# Load the model

model = CaptchaModel(len(CHAR_SET))
model.load_state_dict(torch.load("captcha_model.pth", map_location='cpu'))
model.eval()

# Image preprocessing (no augmentation at inference time)

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

@app.route('/predict', methods=['POST'])
def predict():
    """Recognise the captcha in the uploaded image.

    Expects a multipart form upload under the field name ``file``.

    Returns:
        ``{"prediction": <text>}`` on success, ``{"error": ...}`` with
        status 400 when no file was supplied, or status 500 when loading or
        recognising the image raised an exception.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file uploaded'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'Empty filename'}), 400

    try:
        # Read and preprocess the image.
        # NOTE(review): the image is not resized here; the model's recurrent
        # layer is sized from the training image height, so uploads with a
        # different height are expected to land in the except branch below.
        image = Image.open(io.BytesIO(file.read())).convert('RGB')
        image = transform(image).unsqueeze(0)

        # Predict.
        with torch.no_grad():
            output = model(image)  # [T, 1, C]

        # Greedy CTC decoding: best class per time step, merge consecutive
        # repeats, drop blanks.  The blank index follows the training setup
        # (the class right after the last real character).
        blank = len(CHAR_SET)
        best_path = output.argmax(dim=2)[:, 0].tolist()
        chars = []
        previous = blank
        for idx in best_path:
            if idx != blank and idx != previous:
                chars.append(CHAR_SET[idx])
            previous = idx
        captcha_text = ''.join(chars)

        return jsonify({'prediction': captcha_text})

    except Exception as e:
        # Request boundary: report the failure instead of crashing the app.
        return jsonify({'error': str(e)}), 500

# Start the development server only when this file is executed as a script.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
6. 项目扩展与优化
6.1 数据增强改进
在data_loader.py中增强数据多样性:

python
# Stronger augmentation pipeline: the geometric, colour and blur steps run
# on the PIL image first, then it is converted to a tensor and normalised.
transform = transforms.Compose(
    [
        # Rotate by a random angle of up to 10 degrees either way.
        transforms.RandomRotation(10),
        # No extra rotation; shift by up to 10% of the width / height.
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        transforms.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 2.0)),
        transforms.ToTensor(),
        # Per-channel (x - 0.5) / 0.5.
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)
6.2 模型架构优化
尝试更先进的模型结构:

python
class ImprovedCaptchaModel(nn.Module):
    """Deeper CNN + spatial attention + bidirectional GRU for CTC training.

    Two double-convolution blocks halve the resolution once each, a 1x1
    convolution with a sigmoid produces a per-pixel gate, and every column of
    the gated feature map is fed as one time step into a 3-layer
    bidirectional GRU.  The output layer scores ``num_chars + 1`` classes;
    the extra last class (index ``num_chars``) is the CTC blank.
    """

    def __init__(self, num_chars, image_height=60):
        """Create the layers.

        Args:
            num_chars: Number of real characters (the blank is added on top).
            image_height: Height of the input images in pixels; it fixes the
                per-column feature size seen by the GRU.
        """
        super().__init__()

        # Stacked double-convolution blocks (plain blocks, no skip
        # connections), each followed by 2x2 max pooling.
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        self.conv_block2 = nn.Sequential(
            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # Attention: one sigmoid gate value per spatial position.
        self.attention = nn.Sequential(
            nn.Conv2d(64, 1, 1),
            nn.Sigmoid(),
        )

        # One time step per image column, so a step carries
        # channels * height features (not the whole flattened map).
        self.feature_size = 64 * (image_height // 4)

        # Larger and deeper bidirectional GRU; batch_first=True because
        # forward() feeds it a [B, T, F] tensor.
        self.gru = nn.GRU(
            input_size=self.feature_size,
            hidden_size=256,
            num_layers=3,
            bidirectional=True,
            dropout=0.3,
            batch_first=True,
        )

        # 2 * 256 GRU features -> characters + CTC blank.
        self.fc = nn.Linear(512, num_chars + 1)

    def forward(self, x):
        """Map images ``[B, 3, H, W]`` to log-probabilities ``[T, B, C]``.

        ``T`` equals ``W // 4`` and ``C`` equals ``num_chars + 1``.
        """
        x = self.conv_block2(self.conv_block1(x))  # [B, 64, H/4, W/4]
        x = x * self.attention(x)  # the [B, 1, H/4, W/4] gate broadcasts

        batch_size = x.size(0)
        x = x.permute(0, 3, 1, 2)  # [B, W/4, 64, H/4]
        x = x.reshape(batch_size, -1, self.feature_size)  # [B, T, F]

        x, _ = self.gru(x)  # [B, T, 512]
        x = self.fc(x).permute(1, 0, 2)  # layout required by CTC: [T, B, C]
        return x.log_softmax(dim=2)

6.3 部署优化
使用ONNX加速推理:

python

# Export the model to ONNX format.
# IMAGE_HEIGHT / IMAGE_WIDTH must be the image size the model was trained on.

dummy_input = torch.randn(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH)
torch.onnx.export(
    model,
    dummy_input,
    "captcha_model.onnx",
    input_names=["input"],
    output_names=["output"],
    # The batch dimension is axis 0 of the input but axis 1 of the output,
    # because the model returns a time-major tensor.
    dynamic_axes={
        "input": {0: "batch_size"},
        "output": {1: "batch_size"},
    },
)

# Run inference with ONNX Runtime.

import onnxruntime as ort

ort_session = ort.InferenceSession("captcha_model.onnx")

def predict_onnx(image):
    """Recognise one captcha image with the exported ONNX model.

    Args:
        image: Image accepted by the module-level ``transform`` pipeline.

    Returns:
        The decoded captcha text.
    """
    input_name = ort_session.get_inputs()[0].name
    output_name = ort_session.get_outputs()[0].name

    # Preprocess the image into a batch of one.
    batch = transform(image).unsqueeze(0).numpy()

    # Run inference; the single requested output is time-major [T, 1, C].
    scores = ort_session.run([output_name], {input_name: batch})[0]

    # Greedy CTC decoding: best class per time step, merge consecutive
    # repeats, drop blanks.  The blank index follows the training setup
    # (the class right after the last real character).
    blank = len(CHAR_SET)
    best_path = scores.argmax(axis=2)[:, 0].tolist()
    chars = []
    previous = blank
    for idx in best_path:
        if idx != blank and idx != previous:
            chars.append(CHAR_SET[idx])
        previous = idx

    return ''.join(chars)
posted @ 2025-05-10 21:57  tmcor  阅读(118)  评论(0)    收藏  举报