基于 PyTorch 的端到端验证码识别系统设计与实现

一、背景简介
传统验证码识别方法依赖图像预处理和字符切割,抗干扰能力差。为此,近年来端到端识别方法被广泛采用:它结合卷积神经网络(CNN)与循环神经网络(RNN),并使用 CTC 损失进行训练,无需切割字符即可识别整个验证码序列。本文将介绍一个基于 PyTorch 框架构建的端到端验证码识别模型。

二、验证码特性说明
验证码图像尺寸统一为 160x60,内容为 4 位或 5 位(同一数据集内长度固定)的数字与大写英文字母混合字符串,字体扭曲、存在背景噪声和干扰线,适合使用 CNN + RNN + CTC 的结构识别。

三、环境与依赖
更多内容访问ttocr.com或联系1436423940
pip install torch torchvision matplotlib numpy opencv-python captcha pillow
四、数据准备与生成(captcha_dataset.py)
我们使用 captcha 库生成数据:

import os
import random
import string

import numpy as np
import torch
from captcha.image import ImageCaptcha
from PIL import Image
from torch.utils.data import Dataset

# Captcha alphabet: the ten digits followed by the uppercase letters A-Z.
ALL_CHARS = string.digits + string.ascii_uppercase
# Character -> class index, numbered in alphabet order starting at 0.
CHAR2IDX = dict(zip(ALL_CHARS, range(len(ALL_CHARS))))
# Class index -> character; the inverse of CHAR2IDX.
IDX2CHAR = dict(enumerate(ALL_CHARS))

def generate_code(length=4):
    """Return a random captcha string of ``length`` characters.

    Characters are drawn independently and with replacement from
    ``ALL_CHARS``, so the same character may appear more than once.
    """
    return ''.join(random.choices(ALL_CHARS, k=length))

class CaptchaDataset(Dataset):
    """In-memory dataset of synthetic captcha images and their labels.

    Every sample is rendered once in the constructor and kept in memory, so
    indexing afterwards is a plain list lookup.
    """

    def __init__(self, size=10000, length=4, width=160, height=60):
        """Render ``size`` random captchas.

        Args:
            size: Number of samples to generate.
            length: Number of characters in every captcha string.
            width: Image width in pixels.
            height: Image height in pixels.
        """
        self.data = []
        self.labels = []
        image_gen = ImageCaptcha(width, height)
        for _ in range(size):
            text = generate_code(length)
            # Grayscale keeps a single channel, matching the model's Conv2d(1, ...).
            image = image_gen.generate_image(text).convert('L')
            image = image.resize((width, height))
            # Scale pixels to [0, 1] and add a leading channel axis -> [1, H, W].
            self.data.append(torch.tensor(np.array(image) / 255.0).unsqueeze(0).float())
            self.labels.append([CHAR2IDX[c] for c in text])

    def __getitem__(self, idx):
        """Return ``(image, label)``: a float image tensor and a long index tensor."""
        # An explicit dtype keeps labels integral even for an empty label list.
        return self.data[idx], torch.tensor(self.labels[idx], dtype=torch.long)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

五、模型设计(captcha_model.py)

import torch.nn as nn

class CaptchaModel(nn.Module):
    """CNN + bidirectional LSTM captcha recogniser producing CTC log-probabilities."""

    def __init__(self, num_classes, input_height=60):
        """Build the network.

        Args:
            num_classes: Number of real character classes; one extra output
                is added for the CTC blank.
            input_height: Height in pixels of the input images. The two 2x
                max-pool layers reduce it by a factor of 4, which fixes the
                per-timestep feature size fed to the LSTM. The default of 60
                gives the original ``128 * 15`` features.
        """
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU()
        )
        feature_size = 128 * (input_height // 4)
        self.rnn = nn.LSTM(feature_size, 128, num_layers=2, bidirectional=True, batch_first=True)
        # Bidirectional LSTM concatenates both directions: 2 * 128 = 256 features.
        self.classifier = nn.Linear(256, num_classes + 1)  # +1 for CTC blank

    def forward(self, x):
        """Map images ``[B, 1, H, W]`` to log-probabilities ``[B, W/4, num_classes + 1]``."""
        out = self.cnn(x)  # [B, 128, H/4, W/4], e.g. [B, 128, 15, 40]
        b, c, h, w = out.size()
        # Treat each image column as one timestep: [B, W/4, 128 * H/4].
        out = out.permute(0, 3, 1, 2).contiguous().view(b, w, -1)
        out, _ = self.rnn(out)  # [B, T, 256]
        out = self.classifier(out)  # [B, T, num_classes + 1]
        return out.log_softmax(2)

六、训练流程(train.py)

from captcha_dataset import CaptchaDataset, CHAR2IDX, IDX2CHAR
from captcha_model import CaptchaModel
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam

def train():
    """Train a CaptchaModel on freshly generated captchas and save its weights.

    Runs 10 epochs of Adam (lr=0.001) over a default ``CaptchaDataset`` with
    CTC loss, prints the last batch's loss after each epoch, and writes the
    state dict to ``captcha_model.pth``.
    """
    dataset = CaptchaDataset()
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

    model = CaptchaModel(num_classes=len(CHAR2IDX))
    # The blank index is the extra class appended after the real characters.
    criterion = nn.CTCLoss(blank=len(CHAR2IDX))
    optimizer = Adam(model.parameters(), lr=0.001)

    model.train()
    for epoch in range(10):
        for images, labels in dataloader:
            preds = model(images)  # [B, T, C]
            preds = preds.permute(1, 0, 2)  # CTC expects [T, B, C]

            # Every sample uses all T timesteps and has the same label length,
            # so both length vectors are constant across the batch.
            input_lengths = torch.full((images.size(0),), preds.size(0), dtype=torch.long)
            target_lengths = torch.full((images.size(0),), labels.size(1), dtype=torch.long)

            loss = criterion(preds, labels, input_lengths, target_lengths)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    torch.save(model.state_dict(), 'captcha_model.pth')

# Start training only when this file is run as a script, not when imported.
if __name__ == "__main__":
    train()
七、预测与解码

def decode(preds, idx2char=None, blank=None):
    """Greedy CTC decoding of a single sample.

    Takes the arg-max class at every timestep, collapses consecutive
    repeats, and drops blanks.

    Args:
        preds: Model output of shape ``[1, T, C]`` (one sample).
        idx2char: Mapping from class index to character. Defaults to the
            module-level ``IDX2CHAR``.
        blank: Index of the CTC blank class. Defaults to ``len(CHAR2IDX)``.

    Returns:
        The decoded string.
    """
    if idx2char is None:
        idx2char = IDX2CHAR
    if blank is None:
        blank = len(CHAR2IDX)

    best_path = preds.argmax(2).squeeze(0).tolist()
    result = []
    prev = -1
    for p in best_path:
        if p != prev and p != blank:  # Skip repeats and the CTC blank
            result.append(idx2char[p])
        # Track every timestep, blanks included, so that a blank between two
        # identical characters lets the second one be emitted.
        prev = p
    return ''.join(result)

posted @ 2025-07-29 10:46  ttocr、com  阅读(5)  评论(0)    收藏  举报