图像验证码识别:基于 PyTorch 的完整实现流程

验证码是很多网站验证机制中的关键环节,识别图像验证码是一个典型的图像分类与序列预测任务。本文将介绍如何利用 PyTorch 实现一个从数据生成到模型预测的验证码识别系统。

步骤一:准备工作
安装项目所需的基础库:

pip install torch torchvision pillow captcha
字符集定义为数字与大写英文字母,共 36 类。

步骤二:自动生成验证码图像数据
由于真实数据不易获取,我们用 captcha 库生成图像样本。

from captcha.image import ImageCaptcha
import os
import random
import string

# Output directory for the generated samples; created up front so that
# saving images never fails on a missing folder.
image_dir = "captcha_data"
os.makedirs(image_dir, exist_ok=True)

# Label alphabet: '0'-'9' followed by 'A'-'Z' (36 symbols). The position of a
# character in this string is its class index.
characters = string.digits + string.ascii_uppercase

# Renders captcha text as a 160x60 image.
generator = ImageCaptcha(width=160, height=60)

def create_dataset(count=5000):
    """Generate *count* random 4-character captcha images into ``image_dir``.

    Each image is saved as ``<text>_<index>.png``. The text before the
    underscore is the ground-truth label, and the index keeps file names
    unique when the same text is drawn more than once.

    Args:
        count: Number of images to generate.
    """
    for i in range(count):
        text = ''.join(random.choices(characters, k=4))
        image = generator.generate_image(text)
        image.save(os.path.join(image_dir, f"{text}_{i}.png"))

# Generate the default 5000 samples into image_dir so the later steps have data to load.
create_dataset()
步骤三:加载和预处理数据
创建自定义 Dataset 类,将图像与标签进行绑定。

from torch.utils.data import Dataset
from PIL import Image
import torch
from torchvision import transforms

class CaptchaDataset(Dataset):
    """Dataset of captcha PNG images whose labels are encoded in the file name.

    Files are expected to be named ``<text>_<index>.png``; the part before the
    first underscore is the captcha text. Each item is a pair
    ``(image_tensor, label_tensor)`` where ``label_tensor`` holds one class
    index per character (indices into the module-level ``characters`` string).
    """

    def __init__(self, folder):
        """Index the PNG files in *folder* and build the preprocessing pipeline.

        Args:
            folder: Directory containing the captcha ``.png`` files.
        """
        self.folder = folder
        # os.listdir returns entries in arbitrary order; sort so that a given
        # index always refers to the same file.
        self.files = sorted(f for f in os.listdir(folder) if f.endswith(".png"))
        self.char_to_idx = {ch: i for i, ch in enumerate(characters)}
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)),
        ])

    def __getitem__(self, idx):
        """Return the transformed image and its per-character label indices.

        Raises:
            KeyError: If the file name contains a character outside ``characters``.
        """
        file = self.files[idx]
        label = file.split("_")[0]
        label_tensor = torch.tensor([self.char_to_idx[c] for c in label], dtype=torch.long)
        # Open inside a context manager so the file handle is released
        # immediately instead of waiting for garbage collection.
        with Image.open(os.path.join(self.folder, file)) as img:
            image = img.convert("RGB")
        return self.transform(image), label_tensor

    def __len__(self):
        """Return the number of indexed image files."""
        return len(self.files)

步骤四:构建模型结构
结合卷积网络和循环神经网络提取图像和序列特征。

import torch.nn as nn

class CaptchaRecognizer(nn.Module):
    """CNN feature extractor followed by a bidirectional LSTM and a classifier.

    The CNN halves the spatial resolution twice. Each remaining image column
    is then treated as one time step of a sequence that the LSTM processes,
    and a linear layer produces class scores for every time step.
    """

    def __init__(self, num_classes=None, image_height=60):
        """Build the network layers.

        Args:
            num_classes: Number of output classes per time step. Defaults to
                the size of the module-level ``characters`` alphabet.
            image_height: Height in pixels of the input images; determines the
                LSTM input size. Defaults to 60, matching the generated data.
        """
        super().__init__()
        if num_classes is None:
            num_classes = len(characters)
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
        )
        # Two 2x max-poolings shrink the height by a factor of 4; each column
        # fed to the LSTM is the 64 channels stacked over that reduced height.
        feature_height = image_height // 4
        hidden_size = 128
        self.rnn = nn.LSTM(
            input_size=64 * feature_height,
            hidden_size=hidden_size,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
        )
        # Bidirectional LSTM concatenates forward and backward hidden states.
        self.classifier = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        """Map an image batch ``[B, 3, H, W]`` to scores ``[B, W // 4, num_classes]``."""
        x = self.cnn(x)  # [B, C, H', W']
        b, c, h, w = x.size()
        # Move width to the sequence axis and flatten channels x height into
        # the per-step feature vector.
        x = x.permute(0, 3, 1, 2).reshape(b, w, c * h)  # [B, W', C*H']
        x, _ = self.rnn(x)
        out = self.classifier(x)
        return out  # [B, W', num_classes]

步骤五:训练模型
使用交叉熵损失:对每个字符位置分别做多分类,并将各位置的损失相加。

from torch.utils.data import DataLoader

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data pipeline: shuffled mini-batches of 64 samples.
dataset = CaptchaDataset("captcha_data")
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Model, loss and optimizer.
model = CaptchaRecognizer().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train for 10 epochs; the loss printed per epoch is the sum over all batches.
for epoch in range(10):
    model.train()
    total_loss = 0.0
    for imgs, labels in dataloader:
        imgs, labels = imgs.to(device), labels.to(device)
        preds = model(imgs)  # [B, W, C]
        # Supervise one output time step per label character: step i is
        # trained to predict character i. Using the label length instead of a
        # literal 4 keeps this in sync with the data.
        loss = sum(
            criterion(preds[:, i, :], labels[:, i])
            for i in range(labels.size(1))
        )
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
步骤六:预测新验证码图像

def predict(model, path, num_chars=4):
    """Predict the text of the captcha image stored at *path*.

    The model emits scores for every time step, but training only supervises
    the first ``num_chars`` steps (one per label character), so only those
    steps are decoded here.

    Args:
        model: Trained network returning scores shaped ``[B, W, num_classes]``.
        path: Path to the image file to recognise.
        num_chars: Number of characters in a captcha. Defaults to 4, matching
            the generated data.

    Returns:
        The predicted string of ``num_chars`` characters.
    """
    model.eval()
    # Close the file handle as soon as the pixel data has been converted.
    with Image.open(path) as img:
        image = img.convert("RGB")
    image_tensor = dataset.transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(image_tensor)  # [1, W, num_classes]
    # Keep only the supervised positions; the remaining steps are untrained.
    prediction = output[0, :num_chars].argmax(dim=1).tolist()
    return ''.join(characters[i] for i in prediction)

# File names contain random text, so a hard-coded name is very unlikely to
# exist; use a file that the dataset actually indexed instead.
sample_path = os.path.join(dataset.folder, dataset.files[0])
print(predict(model, sample_path))

posted @ 2025-05-09 18:36  ttocr、com  阅读(25)  评论(0)    收藏  举报