Deep Learning CAPTCHA Recognition in Practice: Building an End-to-End Recognition System from Scratch
1. Project Overview and Preparation
This tutorial walks through building a complete deep-learning-based CAPTCHA recognition system from scratch with PyTorch, covering the full pipeline: data generation, model design, training and optimization, and deployment.
1.1 Environment Setup
First, make sure the following Python libraries are installed:
bash
pip install torch torchvision pillow matplotlib numpy captcha flask
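Before moving on, it can help to confirm that PyTorch is installed correctly and whether a GPU is visible. The short check below is an optional addition; everything in this tutorial also runs on CPU, just more slowly.
python
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())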
1.2 CAPTCHA Generator
We start by generating training data. Create captcha_generator.py:
python
from captcha.image import ImageCaptcha
import random
import string
import os
# Configuration parameters
CHAR_SET = string.digits + string.ascii_uppercase  # digits and uppercase letters
CAPTCHA_LENGTH = 5  # number of characters per CAPTCHA
IMAGE_WIDTH, IMAGE_HEIGHT = 180, 60  # image size
DATA_DIR = "captcha_data"  # data directory
TRAIN_RATIO = 0.8  # fraction of samples used for training
def generate_captcha_dataset(num_samples=20000):
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)
        os.makedirs(os.path.join(DATA_DIR, "train"))
        os.makedirs(os.path.join(DATA_DIR, "test"))
    generator = ImageCaptcha(width=IMAGE_WIDTH, height=IMAGE_HEIGHT)
    for i in range(num_samples):
        captcha_text = ''.join(random.choices(CHAR_SET, k=CAPTCHA_LENGTH))
        image = generator.generate_image(captcha_text)
        # Split into training and test sets according to TRAIN_RATIO
        if random.random() < TRAIN_RATIO:
            save_path = os.path.join(DATA_DIR, "train", f"{captcha_text}_{i}.png")
        else:
            save_path = os.path.join(DATA_DIR, "test", f"{captcha_text}_{i}.png")
        image.save(save_path)
        if (i + 1) % 1000 == 0:
            print(f"Generated {i + 1} CAPTCHA images")
if __name__ == "__main__":
    generate_captcha_dataset(20000)
    print("CAPTCHA dataset generation complete!")
Running this script generates 20,000 CAPTCHA images and splits them into training and test sets at roughly an 8:2 ratio.
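Before training, it is worth spot-checking a few of the generated images. The snippet below is an optional sketch that assumes the directory layout produced by captcha_generator.py and displays five random training samples with matplotlib:
python
import os
import random
import matplotlib.pyplot as plt
from PIL import Image

train_dir = os.path.join("captcha_data", "train")
samples = random.sample(os.listdir(train_dir), 5)
fig, axes = plt.subplots(1, 5, figsize=(15, 3))
for ax, fname in zip(axes, samples):
    ax.imshow(Image.open(os.path.join(train_dir, fname)))
    ax.set_title(fname.split('_')[0])  # the label is encoded in the filename
    ax.axis('off')
plt.show()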
2. Data Loading and Preprocessing
Create data_loader.py to load and preprocess the data:
python
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import numpy as np
from torchvision import transforms
from captcha_generator import CHAR_SET  # character set shared with the generator script
class CaptchaDataset(Dataset):
    def __init__(self, data_dir, char_set, transform=None):
        self.data_dir = data_dir
        self.image_files = [f for f in os.listdir(data_dir) if f.endswith('.png')]
        self.transform = transform
        self.char_to_idx = {c: i for i, c in enumerate(char_set)}
        self.idx_to_char = {i: c for i, c in enumerate(char_set)}

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = os.path.join(self.data_dir, self.image_files[idx])
        image = Image.open(img_path).convert('RGB')
        label_str = self.image_files[idx].split('_')[0]
        # Convert the label string to a list of character indices
        label = [self.char_to_idx[c] for c in label_str]
        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(label, dtype=torch.long), label_str
def get_data_loaders(batch_size=64):
    # Light data augmentation plus normalization
    transform = transforms.Compose([
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    # Load the datasets
    train_dataset = CaptchaDataset(
        os.path.join("captcha_data", "train"),
        CHAR_SET,
        transform
    )
    test_dataset = CaptchaDataset(
        os.path.join("captcha_data", "test"),
        CHAR_SET,
        transform
    )
    # Wrap them in DataLoaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )
    return train_loader, test_loader
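A quick sanity check of the loaders (an optional addition, not part of data_loader.py) helps catch shape or label problems early. With the settings above, the first batch should contain image tensors of shape [64, 3, 60, 180] and label tensors of shape [64, 5]:
python
from data_loader import get_data_loaders

train_loader, test_loader = get_data_loaders(batch_size=64)
images, labels, label_strs = next(iter(train_loader))
print(images.shape)    # expected: torch.Size([64, 3, 60, 180])
print(labels.shape)    # expected: torch.Size([64, 5])
print(label_strs[:3])  # e.g. three 5-character strings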
3. Model Construction
Create model.py to define the deep learning model:
python
import torch
import torch.nn as nn
import torch.nn.functional as F
from captcha_generator import IMAGE_HEIGHT, IMAGE_WIDTH  # image size, needed to infer feature-map dimensions
class CaptchaModel(nn.Module):
    def __init__(self, num_chars):
        super(CaptchaModel, self).__init__()
        # CNN feature extractor
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(2)
        # Size of the flattened feature column fed to the RNN at each time step
        self.conv_output_size = self._get_conv_output_size()
        # RNN sequence model
        self.gru = nn.GRU(
            input_size=self.conv_output_size,
            hidden_size=128,
            num_layers=2,
            bidirectional=True,
            dropout=0.2,
            batch_first=True  # inputs arrive as [B, T, features]
        )
        # Output layer (bidirectional GRU doubles the hidden size)
        self.fc = nn.Linear(256, num_chars)

    def _get_conv_output_size(self):
        # Run a dummy tensor through the conv stack to measure the feature-map size
        with torch.no_grad():
            dummy_input = torch.zeros(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH)
            output = self.pool1(F.relu(self.bn1(self.conv1(dummy_input))))
            output = self.pool2(F.relu(self.bn2(self.conv2(output))))
            output = self.pool3(F.relu(self.bn3(self.conv3(output))))
        return output.size(1) * output.size(2)  # channels * height

    def forward(self, x):
        # CNN part
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        # Use the width dimension as the time axis for the RNN
        batch_size = x.size(0)
        x = x.permute(0, 3, 1, 2)  # [B, W, C, H]
        x = x.reshape(batch_size, -1, self.conv_output_size)
        # RNN part
        x, _ = self.gru(x)
        # Output layer
        x = self.fc(x)
        x = x.permute(1, 0, 2)  # [T, B, num_chars], the layout CTC loss expects
        return F.log_softmax(x, dim=2)  # CTC loss expects log-probabilities
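To confirm that the dimensions flow through correctly, push a dummy batch through the model. With 60×180 inputs and three 2×2 poolings the width shrinks to 22, so the output should have 22 time steps in the [T, B, C] layout CTC loss expects. This check is an illustration, not part of model.py:
python
import torch
from model import CaptchaModel
from captcha_generator import CHAR_SET

model = CaptchaModel(len(CHAR_SET) + 1)  # +1 output class for the CTC blank
dummy = torch.randn(2, 3, 60, 180)
out = model(dummy)
print(out.shape)  # expected: torch.Size([22, 2, 37])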
4. Training and Validation
Create train.py to implement the training loop:
python
import torch
from torch import nn
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from data_loader import get_data_loaders
from model import CaptchaModel
from captcha_generator import CHAR_SET
import time
import os
# Training hyperparameters
BATCH_SIZE = 64
EPOCHS = 30
LEARNING_RATE = 0.001
GRAD_CLIP = 5
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MODEL_SAVE_PATH = "captcha_model.pth"
# Initialization
BLANK_IDX = len(CHAR_SET)  # index reserved for the CTC blank symbol
IDX_TO_CHAR = {i: c for i, c in enumerate(CHAR_SET)}
model = CaptchaModel(len(CHAR_SET) + 1).to(DEVICE)  # one extra output class for the CTC blank
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CTCLoss(blank=BLANK_IDX)  # CTC loss handles the unsegmented output sequence
train_loader, test_loader = get_data_loaders(BATCH_SIZE)
def train():
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for batch_idx, (images, targets, target_strs) in enumerate(train_loader):
        images = images.to(DEVICE)
        targets = targets.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(images)  # [T, B, num_classes] log-probabilities
        # Per-sample input and target lengths required by CTC loss
        input_lengths = torch.full(
            size=(outputs.size(1),),
            fill_value=outputs.size(0),
            dtype=torch.long
        )
        target_lengths = torch.full(
            size=(targets.size(0),),
            fill_value=targets.size(1),
            dtype=torch.long
        )
        loss = criterion(outputs, targets, input_lengths, target_lengths)
        loss.backward()
        clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
        total_loss += loss.item()
        # Accuracy via greedy CTC decoding: best class per time step,
        # collapse repeats, then drop the blank symbol
        preds = outputs.argmax(2).permute(1, 0)  # [B, T]
        for i in range(len(target_strs)):
            decoded = []
            prev = BLANK_IDX
            for p in preds[i].tolist():
                if p != prev and p != BLANK_IDX:
                    decoded.append(IDX_TO_CHAR[p])
                prev = p
            if ''.join(decoded) == target_strs[i]:
                correct += 1
            total += 1
        if (batch_idx + 1) % 100 == 0:
            print(f"Batch {batch_idx + 1}/{len(train_loader)}, Loss: {loss.item():.4f}")
    avg_loss = total_loss / len(train_loader)
    accuracy = correct / total
    return avg_loss, accuracy
def evaluate():
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, targets, target_strs in test_loader:
            images = images.to(DEVICE)
            targets = targets.to(DEVICE)
            outputs = model(images)
            # Per-sample input and target lengths required by CTC loss
            input_lengths = torch.full(
                size=(outputs.size(1),),
                fill_value=outputs.size(0),
                dtype=torch.long
            )
            target_lengths = torch.full(
                size=(targets.size(0),),
                fill_value=targets.size(1),
                dtype=torch.long
            )
            loss = criterion(outputs, targets, input_lengths, target_lengths)
            total_loss += loss.item()
            # Greedy CTC decoding, same as in train()
            preds = outputs.argmax(2).permute(1, 0)  # [B, T]
            for i in range(len(target_strs)):
                decoded = []
                prev = BLANK_IDX
                for p in preds[i].tolist():
                    if p != prev and p != BLANK_IDX:
                        decoded.append(IDX_TO_CHAR[p])
                    prev = p
                if ''.join(decoded) == target_strs[i]:
                    correct += 1
                total += 1
    avg_loss = total_loss / len(test_loader)
    accuracy = correct / total
    return avg_loss, accuracy
def main():
    best_accuracy = 0
    for epoch in range(1, EPOCHS + 1):
        start_time = time.time()
        train_loss, train_acc = train()
        val_loss, val_acc = evaluate()
        end_time = time.time()
        print(f"Epoch {epoch}/{EPOCHS} | Time: {end_time - start_time:.2f}s")
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
        # Save the best model so far
        if val_acc > best_accuracy:
            best_accuracy = val_acc
            torch.save(model.state_dict(), MODEL_SAVE_PATH)
            print(f"New best model saved with accuracy {val_acc:.4f}")


if __name__ == "__main__":
    main()
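CTC training can plateau after a while. One optional refinement, not included in the script above, is to lower the learning rate when validation accuracy stops improving, for example with PyTorch's ReduceLROnPlateau scheduler:
python
from torch.optim.lr_scheduler import ReduceLROnPlateau

# after creating the optimizer
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

# inside the epoch loop in main(), after evaluate():
#     scheduler.step(val_acc)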
5. Model Deployment and Application
Create app.py to expose a simple web service:
python
from flask import Flask, request, jsonify
from PIL import Image
import io
import torch
from model import CaptchaModel
from captcha_generator import CHAR_SET
from torchvision import transforms
app = Flask(__name__)

# Load the trained model
BLANK_IDX = len(CHAR_SET)
IDX_TO_CHAR = {i: c for i, c in enumerate(CHAR_SET)}
model = CaptchaModel(len(CHAR_SET) + 1)  # +1 output class for the CTC blank, same as training
model.load_state_dict(torch.load("captcha_model.pth", map_location='cpu'))
model.eval()

# Image preprocessing (same normalization as during training)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
@app.route('/predict', methods=['POST'])
def predict():
    if 'file' not in request.files:
        return jsonify({'error': 'No file uploaded'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'Empty filename'}), 400
    try:
        # Read and preprocess the image
        image = Image.open(io.BytesIO(file.read())).convert('RGB')
        image = transform(image).unsqueeze(0)
        # Predict
        with torch.no_grad():
            output = model(image)  # [T, 1, num_classes]
        # Greedy CTC decoding: collapse repeats and drop the blank symbol
        preds = output.argmax(2).squeeze(1).tolist()
        decoded = []
        prev = BLANK_IDX
        for p in preds:
            if p != prev and p != BLANK_IDX:
                decoded.append(IDX_TO_CHAR[p])
            prev = p
        return jsonify({'prediction': ''.join(decoded)})
    except Exception as e:
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
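With the service running, you can exercise the endpoint from another terminal. Below is a minimal client sketch using the requests library (an extra dependency, install it with pip install requests); the image path is a placeholder for one of your generated files:
python
import requests

# placeholder path; replace with an actual generated image
with open("captcha_data/test/sample.png", "rb") as f:
    resp = requests.post("http://localhost:5000/predict", files={"file": f})
print(resp.json())  # e.g. {'prediction': 'A3K9Q'}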
6. Project Extensions and Optimization
6.1 Improved Data Augmentation
Increase data diversity in data_loader.py (see the note after the code about keeping the test transform deterministic):
python
transform = transforms.Compose([
    transforms.RandomRotation(10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 2.0)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
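Note that heavy augmentation like this should only be applied to the training set; the test set should keep a deterministic transform so evaluation numbers stay comparable. A sketch of how the two transforms could be separated (a modification to get_data_loaders, not present in the original code):
python
train_transform = transforms.Compose([
    transforms.RandomRotation(10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 2.0)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
# pass train_transform to the train dataset and test_transform to the test dataset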
6.2 Model Architecture Optimization
Try a more advanced model architecture; a possible forward pass is sketched after the class definition:
python
class ImprovedCaptchaModel(nn.Module):
    def __init__(self, num_chars):
        super().__init__()
        # Deeper convolutional blocks (two conv layers per block)
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        # Spatial attention map
        self.attention = nn.Sequential(
            nn.Conv2d(64, 1, 1),
            nn.Sigmoid()
        )
        # Deeper bidirectional GRU
        self.gru = nn.GRU(
            input_size=64 * (IMAGE_HEIGHT // 4),  # features per time step when width (W/4 steps) is the sequence axis
            hidden_size=256,
            num_layers=3,
            bidirectional=True,
            dropout=0.3,
            batch_first=True
        )
        self.fc = nn.Linear(512, num_chars)
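The listing above only defines the layers; it does not include a forward pass. The sketch below shows one possible implementation, assuming the width axis is used as the time dimension (matching the GRU input size above) and the attention map is applied as a per-pixel weighting before the recurrent layers. Treat it as an illustration rather than a reference implementation:
python
    # One possible forward pass for ImprovedCaptchaModel (add inside the class)
    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)                  # [B, 64, H/4, W/4]
        x = x * self.attention(x)                # weight features by the attention map
        x = x.permute(0, 3, 1, 2)                # [B, W/4, 64, H/4]
        x = x.reshape(x.size(0), x.size(1), -1)  # [B, T, 64*(H/4)]
        x, _ = self.gru(x)
        x = self.fc(x)                           # [B, T, num_chars]
        return F.log_softmax(x.permute(1, 0, 2), dim=2)  # [T, B, num_chars] for CTC loss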
6.3 Deployment Optimization
Use ONNX to speed up inference:
python
# Export the trained model to ONNX format
dummy_input = torch.randn(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH)
torch.onnx.export(
    model,
    dummy_input,
    "captcha_model.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {0: "batch_size"},
        "output": {1: "batch_size"}  # the model output is [T, B, num_classes]
    }
)
# Run inference with ONNX Runtime
import onnxruntime as ort

ort_session = ort.InferenceSession("captcha_model.onnx")

def predict_onnx(image):
    input_name = ort_session.get_inputs()[0].name
    output_name = ort_session.get_outputs()[0].name
    # Preprocess the image
    image = transform(image).unsqueeze(0).numpy()
    # Run inference
    outputs = ort_session.run([output_name], {input_name: image})
    outputs = torch.tensor(outputs[0])  # [T, 1, num_classes]
    # Greedy CTC decoding: collapse repeats and drop the blank symbol
    preds = outputs.argmax(2).squeeze(1).tolist()
    decoded = []
    prev = BLANK_IDX
    for p in preds:
        if p != prev and p != BLANK_IDX:
            decoded.append(IDX_TO_CHAR[p])
        prev = p
    return ''.join(decoded)
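To verify that ONNX Runtime actually speeds things up on your hardware, a rough timing comparison such as the sketch below can be used; the image path is a placeholder and the numbers will vary from machine to machine:
python
import time
import torch
from PIL import Image

image = Image.open("captcha_data/test/sample.png").convert('RGB')  # placeholder test file

start = time.time()
with torch.no_grad():
    for _ in range(100):
        model(transform(image).unsqueeze(0))
print(f"PyTorch:      {(time.time() - start) / 100 * 1000:.2f} ms per image")

start = time.time()
for _ in range(100):
    predict_onnx(image)
print(f"ONNX Runtime: {(time.time() - start) / 100 * 1000:.2f} ms per image")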
