手写汉字识别

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
from torch.optim import AdamW, SGD
import torchvision.models as models
from PIL import Image
import numpy as np
import warnings
warnings.filterwarnings("ignore", message=".weights_only.")

设置随机种子以保证可重复性

def set_seed(seed=42):
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

set_seed(42)

定义生成图像集路径文档的函数

def classes_txt(root, out_path, num_class=None):
dirs = os.listdir(root)
if not num_class:
num_class = len(dirs)
if not os.path.exists(out_path):
with open(out_path, 'w') as f:
pass
with open(out_path, 'r+') as f:
try:
end = int(f.readlines()[-1].split(' ')[-1]) + 1
except:
end = 0
if end < num_class - 1:
dirs.sort()
dirs = dirs[end:num_class]
for dir in dirs:
files = os.listdir(os.path.join(root, dir))
for file in files:
f.write(os.path.join(root, dir, file) + '\n')

class MyDataset(Dataset):
def init(self, txt_path, num_class, transforms=None):
super().init()
images = []
labels = []
with open(txt_path, 'r') as f:
for line in f:
if int(line.split('\')[-2]) >= num_class:
break
line = line.strip('\n')
images.append(line)
labels.append(int(line.split('\')[-2]))
self.images = images
self.labels = labels
self.transforms = transforms

def __getitem__(self, index):
    image = Image.open(self.images[index]).convert('RGB')
    label = self.labels[index]
    if self.transforms is not None:
        image = self.transforms(image)
    return image, label

def __len__(self):
    return len(self.labels)

========== 高级数据增强 ==========

train_transform = transforms.Compose([
transforms.Resize((128, 128)), # 更大的图像尺寸
transforms.RandomRotation(15), # 更大的旋转角度
transforms.RandomAffine(degrees=0, translate=(0.15, 0.15), scale=(0.85, 1.15)), # 平移和缩放
transforms.RandomHorizontalFlip(p=0.5),
transforms.RandomVerticalFlip(p=0.05), # 垂直翻转（少量）
transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.2, hue=0.1),
transforms.RandomGrayscale(p=0.1), # 随机灰度化
transforms.RandomApply([transforms.GaussianBlur(3, sigma=(0.1, 2.0))], p=0.2), # 高斯模糊
transforms.Grayscale(num_output_channels=1),
transforms.ToTensor(),
transforms.Normalize(mean=[0.5], std=[0.5])
])

val_test_transform = transforms.Compose([
transforms.Resize((128, 128)),
transforms.Grayscale(num_output_channels=1),
transforms.ToTensor(),
transforms.Normalize(mean=[0.5], std=[0.5])
])

========== 高级模型架构 ==========

class AdvancedNet(nn.Module):
def init(self, num_classes=10, dropout_rate=0.5):
super(AdvancedNet, self).init()

    # 第一个卷积块
    self.conv1 = nn.Conv2d(1, 64, 3, padding=1)
    self.bn1 = nn.BatchNorm2d(64)
    self.conv2 = nn.Conv2d(64, 64, 3, padding=1)
    self.bn2 = nn.BatchNorm2d(64)
    self.pool1 = nn.MaxPool2d(2, 2)
    self.dropout1 = nn.Dropout2d(dropout_rate/2)
    
    # 第二个卷积块
    self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
    self.bn3 = nn.BatchNorm2d(128)
    self.conv4 = nn.Conv2d(128, 128, 3, padding=1)
    self.bn4 = nn.BatchNorm2d(128)
    self.pool2 = nn.MaxPool2d(2, 2)
    self.dropout2 = nn.Dropout2d(dropout_rate/2)
    
    # 第三个卷积块
    self.conv5 = nn.Conv2d(128, 256, 3, padding=1)
    self.bn5 = nn.BatchNorm2d(256)
    self.conv6 = nn.Conv2d(256, 256, 3, padding=1)
    self.bn6 = nn.BatchNorm2d(256)
    self.pool3 = nn.MaxPool2d(2, 2)
    self.dropout3 = nn.Dropout2d(dropout_rate/2)
    
    # 第四个卷积块
    self.conv7 = nn.Conv2d(256, 512, 3, padding=1)
    self.bn7 = nn.BatchNorm2d(512)
    self.conv8 = nn.Conv2d(512, 512, 3, padding=1)
    self.bn8 = nn.BatchNorm2d(512)
    self.pool4 = nn.AdaptiveAvgPool2d((4, 4))  # 自适应池化
    self.dropout4 = nn.Dropout2d(dropout_rate/2)
    
    # 全连接层
    self.fc1 = nn.Linear(512 * 4 * 4, 1024)
    self.bn9 = nn.BatchNorm1d(1024)
    self.dropout5 = nn.Dropout(dropout_rate)
    self.fc2 = nn.Linear(1024, 512)
    self.bn10 = nn.BatchNorm1d(512)
    self.dropout6 = nn.Dropout(dropout_rate)
    self.fc3 = nn.Linear(512, num_classes)

def forward(self, x):
    # 第一个卷积块
    x = F.relu(self.bn1(self.conv1(x)))
    x = F.relu(self.bn2(self.conv2(x)))
    x = self.pool1(x)
    x = self.dropout1(x)
    
    # 第二个卷积块
    x = F.relu(self.bn3(self.conv3(x)))
    x = F.relu(self.bn4(self.conv4(x)))
    x = self.pool2(x)
    x = self.dropout2(x)
    
    # 第三个卷积块
    x = F.relu(self.bn5(self.conv5(x)))
    x = F.relu(self.bn6(self.conv6(x)))
    x = self.pool3(x)
    x = self.dropout3(x)
    
    # 第四个卷积块
    x = F.relu(self.bn7(self.conv7(x)))
    x = F.relu(self.bn8(self.conv8(x)))
    x = self.pool4(x)
    x = self.dropout4(x)
    
    # 全连接层
    x = x.view(-1, 512 * 4 * 4)
    x = F.relu(self.bn9(self.fc1(x)))
    x = self.dropout5(x)
    x = F.relu(self.bn10(self.fc2(x)))
    x = self.dropout6(x)
    x = self.fc3(x)
    
    return x

========== 高级训练策略 ==========

def train_advanced_model(model, train_loader, val_loader, num_epochs=50):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# 使用AdamW优化器
optimizer = AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

# 组合学习率调度器
scheduler1 = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)
scheduler2 = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

# 标签平滑的损失函数
class LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, smoothing=0.1):
        super(LabelSmoothingCrossEntropy, self).__init__()
        self.smoothing = smoothing
    
    def forward(self, x, target):
        confidence = 1. - self.smoothing
        logprobs = F.log_softmax(x, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
        nll_loss = nll_loss.squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()

criterion = LabelSmoothingCrossEntropy(smoothing=0.1)

best_acc = 0.0
patience_counter = 0
patience = 10  # 早停耐心值

for epoch in range(num_epochs):
    # 训练阶段
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0
    
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        
        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        running_loss += loss.item()
        
        # 计算训练准确率
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        
        if i % 50 == 49:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {running_loss/50:.4f}')
            running_loss = 0.0
    
    train_accuracy = 100 * correct_train / total_train
    
    # 验证阶段
    model.eval()
    correct_val = 0
    total_val = 0
    val_loss = 0.0
    
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()
    
    val_accuracy = 100 * correct_val / total_val
    avg_val_loss = val_loss / len(val_loader)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Acc: {train_accuracy:.2f}%, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%')
    
    # 学习率调整
    scheduler1.step()
    scheduler2.step(avg_val_loss)
    
    # 保存最佳模型
    if val_accuracy > best_acc:
        best_acc = val_accuracy
        torch.save(model.state_dict(), 'best_model.pkl')
        print(f'New best model saved with validation accuracy: {best_acc:.2f}%')
        patience_counter = 0
    else:
        patience_counter += 1
    
    # 早停
    if patience_counter >= patience:
        print(f'Early stopping at epoch {epoch+1}')
        break

return best_acc

创建验证集

def create_datasets(train_txt_path, test_txt_path, num_classes=10):
# 加载完整训练集
full_train_set = MyDataset(train_txt_path, num_classes, transforms=train_transform)

# 划分训练集和验证集 (85% 训练, 15% 验证)
train_size = int(0.85 * len(full_train_set))
val_size = len(full_train_set) - train_size
train_set, val_set = random_split(full_train_set, [train_size, val_size])

# 测试集
test_set = MyDataset(test_txt_path, num_classes, transforms=val_test_transform)

return train_set, val_set, test_set

测试时间增强 (TTA)

def test_time_augmentation(model, dataloader, device, num_augmentations=5):
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for images, labels in dataloader:
        images, labels = images.to(device), labels.to(device)
        batch_predictions = []
        
        # 原始图像预测
        outputs = model(images)
        batch_predictions.append(F.softmax(outputs, dim=1))
        
        # 数据增强预测
        augmentations = [
            transforms.Compose([
                transforms.RandomRotation(10),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5], std=[0.5])
            ]),
            transforms.Compose([
                transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5], std=[0.5])
            ]),
            transforms.Compose([
                transforms.ColorJitter(brightness=0.2, contrast=0.2),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5], std=[0.5])
            ])
        ]
        
        for aug in augmentations[:num_augmentations-1]:
            augmented_images = torch.stack([aug(transforms.ToPILImage()(img)) for img in images.cpu()])
            augmented_images = augmented_images.to(device)
            outputs = model(augmented_images)
            batch_predictions.append(F.softmax(outputs, dim=1))
        
        # 平均所有预测
        avg_predictions = torch.stack(batch_predictions).mean(dim=0)
        _, predicted = torch.max(avg_predictions, 1)
        
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = 100 * np.sum(np.array(all_predictions) == np.array(all_labels)) / len(all_labels)
return accuracy

主函数

def main():
# 使用英文路径避免编码问题
root = 'D:/深度学习/手写汉字识别/data'
model_save_dir = 'D:/深度学习/手写汉字识别/tmp'

# 首先生成TXT文件
classes_txt(root + '/train', root + '/train.txt', num_class=10)
classes_txt(root + '/test', root + '/test.txt', num_class=10)

# 创建数据集
train_set, val_set, test_set = create_datasets(
    root + '/train.txt', 
    root + '/test.txt', 
    num_classes=10
)

# 创建数据加载器 - 使用较小的批量大小
train_loader = DataLoader(train_set, batch_size=16, shuffle=True, num_workers=0)
val_loader = DataLoader(val_set, batch_size=16, shuffle=False, num_workers=0)
test_loader = DataLoader(test_set, batch_size=16, shuffle=False, num_workers=0)

print(f"训练集大小: {len(train_set)}")
print(f"验证集大小: {len(val_set)}")
print(f"测试集大小: {len(test_set)}")

# 创建高级模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AdvancedNet(num_classes=10, dropout_rate=0.5).to(device)
print("高级模型已创建，开始训练...")

# 训练模型
best_val_acc = train_advanced_model(
    model, train_loader, val_loader, num_epochs=100
)

# 加载最佳模型
model.load_state_dict(torch.load('best_model.pkl', weights_only=True))

# 在测试集上评估（不使用TTA）
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = 100 * correct / total
print(f'标准测试集准确率: {test_accuracy:.2f}%')

# 使用测试时间增强(TTA)进行评估
tta_accuracy = test_time_augmentation(model, test_loader, device)
print(f'TTA测试集准确率: {tta_accuracy:.2f}%')

# 保存模型权重
os.makedirs(model_save_dir, exist_ok=True)
model_save_path = os.path.join(model_save_dir, 'advanced_model.pkl')
torch.save(model.state_dict(), model_save_path)
print(f'高级模型已保存到: {model_save_path}')

# 单张图像预测示例
test_img_path = 'D:/深度学习/手写汉字识别/data/test/00008/12313.png'
try:
    img = Image.open(test_img_path).convert('RGB')
    img_transform = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5])
    ])
    img_tensor = img_transform(img)
    img_tensor = img_tensor.unsqueeze(0).to(device)
    
    model.eval()
    with torch.no_grad():
        output = model(img_tensor)
        _, prediction = torch.max(output, 1)
        prediction = prediction.cpu().numpy()[0]
    
    print(f'单张图像预测结果: {prediction}')
except Exception as e:
    print(f'预测时出错: {e}')

if name == 'main':
main()

posted @ 2025-10-30 21:21 XiaoguoLu 阅读(15) 评论(0) 收藏举报

刷新页面返回顶部

xiaoguo1111