# Handwritten Chinese character recognition (手写汉字识别)

# -*- coding: utf-8 -*-

"""
Created on Thu Nov 13 23:49:06 2025

@author: cxy-fairytale
"""

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from PIL import Image, ImageDraw
import os

# Seed the PyTorch and NumPy random number generators so runs are reproducible.

torch.manual_seed(42)
np.random.seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

class BalancedChineseCharacterCNN(nn.Module):
    """Three-stage convolutional classifier for single-channel images.

    Each stage is Conv(3x3, padding=1) -> BatchNorm -> ReLU -> 2x2 max-pool.
    The resulting feature map is adaptively average-pooled to 4x4 so the
    fully-connected head always receives 128 * 4 * 4 features.

    Args:
        num_classes: Number of logits produced by the final linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Convolutional feature extractor: 1 -> 32 -> 64 -> 128 channels.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)

        self.pool = nn.MaxPool2d(2, 2)
        self.dropout1 = nn.Dropout(0.3)  # applied after conv stages 1 and 2
        self.dropout2 = nn.Dropout(0.5)  # applied before the output layer

        # Adaptive pooling fixes the spatial size fed to fc1, which avoids
        # hand-computing the flattened feature size.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((4, 4))

        # Classifier head.
        self.fc1 = nn.Linear(128 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return raw (un-normalised) class logits for a batch of images.

        Args:
            x: Tensor of shape (batch, 1, H, W).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # Stage 1
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.dropout1(x)

        # Stage 2
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.dropout1(x)

        # Stage 3 (no dropout here)
        x = self.pool(F.relu(self.bn3(self.conv3(x))))

        # Fixed 4x4 spatial size, then flatten everything except the batch axis.
        x = self.adaptive_pool(x)
        x = x.flatten(1)

        # Classifier head
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        return self.fc2(x)

def _character_strokes(size):
    """Return the stroke list for each of the 10 synthetic character classes.

    Every stroke is a ``(kind, coords, width)`` tuple where ``kind`` is
    ``"line"`` (coords ``x1, y1, x2, y2``), ``"rect"`` (coords
    ``x1, y1, x2, y2``) or ``"dot"`` (coords ``cx, cy``; width unused).
    """
    q, h, tq = size // 4, size // 2, 3 * size // 4
    t1, t2 = size // 3, 2 * size // 3
    return [
        # 0: horizontal + vertical cross (like "十")
        [("line", (q, h, tq, h), 2), ("line", (h, q, h, tq), 2)],
        # 1: single horizontal stroke (like "一")
        [("line", (q, h, tq, h), 3)],
        # 2: single vertical stroke (like "丨")
        [("line", (h, q, h, tq), 3)],
        # 3: box outline (like "口")
        [("rect", (q, q, tq, tq), 2)],
        # 4: horizontal stroke turning down on the left (like "厂")
        [("line", (q, t1, tq, t1), 2), ("line", (q, t1, q, tq), 2)],
        # 5: horizontal stroke with a dot above it
        [("line", (q, h, tq, h), 2), ("dot", (h, t1), None)],
        # 6: two horizontals crossing two verticals (like "井")
        [
            ("line", (q, t1, tq, t1), 1),
            ("line", (q, t2, tq, t2), 1),
            ("line", (t1, q, t1, tq), 1),
            ("line", (t2, q, t2, tq), 1),
        ],
        # 7: two diagonals meeting at the top (like "人")
        [("line", (q, tq, h, q), 2), ("line", (h, q, tq, tq), 2)],
        # 8: three vertically stacked dots (like "氵")
        [("dot", (t1, q), None), ("dot", (t1, h), None), ("dot", (t1, tq), None)],
        # 9: left vertical plus two strokes on the right
        [
            ("line", (q, q, q, tq), 2),
            ("line", (h, q, tq, h), 2),
            ("line", (h, tq, tq, tq), 2),
        ],
    ]


def _render_strokes(draw, strokes):
    """Draw every stroke of one template onto a PIL ``ImageDraw`` canvas."""
    for kind, coords, width in strokes:
        if kind == "line":
            x1, y1, x2, y2 = coords
            draw.line([(x1, y1), (x2, y2)], fill=255, width=width)
        elif kind == "rect":
            x1, y1, x2, y2 = coords
            draw.rectangle([(x1, y1), (x2, y2)], outline=255, width=width)
        else:  # "dot": filled circle of radius 2 around the centre
            cx, cy = coords
            draw.ellipse([(cx - 2, cy - 2), (cx + 2, cy + 2)], fill=255)


def create_balanced_chinese_characters(num_samples=1200, img_size=32, num_classes=10):
    """Generate a synthetic dataset of simple character-like stroke images.

    Each sample picks a class uniformly at random, draws that class's stroke
    template in white on a black ``img_size`` x ``img_size`` canvas, adds
    Gaussian noise, and with 20% probability overlays one faint random line.

    Args:
        num_samples: Number of images to generate.
        img_size: Side length of each square image, in pixels.
        num_classes: Number of classes to sample from; must be between 1 and
            the number of available templates (10).

    Returns:
        A tuple ``(images, labels)``: a float32 tensor of shape
        ``(num_samples, 1, img_size, img_size)`` with values in [0, 1], and
        an int64 tensor of shape ``(num_samples,)``.

    Raises:
        ValueError: If ``num_classes`` is outside the supported range.
    """
    templates = _character_strokes(img_size)
    if not 1 <= num_classes <= len(templates):
        raise ValueError(
            f"num_classes must be between 1 and {len(templates)}, got {num_classes}"
        )

    print("正在生成平衡的汉字数据...")

    characters = []
    labels = []

    for _ in range(num_samples):
        # Blank 8-bit greyscale canvas.
        img = Image.new('L', (img_size, img_size), color=0)
        draw = ImageDraw.Draw(img)

        # Pick a class at random and draw its template.
        label = np.random.randint(0, num_classes)
        _render_strokes(draw, templates[label])

        # Scale to [0, 1].
        img_array = np.array(img) / 255.0

        # Add Gaussian noise with a per-sample random strength to tune difficulty.
        noise_intensity = np.random.uniform(0.05, 0.15)
        noise = np.random.normal(0, noise_intensity, (img_size, img_size))
        img_array = np.clip(img_array + noise, 0, 1)

        # With 20% probability, overlay one faint distractor line.
        if np.random.random() > 0.8:
            x1, y1 = np.random.randint(0, img_size, 2)
            x2, y2 = np.random.randint(0, img_size, 2)
            # Round-trip through PIL to draw on the noisy image.
            temp_img = Image.fromarray((img_array * 255).astype(np.uint8))
            temp_draw = ImageDraw.Draw(temp_img)
            temp_draw.line(
                [(int(x1), int(y1)), (int(x2), int(y2))], fill=64, width=1
            )
            img_array = np.array(temp_img) / 255.0

        characters.append(img_array)
        labels.append(label)

    # Explicit dtypes keep the result well-defined even when no samples
    # were generated.
    characters = np.asarray(characters, dtype=np.float32).reshape(
        -1, 1, img_size, img_size
    )
    labels = np.asarray(labels, dtype=np.int64)

    print(f"生成了 {len(characters)} 个样本，{num_classes} 个类别")
    return torch.from_numpy(characters), torch.from_numpy(labels)

def train_balanced_model(model, train_loader, val_loader, num_epochs=20, learning_rate=0.001):
    """Train ``model`` with Adam and report validation accuracy every epoch.

    The model is moved to the module-level ``device``. Training uses
    cross-entropy loss, Adam with ``weight_decay=1e-4``, and a StepLR
    schedule that halves the learning rate every 8 epochs.

    Args:
        model: The network to train (modified in place).
        train_loader: Iterable of ``(data, target)`` training batches.
        val_loader: Iterable of ``(data, target)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Initial Adam learning rate.

    Returns:
        A tuple ``(train_losses, val_accuracies)`` holding one mean training
        loss and one validation accuracy per epoch.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

    # Halve the learning rate every 8 epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.5)

    train_losses = []
    val_accuracies = []

    print("开始训练模型...")
    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0.0

        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # ---- validation pass ----
        model.eval()
        val_predictions = []
        val_targets = []

        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                # Store predictions as flat class indices so they have the
                # same shape as the targets.
                val_predictions.extend(output.argmax(dim=1).cpu().tolist())
                val_targets.extend(target.cpu().tolist())

        val_accuracy = accuracy_score(val_targets, val_predictions)
        avg_loss = running_loss / len(train_loader)
        train_losses.append(avg_loss)
        val_accuracies.append(val_accuracy)

        # Advance the schedule; the rate printed below is therefore the one
        # the next epoch will use.
        scheduler.step()

        current_lr = optimizer.param_groups[0]['lr']
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, LR: {current_lr:.6f}')

    return train_losses, val_accuracies

def evaluate_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print a classification report.

    Batches are moved to the module-level ``device``; the model itself is not
    moved here, so it is expected to already live on that device.

    Args:
        model: The trained network.
        test_loader: Iterable of ``(data, target)`` test batches.

    Returns:
        A tuple ``(accuracy, all_predictions, all_targets)`` where the two
        lists hold one integer class index per test sample.
    """
    model.eval()
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Flat class indices, the same shape as the targets.
            all_predictions.extend(output.argmax(dim=1).cpu().tolist())
            all_targets.extend(target.cpu().tolist())

    accuracy = accuracy_score(all_targets, all_predictions)

    print(f"\n测试准确率: {accuracy:.4f}")
    print("\n分类报告:")
    print(classification_report(all_targets, all_predictions))

    return accuracy, all_predictions, all_targets

def plot_balanced_results(train_losses, val_accuracies, test_accuracy, all_predictions, all_targets, num_classes=10):
    """Show a 2x2 summary figure of training and test results.

    Panels: training loss per epoch, validation accuracy per epoch (with the
    test accuracy as a dashed reference line), the annotated confusion
    matrix, and per-class accuracy.

    Args:
        train_losses: Mean training loss for each epoch.
        val_accuracies: Validation accuracy for each epoch.
        test_accuracy: Final accuracy on the test set.
        all_predictions: Predicted class index for each test sample.
        all_targets: True class index for each test sample.
        num_classes: Number of classes; fixes the size of the confusion
            matrix and of the per-class bar chart.
    """
    # Flatten so both a flat list and an (N, 1) column of predictions work.
    targets = np.asarray(all_targets).ravel()
    predictions = np.asarray(all_predictions).ravel()

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))

    # Training loss curve.
    loss_ax = axes[0, 0]
    loss_ax.plot(train_losses, label='Training Loss')
    loss_ax.set_title('Training Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.grid(True)

    # Validation accuracy curve with the test accuracy as a reference line.
    acc_ax = axes[0, 1]
    acc_ax.plot(val_accuracies, label='Validation Accuracy', color='orange')
    acc_ax.axhline(y=test_accuracy, color='r', linestyle='--', label=f'Test Accuracy: {test_accuracy:.4f}')
    acc_ax.set_title('Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.legend()
    acc_ax.grid(True)

    # Confusion matrix. Pinning ``labels`` keeps it num_classes x num_classes
    # even when a class is missing from the test split, so the annotation
    # loop below cannot index out of range.
    cm_ax = axes[1, 0]
    cm = confusion_matrix(targets, predictions, labels=list(range(num_classes)))
    im = cm_ax.imshow(cm, interpolation='nearest', cmap='Blues')
    cm_ax.set_title('Confusion Matrix')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    plt.colorbar(im, ax=cm_ax)

    # Annotate each cell; use white text on dark cells for contrast.
    threshold = cm.max() / 2
    for i in range(num_classes):
        for j in range(num_classes):
            cm_ax.text(j, i, str(cm[i, j]),
                       ha="center", va="center",
                       color="white" if cm[i, j] > threshold else "black")

    # Per-class accuracy; classes with no test samples are reported as 0.
    class_accuracies = []
    for i in range(num_classes):
        class_mask = targets == i
        if class_mask.any():
            class_accuracies.append(float(np.mean(predictions[class_mask] == i)))
        else:
            class_accuracies.append(0)

    bar_ax = axes[1, 1]
    bar_ax.bar(range(len(class_accuracies)), class_accuracies)
    bar_ax.set_title('Class-wise Accuracy')
    bar_ax.set_xlabel('Class')
    bar_ax.set_ylabel('Accuracy')
    bar_ax.set_ylim(0, 1)

    # Print the value just above each bar.
    for i, v in enumerate(class_accuracies):
        bar_ax.text(i, v + 0.01, f'{v:.2f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

def visualize_predictions(model, test_loader, num_samples=10):
    """Display the first few test images with true and predicted labels.

    Only the first batch of ``test_loader`` is used. Titles are green for
    correct predictions and red for wrong ones, and include the softmax
    probability of the predicted class.

    Args:
        model: The trained network, expected to already be on ``device``.
        test_loader: Iterable of ``(images, labels)`` batches.
        num_samples: Maximum number of images to show; clamped to the size
            of the first batch.
    """
    model.eval()
    images, labels = next(iter(test_loader))
    images, labels = images.to(device), labels.to(device)

    # Never index past the end of the batch.
    count = min(num_samples, images.size(0))
    if count <= 0:
        return

    with torch.no_grad():
        outputs = model(images[:count])
        _, predicted = torch.max(outputs, 1)
        probabilities = F.softmax(outputs, dim=1)

    # Up to five images per row; the default of 10 samples gives a 2x5 grid.
    cols = min(5, count)
    rows = -(-count // cols)  # ceiling division
    fig, axes = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows), squeeze=False)
    axes = axes.ravel()

    for i in range(count):
        img = images[i].cpu().squeeze().numpy()
        axes[i].imshow(img, cmap='gray')
        correct = predicted[i].item() == labels[i].item()
        color = 'green' if correct else 'red'
        axes[i].set_title(f'True: {labels[i].item()}, Pred: {predicted[i].item()}\nProb: {probabilities[i][predicted[i]].item():.3f}', color=color)
        axes[i].axis('off')

    # Hide any unused cells in the last row.
    for ax in axes[count:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

def main():
    """Run the full pipeline: generate data, train, evaluate, plot, save.

    The synthetic dataset is split 70% / 15% / 15% into train, validation
    and test sets. The trained weights and some metadata are written to
    ``balanced_chinese_character_cnn.pth`` in the current directory.
    """
    # Experiment settings.
    img_size = 32      # image side length in pixels
    num_classes = 10   # number of character classes
    batch_size = 32
    num_epochs = 20

    print("=" * 50)
    print("汉字识别系统 - 80%准确率版本")
    print("=" * 50)

    # Build the synthetic dataset.
    X, y = create_balanced_chinese_characters(num_samples=1200, img_size=img_size, num_classes=num_classes)
    dataset = TensorDataset(X, y)

    # 70 / 15 / 15 split; the test set absorbs any rounding remainder.
    train_size = int(0.7 * len(dataset))
    val_size = int(0.15 * len(dataset))
    test_size = len(dataset) - train_size - val_size

    train_dataset, val_dataset, test_dataset = random_split(
        dataset, [train_size, val_size, test_size]
    )

    # Only the training loader shuffles.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    print(f"训练集大小: {len(train_dataset)}")
    print(f"验证集大小: {len(val_dataset)}")
    print(f"测试集大小: {len(test_dataset)}")

    # Build the model and report its size.
    model = BalancedChineseCharacterCNN(num_classes=num_classes)
    print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}")

    # Train.
    train_losses, val_accuracies = train_balanced_model(
        model, train_loader, val_loader, num_epochs=num_epochs
    )

    # Evaluate on the held-out test set.
    test_accuracy, predictions, targets = evaluate_model(model, test_loader)

    # Show sample predictions, then the summary figure.
    visualize_predictions(model, test_loader)
    plot_balanced_results(train_losses, val_accuracies, test_accuracy, predictions, targets, num_classes)

    # Save the checkpoint. The accuracy is stored as a plain Python float
    # because a NumPy scalar in the checkpoint can be rejected by torch.load
    # when it uses the restricted (weights_only) unpickler.
    torch.save({
        'model_state_dict': model.state_dict(),
        'num_classes': num_classes,
        'img_size': img_size,
        'test_accuracy': float(test_accuracy)
    }, 'balanced_chinese_character_cnn.pth')
    print("\n模型已保存为 'balanced_chinese_character_cnn.pth'")
    print(f"最终测试准确率: {test_accuracy:.4f}")

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

posted @ 2025-11-13 23:50 鳞* 阅读(0) 评论(0) 收藏举报

刷新页面返回顶部

fairytale-JJ

手写汉字识别

-- coding: utf-8 --

设置随机种子以确保结果可重现

检查是否有GPU可用

公告