# 手写汉字识别 (handwritten Chinese character recognition)
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 13 23:49:06 2025
@author: cxy-fairytale
"""
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from PIL import Image, ImageDraw
import os
# Seed the PyTorch and NumPy RNGs so that runs are reproducible.
torch.manual_seed(42)
np.random.seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")
class BalancedChineseCharacterCNN(nn.Module):
    """Three-stage convolutional classifier for single-channel images.

    Each stage is Conv(3x3, padding=1) -> BatchNorm -> ReLU -> 2x2 max-pool.
    The feature map is then averaged down to a fixed 4x4 grid, so the
    fully-connected head does not depend on the exact input resolution.

    Args:
        num_classes: Number of output logits produced by the final layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Convolutional feature extractor: 1 -> 32 -> 64 -> 128 channels.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout1 = nn.Dropout(0.3)
        self.dropout2 = nn.Dropout(0.5)
        # Adaptive pooling fixes the spatial size at 4x4, which avoids
        # having to compute the flattened size from the input resolution.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((4, 4))
        # Classification head.
        self.fc1 = nn.Linear(128 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: Input batch with one channel, i.e. ``(batch, 1, H, W)``.
        """
        # Stage 1
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.dropout1(x)
        # Stage 2
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.dropout1(x)
        # Stage 3 (no dropout after the last convolutional stage)
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        # Collapse to a fixed 4x4 grid, then flatten per sample.
        x = self.adaptive_pool(x)
        x = torch.flatten(x, 1)
        # Fully-connected head
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        return self.fc2(x)
def _stroke_templates(size):
    """Return one list of drawing primitives per class for a size x size canvas.

    Each primitive is a ``(kind, coords, width)`` tuple where ``kind`` is
    ``"line"``, ``"box"`` or ``"dot"``; ``width`` is unused for dots.
    """
    q1, mid, q3 = size // 4, size // 2, 3 * size // 4
    t1, t2 = size // 3, 2 * size // 3

    def line(start, end, width):
        return ("line", [start, end], width)

    def dot(cx, cy):
        # Filled circle of radius 2 centred on (cx, cy).
        return ("dot", [(cx - 2, cy - 2), (cx + 2, cy + 2)], None)

    return [
        # 0: horizontal + vertical cross (like "十")
        [line((q1, mid), (q3, mid), 2), line((mid, q1), (mid, q3), 2)],
        # 1: single horizontal stroke (like "一")
        [line((q1, mid), (q3, mid), 3)],
        # 2: single vertical stroke (like "丨")
        [line((mid, q1), (mid, q3), 3)],
        # 3: box (like "口")
        [("box", [(q1, q1), (q3, q3)], 2)],
        # 4: horizontal stroke with a left vertical (like "厂")
        [line((q1, t1), (q3, t1), 2), line((q1, t1), (q1, q3), 2)],
        # 5: dot above a horizontal stroke (like "卜")
        [line((q1, mid), (q3, mid), 2), dot(mid, t1)],
        # 6: two horizontals crossing two verticals (like "井")
        [
            line((q1, t1), (q3, t1), 1),
            line((q1, t2), (q3, t2), 1),
            line((t1, q1), (t1, q3), 1),
            line((t2, q1), (t2, q3), 1),
        ],
        # 7: two diagonals meeting at the top (like "人")
        [line((q1, q3), (mid, q1), 2), line((mid, q1), (q3, q3), 2)],
        # 8: three stacked dots (like "氵")
        [dot(t1, q1), dot(t1, mid), dot(t1, q3)],
        # 9: left vertical plus right-hand strokes (like "休")
        [
            line((q1, q1), (q1, q3), 2),
            line((mid, q1), (q3, mid), 2),
            line((mid, q3), (q3, q3), 2),
        ],
    ]


def _draw_strokes(draw, strokes):
    """Render the primitives of one template onto a PIL ``ImageDraw`` object."""
    for kind, coords, width in strokes:
        if kind == "line":
            draw.line(coords, fill=255, width=width)
        elif kind == "box":
            draw.rectangle(coords, outline=255, width=width)
        else:
            draw.ellipse(coords, fill=255)


def create_balanced_chinese_characters(num_samples=1200, img_size=32, num_classes=10):
    """Create a synthetic dataset of simple Chinese-character-like strokes.

    Every sample picks a class uniformly at random, draws that class's stroke
    template on a black canvas, adds Gaussian noise, and with 20% probability
    overlays one faint random distractor line.

    Args:
        num_samples: Number of images to generate (may be 0).
        img_size: Side length of the square images, in pixels.
        num_classes: Number of classes to sample from; labels are drawn from
            ``[0, num_classes)`` and must not exceed the number of templates.

    Returns:
        A ``(images, labels)`` pair: a float32 tensor of shape
        ``(num_samples, 1, img_size, img_size)`` with values in ``[0, 1]``
        and an int64 tensor of shape ``(num_samples,)``.

    Raises:
        ValueError: If ``num_samples`` is negative or ``num_classes`` is not
            between 1 and the number of available templates.
    """
    if num_samples < 0:
        raise ValueError(f"num_samples must be non-negative, got {num_samples}")
    # The table depends only on the image size, so build it once.
    templates = _stroke_templates(img_size)
    if not 1 <= num_classes <= len(templates):
        raise ValueError(
            f"num_classes must be between 1 and {len(templates)}, got {num_classes}"
        )

    print("正在生成平衡的汉字数据...")
    # Pre-allocating keeps the output shape well-defined even for 0 samples.
    characters = np.empty((num_samples, 1, img_size, img_size), dtype=np.float64)
    labels = np.empty(num_samples, dtype=np.int64)

    for i in range(num_samples):
        img = Image.new('L', (img_size, img_size), color=0)
        label = np.random.randint(0, num_classes)
        _draw_strokes(ImageDraw.Draw(img), templates[label])

        # Scale to [0, 1] and add noise of random strength to control difficulty.
        img_array = np.array(img) / 255.0
        noise_intensity = np.random.uniform(0.05, 0.15)
        noise = np.random.normal(0, noise_intensity, (img_size, img_size))
        img_array = np.clip(img_array + noise, 0, 1)

        # 20% of the samples get one light-grey distractor line.
        if np.random.random() > 0.8:
            x1, y1 = np.random.randint(0, img_size, 2)
            x2, y2 = np.random.randint(0, img_size, 2)
            temp_img = Image.fromarray((img_array * 255).astype(np.uint8))
            ImageDraw.Draw(temp_img).line(
                [(int(x1), int(y1)), (int(x2), int(y2))], fill=64, width=1
            )
            img_array = np.array(temp_img) / 255.0

        characters[i, 0] = img_array
        labels[i] = label

    print(f"生成了 {len(characters)} 个样本,{num_classes} 个类别")
    return torch.from_numpy(characters).float(), torch.from_numpy(labels).long()
def train_balanced_model(model, train_loader, val_loader, num_epochs=20, learning_rate=0.001):
    """Train ``model`` and track loss and validation accuracy per epoch.

    Uses cross-entropy loss, Adam with weight decay 1e-4, and a StepLR
    schedule that halves the learning rate every 8 epochs. The model is
    moved to the module-level ``device`` before training.

    Args:
        model: Network to train (modified in place).
        train_loader: Iterable of ``(data, target)`` training batches.
        val_loader: Iterable of ``(data, target)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Initial learning rate for Adam.

    Returns:
        ``(train_losses, val_accuracies)``: two lists with one float per
        epoch (mean batch loss and validation accuracy). An epoch with no
        training batches or no validation samples records ``0.0``.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.5)

    train_losses = []
    val_accuracies = []
    print("开始训练模型...")

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        num_batches = 0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            loss = criterion(model(data), target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        # ---- validation pass ----
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                pred = model(data).argmax(dim=1)
                correct += (pred == target).sum().item()
                total += target.size(0)

        # Guard both averages so an empty loader does not divide by zero.
        val_accuracy = correct / total if total else 0.0
        avg_loss = running_loss / num_batches if num_batches else 0.0
        train_losses.append(avg_loss)
        val_accuracies.append(val_accuracy)

        # Advance the schedule, then report the learning rate now in effect.
        scheduler.step()
        current_lr = optimizer.param_groups[0]['lr']
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, LR: {current_lr:.6f}')

    return train_losses, val_accuracies
def evaluate_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print a classification report.

    Args:
        model: Trained network; put into eval mode by this function.
        test_loader: Iterable of ``(data, target)`` batches.

    Returns:
        ``(accuracy, all_predictions, all_targets)`` where the last two are
        flat lists of integer class ids, one entry per test sample.
    """
    model.eval()
    all_predictions = []
    all_targets = []
    with torch.no_grad():
        for data, target in test_loader:
            # argmax without keepdim gives one plain class id per sample,
            # so the returned lists are flat rather than lists of arrays.
            pred = model(data.to(device)).argmax(dim=1)
            all_predictions.extend(pred.cpu().tolist())
            all_targets.extend(target.tolist())

    accuracy = accuracy_score(all_targets, all_predictions)
    print(f"\n测试准确率: {accuracy:.4f}")
    print("\n分类报告:")
    print(classification_report(all_targets, all_predictions))
    return accuracy, all_predictions, all_targets
def plot_balanced_results(train_losses, val_accuracies, test_accuracy, all_predictions, all_targets, num_classes=10):
    """Show a 2x2 summary figure of training and test results.

    Panels: training loss per epoch, validation accuracy per epoch (with the
    test accuracy as a dashed reference line), the annotated confusion
    matrix, and per-class accuracy bars.

    Args:
        train_losses: Mean training loss per epoch.
        val_accuracies: Validation accuracy per epoch.
        test_accuracy: Final test accuracy.
        all_predictions: Predicted class ids (flat, or one id per row).
        all_targets: True class ids, same length as ``all_predictions``.
        num_classes: Number of classes; ids are expected in ``[0, num_classes)``.
    """
    # Accept both flat sequences and (n, 1)-shaped per-sample arrays.
    targets = np.asarray(all_targets).reshape(-1)
    predictions = np.asarray(all_predictions).reshape(-1)

    _, axes = plt.subplots(2, 2, figsize=(12, 8))
    loss_ax, acc_ax = axes[0]
    cm_ax, class_ax = axes[1]

    # Training loss curve
    loss_ax.plot(train_losses, label='Training Loss')
    loss_ax.set_title('Training Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.grid(True)

    # Validation accuracy curve with the test accuracy as reference
    acc_ax.plot(val_accuracies, label='Validation Accuracy', color='orange')
    acc_ax.axhline(y=test_accuracy, color='r', linestyle='--', label=f'Test Accuracy: {test_accuracy:.4f}')
    acc_ax.set_title('Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.legend()
    acc_ax.grid(True)

    # Confusion matrix. Passing labels explicitly pins the matrix to
    # num_classes x num_classes even if a class never occurs in the data,
    # which the annotation loop below relies on.
    cm = confusion_matrix(targets, predictions, labels=list(range(num_classes)))
    im = cm_ax.imshow(cm, interpolation='nearest', cmap='Blues')
    cm_ax.set_title('Confusion Matrix')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    plt.colorbar(im, ax=cm_ax)

    # Write each count into its cell; use white text on dark cells.
    half_max = cm.max() / 2
    for i in range(num_classes):
        for j in range(num_classes):
            cm_ax.text(j, i, str(cm[i, j]),
                       ha="center", va="center",
                       color="white" if cm[i, j] > half_max else "black")

    # Per-class accuracy: share of class-i samples predicted as class i
    # (0 for classes with no samples).
    class_accuracies = []
    for i in range(num_classes):
        class_mask = targets == i
        if class_mask.any():
            class_accuracies.append(float(np.mean(predictions[class_mask] == i)))
        else:
            class_accuracies.append(0)

    class_ax.bar(range(len(class_accuracies)), class_accuracies)
    class_ax.set_title('Class-wise Accuracy')
    class_ax.set_xlabel('Class')
    class_ax.set_ylabel('Accuracy')
    class_ax.set_ylim(0, 1)
    # Label each bar with its value.
    for i, v in enumerate(class_accuracies):
        class_ax.text(i, v + 0.01, f'{v:.2f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()
def visualize_predictions(model, test_loader, num_samples=10):
    """Show predictions for the first samples of the first test batch.

    Each image is titled with its true label, predicted label and the
    softmax probability of the prediction; the title is green when the
    prediction is correct and red otherwise.

    Args:
        model: Trained network; put into eval mode by this function.
        test_loader: Iterable of ``(images, labels)`` batches; only the first
            batch is used.
        num_samples: Maximum number of images to show. Fewer are shown when
            the first batch is smaller. Nothing is shown if the loader is
            empty or ``num_samples`` is not positive.
    """
    model.eval()
    first_batch = next(iter(test_loader), None)
    if first_batch is None:
        return
    images, labels = first_batch

    # Never index past the end of the batch.
    count = min(num_samples, len(images))
    if count <= 0:
        return
    images = images[:count].to(device)
    labels = labels[:count].to(device)

    with torch.no_grad():
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        probabilities = F.softmax(outputs, dim=1)

    # Five images per row; the default of 10 samples gives a 2x5 grid.
    cols = 5
    rows = -(-count // cols)
    _, axes = plt.subplots(rows, cols, figsize=(15, 3 * rows), squeeze=False)
    axes = axes.ravel()

    for i in range(count):
        img = images[i].cpu().squeeze().numpy()
        axes[i].imshow(img, cmap='gray')
        correct = predicted[i].item() == labels[i].item()
        color = 'green' if correct else 'red'
        axes[i].set_title(f'True: {labels[i].item()}, Pred: {predicted[i].item()}\nProb: {probabilities[i][predicted[i]].item():.3f}', color=color)
        axes[i].axis('off')

    # Hide grid cells that have no image.
    for ax in axes[count:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()
def main():
    """Run the full pipeline: generate data, train, evaluate, plot, save.

    The synthetic dataset is split 70% / 15% / 15% into training, validation
    and test sets. After training, the model weights and run metadata are
    written to ``balanced_chinese_character_cnn.pth`` in the working directory.
    """
    # Run configuration
    img_size = 32      # image side length in pixels
    num_classes = 10   # number of character classes
    batch_size = 32
    num_epochs = 20
    checkpoint_path = 'balanced_chinese_character_cnn.pth'

    print("=" * 50)
    print("汉字识别系统 - 80%准确率版本")
    print("=" * 50)

    # Build the synthetic dataset.
    X, y = create_balanced_chinese_characters(num_samples=1200, img_size=img_size, num_classes=num_classes)
    dataset = TensorDataset(X, y)

    # 70 / 15 / 15 split; the test set takes whatever rounding leaves over.
    train_size = int(0.7 * len(dataset))
    val_size = int(0.15 * len(dataset))
    test_size = len(dataset) - train_size - val_size
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, [train_size, val_size, test_size]
    )

    # Only the training data is shuffled.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    print(f"训练集大小: {len(train_dataset)}")
    print(f"验证集大小: {len(val_dataset)}")
    print(f"测试集大小: {len(test_dataset)}")

    model = BalancedChineseCharacterCNN(num_classes=num_classes)
    print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}")

    train_losses, val_accuracies = train_balanced_model(
        model, train_loader, val_loader, num_epochs=num_epochs
    )

    test_accuracy, predictions, targets = evaluate_model(model, test_loader)

    # Show sample predictions first, then the summary figure.
    visualize_predictions(model, test_loader)
    plot_balanced_results(train_losses, val_accuracies, test_accuracy, predictions, targets, num_classes)

    # Save the weights together with what is needed to rebuild the model.
    torch.save({
        'model_state_dict': model.state_dict(),
        'num_classes': num_classes,
        'img_size': img_size,
        'test_accuracy': test_accuracy
    }, checkpoint_path)

    print(f"\n模型已保存为 '{checkpoint_path}'")
    print(f"最终测试准确率: {test_accuracy:.4f}")
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

# 浙公网安备 33010602011771号  (web page footer picked up when the code was copied; not part of the program)