# Chinese character recognition
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import random
import os
import matplotlib.pyplot as plt
# Seed every random number generator the script uses so that runs are reproducible.
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)
# --------------------------
# 1. Utility function for generating Chinese character images
# --------------------------
# Candidate fonts that can render Chinese characters, tried in order;
# the first one that exists on disk and loads successfully is used.
_CJK_FONT_PATHS = (
    "/System/Library/Fonts/PingFang.ttc",  # macOS
    "C:/Windows/Fonts/simhei.ttf",  # Windows SimHei
    "C:/Windows/Fonts/simsun.ttc",  # Windows SimSun
    "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",  # Linux
)

# Fonts already loaded, keyed by point size, so the font file is not
# re-read from disk for every generated sample.
_FONT_CACHE = {}


def _load_font(font_size):
    """Return a font of the requested size, preferring a system CJK font."""
    if font_size not in _FONT_CACHE:
        font = None
        for path in _CJK_FONT_PATHS:
            if not os.path.exists(path):
                continue
            try:
                font = ImageFont.truetype(path, font_size)
            except OSError:
                # Unreadable or corrupt font file: fall through to the next candidate.
                continue
            break
        if font is None:
            # NOTE(review): Pillow's built-in font presumably lacks Chinese
            # glyphs, so images may be unusable without one of the fonts above.
            font = ImageFont.load_default()
        _FONT_CACHE[font_size] = font
    return _FONT_CACHE[font_size]


def generate_char_image(char, img_size=(64, 64), font_size=40, noise_level=0.05):
    """Render a single character as a grayscale image with optional noise.

    Args:
        char: The character (string) to draw.
        img_size: ``(width, height)`` of the image in pixels.
        font_size: Point size passed to the font loader.
        noise_level: Standard deviation of the additive Gaussian noise,
            expressed as a fraction of the 0-255 intensity range. Values
            ``<= 0`` disable the noise.

    Returns:
        A PIL image in mode ``'L'``: black glyph on a white background.
    """
    img = Image.new('L', img_size, 255)  # white background, grayscale
    draw = ImageDraw.Draw(img)
    font = _load_font(font_size)

    # textbbox reports where the ink lands when drawing at (0, 0). The left/top
    # values are usually non-zero, so they must be subtracted for the glyph
    # itself (not its drawing origin) to end up centred.
    left, top, right, bottom = draw.textbbox((0, 0), char, font=font)
    x = (img_size[0] - (right - left)) // 2 - left
    y = (img_size[1] - (bottom - top)) // 2 - top
    draw.text((x, y), char, font=font, fill=0)

    if noise_level > 0:
        img_np = np.array(img)
        # Shape the noise after the array (rows, cols), not after the PIL
        # (width, height) size, so non-square images work as well.
        noise = np.random.normal(0, noise_level * 255, img_np.shape).astype(np.int16)
        img_np = np.clip(img_np + noise, 0, 255).astype(np.uint8)
        img = Image.fromarray(img_np)
    return img
# --------------------------
# 2. Custom Chinese character dataset class
# --------------------------
class GeneratedChineseDataset(Dataset):
    """Dataset that synthesises character images on demand.

    Indices are laid out class by class: the first ``num_samples_per_char``
    indices belong to ``chars[0]``, the next ones to ``chars[1]`` and so on.
    Font size and noise level are drawn at random on every access, so the
    same index does not return the same image twice.
    """

    def __init__(self, chars, num_samples_per_char=1000, img_size=(64, 64), transform=None):
        """Store the generation settings.

        Args:
            chars: Sequence of characters; a character's position is its label.
            num_samples_per_char: Number of indices reserved for each character.
            img_size: ``(width, height)`` forwarded to ``generate_char_image``.
            transform: Optional callable applied to each generated image.
        """
        self.chars = chars
        self.num_samples = len(chars) * num_samples_per_char
        self.num_samples_per_char = num_samples_per_char
        self.img_size = img_size
        self.transform = transform
        self.label_map = {char: i for i, char in enumerate(chars)}

    def __len__(self):
        """Return the total number of indices (characters x samples per character)."""
        return self.num_samples

    def __getitem__(self, idx):
        """Generate one ``(image, label)`` pair for the given index."""
        char_idx = idx // self.num_samples_per_char
        char = self.chars[char_idx]

        # Randomise the rendering parameters so every sample looks different.
        font_size = random.randint(30, 45)
        noise_level = random.uniform(0.02, 0.15)

        image = generate_char_image(
            char=char,
            img_size=self.img_size,
            font_size=font_size,
            noise_level=noise_level,
        )
        if self.transform:
            image = self.transform(image)
        label = self.label_map[char]
        return image, label
# --------------------------
# 3. Configuration and initialisation
# --------------------------
# Create the directories that receive the saved model and the plots.
os.makedirs('models', exist_ok=True)
os.makedirs('plots', exist_ok=True)

# Characters to recognise; a character's position in the list is its class label.
target_chars = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
                '百', '千', '万', '上', '下', '左', '右', '人', '口', '手']
num_classes = len(target_chars)

# Image transform: convert to a tensor, then normalise with mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Datasets: both are generated on the fly by the same generator and differ
# only in the number of samples per character.
train_dataset = GeneratedChineseDataset(
    chars=target_chars,
    num_samples_per_char=800,
    img_size=(64, 64),
    transform=transform
)
test_dataset = GeneratedChineseDataset(
    chars=target_chars,
    num_samples_per_char=200,
    img_size=(64, 64),
    transform=transform
)

# Data loaders: training batches are shuffled, test batches keep index order.
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# --------------------------
# 4. Model definition
# --------------------------
class CharRecognitionCNN(nn.Module):
    """Small CNN classifier for single-channel 64x64 character images.

    Three conv/ReLU/max-pool stages halve the spatial size each time
    (64 -> 32 -> 16 -> 8), followed by a two-layer fully connected head.
    """

    def __init__(self, num_classes):
        """Build the layers.

        Args:
            num_classes: Number of output logits (one per character class).
        """
        super().__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # 64x64 -> 32x32
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # 32x32 -> 16x16
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)  # 16x16 -> 8x8
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(128 * 8 * 8, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: Image batch of shape ``(batch, 1, 64, 64)``; the first linear
                layer is sized for exactly this spatial resolution.
        """
        x = self.conv_layers(x)
        # Flatten per sample. Keeping the batch dimension fixed means a
        # wrong-sized input fails loudly in the linear layer instead of being
        # silently re-batched by view(-1, ...).
        x = x.view(x.size(0), -1)
        x = self.fc_layers(x)
        return x
# --------------------------
# 5. Training and test functions (recording metrics for visualisation)
# --------------------------
def train(model, train_loader, criterion, optimizer, epochs=10):
    """Train ``model`` and record the loss and accuracy of every epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Sized iterable of ``(data, target)`` batches.
        criterion: Loss function. The per-epoch average assumes it returns
            the mean loss over the batch (the default reduction of
            ``nn.CrossEntropyLoss``).
        optimizer: Optimizer that updates ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        ``(train_losses, train_accuracies)``: two lists with one entry per
        epoch, holding the per-sample average loss and the accuracy in percent.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    train_losses = []
    train_accuracies = []
    for epoch in range(epochs):
        model.train()
        window_loss = 0.0  # loss since the last progress print; reset every 100 batches
        epoch_loss_sum = 0.0  # sample-weighted loss over the whole epoch; never reset
        correct = 0
        total = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            batch_count = target.size(0)
            window_loss += batch_loss
            # Weight by batch size so a smaller final batch does not skew the average.
            epoch_loss_sum += batch_loss * batch_count

            # Training accuracy
            _, predicted = torch.max(outputs.detach(), 1)
            total += batch_count
            correct += (predicted == target).sum().item()

            # Progress report every 100 batches
            if batch_idx % 100 == 99:
                print(
                    f'Epoch [{epoch + 1}/{epochs}], Batch [{batch_idx + 1}/{len(train_loader)}], Loss: {window_loss / 100:.4f}')
                window_loss = 0.0

        if total == 0:
            raise ValueError('train_loader yielded no samples')

        # Average loss and accuracy for this epoch
        epoch_loss = epoch_loss_sum / total
        epoch_acc = 100.0 * correct / total
        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_acc)
        print(f'Epoch [{epoch + 1}/{epochs}] 完成 - 平均损失: {epoch_loss:.4f}, 训练准确率: {epoch_acc:.2f}%')
    return train_losses, train_accuracies
def test(model, test_loader, criterion, char_list):
    """Evaluate ``model`` and return metrics plus the last evaluated batch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        test_loader: Iterable of ``(data, target)`` batches.
        criterion: Loss function. The average assumes it returns the mean
            loss over the batch (the default reduction of
            ``nn.CrossEntropyLoss``).
        char_list: Unused; kept so existing callers keep working.

    Returns:
        ``(accuracy, test_loss, data, target, predicted)``: accuracy in
        percent, per-sample average loss, and the inputs, labels and
        predictions of the last batch.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    data = target = predicted = None
    with torch.no_grad():
        for data, target in test_loader:
            outputs = model(data)
            batch_count = target.size(0)
            # The criterion yields a batch mean; scale it back to a batch sum so
            # that dividing by the sample count gives a true per-sample average.
            loss_sum += criterion(outputs, target).item() * batch_count
            _, predicted = torch.max(outputs, 1)
            total += batch_count
            correct += (predicted == target).sum().item()

    if total == 0:
        raise ValueError('test_loader yielded no samples')

    # Metrics
    test_loss = loss_sum / total
    accuracy = 100.0 * correct / total
    print(f'\n测试集:平均损失: {test_loss:.4f}, 准确率: {accuracy:.2f}%')
    return accuracy, test_loss, data, target, predicted


# The name starts with "test"; this flag keeps pytest from collecting the
# helper as a test case if this module is ever star-imported into a test file.
test.__test__ = False
# --------------------------
# 6. Visualisation functions
# --------------------------
def plot_training_curves(train_losses, train_accuracies, test_accuracy, epochs):
    """Plot per-epoch training loss and accuracy side by side.

    The figure is written to ``plots/training_curves.png`` and then shown.

    Args:
        train_losses: One loss value per epoch.
        train_accuracies: One accuracy value (percent) per epoch.
        test_accuracy: Test accuracy (percent), drawn as a horizontal
            reference line on the accuracy panel.
        epochs: Number of epochs; the x axis runs from 1 to ``epochs``.
    """
    epoch_axis = range(1, epochs + 1)
    grid_style = {'linestyle': '--', 'alpha': 0.7}
    fig = plt.figure(figsize=(12, 5))

    # Left panel: loss curve
    loss_ax = fig.add_subplot(1, 2, 1)
    loss_ax.plot(epoch_axis, train_losses, marker='o', color='blue')
    loss_ax.set_title('训练损失曲线')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('损失值')
    loss_ax.grid(True, **grid_style)

    # Right panel: accuracy curve with the test accuracy as a reference line
    acc_ax = fig.add_subplot(1, 2, 2)
    acc_ax.plot(epoch_axis, train_accuracies, marker='o', color='orange', label='训练准确率')
    acc_ax.axhline(y=test_accuracy, color='red', linestyle='--', label=f'测试准确率: {test_accuracy:.2f}%')
    acc_ax.set_title('训练准确率曲线')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('准确率 (%)')
    acc_ax.legend()
    acc_ax.grid(True, **grid_style)

    plt.tight_layout()
    plt.savefig('plots/training_curves.png')
    plt.show()
def visualize_predictions(images, true_labels, pred_labels, char_list, num=6):
    """Show the first few images with their true and predicted characters.

    The figure is written to ``plots/predictions.png`` and then shown.

    Args:
        images: Batch of image tensors; channel 0 of each image is displayed.
            Assumes values were normalised with mean 0.5 / std 0.5.
        true_labels: Tensor of ground-truth class indices.
        pred_labels: Tensor of predicted class indices.
        char_list: Maps a class index to the character to display.
        num: Maximum number of images to show; fewer are shown when the
            batch is smaller.

    Raises:
        ValueError: If there is nothing to show.
    """
    count = min(num, len(images), len(true_labels), len(pred_labels))
    if count <= 0:
        raise ValueError('no predictions to visualize')

    # squeeze=False keeps `axes` two-dimensional even for a single panel, so
    # indexing works the same for count == 1.
    fig, axes = plt.subplots(1, count, figsize=(15, 3), squeeze=False)
    for i, ax in enumerate(axes[0]):
        # detach()/cpu() make this work for tensors that track gradients or
        # live on another device.
        img = images[i][0].detach().cpu().numpy()
        img = (img + 1) / 2  # undo the normalisation: [-1, 1] -> [0, 1]
        ax.imshow(img, cmap='gray')
        true_char = char_list[true_labels[i].item()]
        pred_char = char_list[pred_labels[i].item()]
        # Green title for a correct prediction, red for a wrong one.
        color = 'green' if true_char == pred_char else 'red'
        ax.set_title(f"真实: {true_char}\n预测: {pred_char}", color=color)
        ax.axis('off')
    plt.tight_layout()
    plt.savefig('plots/predictions.png')
    plt.show()
# --------------------------
# 7. Main program
# --------------------------
if __name__ == '__main__':
    # Build the model, loss function and optimizer.
    model = CharRecognitionCNN(num_classes=num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train the model.
    epochs = 8
    print(f"开始训练(识别汉字:{target_chars})...")
    train_losses, train_accuracies = train(model, train_loader, criterion, optimizer, epochs=epochs)

    # Evaluate the model; the returned images/labels/predictions come from
    # the last test batch.
    print("\n开始测试...")
    test_accuracy, test_loss, test_images, test_labels, test_preds = test(model, test_loader, criterion, target_chars)

    # Plot the training curves.
    plot_training_curves(train_losses, train_accuracies, test_accuracy, epochs)

    # Visualise a few predictions.
    visualize_predictions(test_images, test_labels, test_preds, target_chars, num=6)

    # Save the trained weights.
    torch.save(model.state_dict(), 'models/chinese_char_cnn.pth')
    print("\n模型已保存为 models/chinese_char_cnn.pth")
    print("可视化图像已保存到 plots 文件夹")
# 浙公网安备 33010602011771号 (footer residue from the source web page; not part of the program)