Handwritten Chinese Character Recognition
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image, ImageDraw, ImageFont
Environment setup
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
device = torch.device('cpu')  # change to 'cuda' to use a GPU
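Rather than hard-coding the device, a common pattern is to select it at runtime so the same script runs with or without a GPU; a minimal sketch:

# Use CUDA when available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')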
Constant definitions
CHARS = ['一', '二', '三', '十', '人', '口', '手', '日', '月', '水']  # characters to recognize
TRAIN_NUM = 200   # training samples per character
TEST_NUM = 50     # test samples per character
IMG_SIZE = 64     # image size in pixels
DATA_SAVE_DIR = 'hanzi_data'  # dataset output directory
BATCH_SIZE = 32   # batch size
EPOCHS = 30       # training epochs
LEARNING_RATE = 0.005  # learning rate
class HanziDatasetGenerator:
    """Synthetic dataset generator for Chinese characters."""

    def __init__(self):
        # Initialize the fallback font (a system font is tried per image)
        self.font = ImageFont.load_default()
        print("Note: using the default font to render characters (a Chinese system font is used automatically when available)")

    def _generate_single_img(self, char):
        """Render a single character image."""
        # Create a white grayscale canvas
        img = Image.new('L', (IMG_SIZE, IMG_SIZE), color=255)
        draw = ImageDraw.Draw(img)
        # Per-character offsets so each glyph fits fully inside the canvas
        char_offsets = {
            '一': (5, 25), '二': (5, 15), '三': (5, 10),
            '十': (20, 15), '人': (10, 20), '口': (15, 15),
            '手': (5, 10), '日': (15, 15), '月': (10, 15), '水': (5, 10)
        }
        x, y = char_offsets[char]
        # Prefer a system Chinese font; fall back to the default font with thicker strokes
        try:
            font = ImageFont.truetype('simsun.ttc', size=40)  # try SimSun
            draw.text((x, y), char, font=font, fill=0, stroke_width=2)
        except OSError:
            # Default font: draw twice with a stroke to avoid overly thin strokes
            draw.text((x, y), char, font=self.font, fill=0, stroke_width=3)
            draw.text((x + 1, y), char, font=self.font, fill=0, stroke_width=2)
        # Apply a small random rotation for sample diversity
        rotation = random.randint(-10, 10)
        img = img.rotate(rotation, expand=False, fillcolor=255)
        return img
    def generate_dataset(self):
        """Generate the training and test sets."""
        # Remove any previous dataset
        if os.path.exists(DATA_SAVE_DIR):
            for root, dirs, files in os.walk(DATA_SAVE_DIR, topdown=False):
                for f in files:
                    os.remove(os.path.join(root, f))
                for d in dirs:
                    os.rmdir(os.path.join(root, d))
            os.rmdir(DATA_SAVE_DIR)
        # Create the directory structure
        for split in ['train', 'test']:
            for char in CHARS:
                os.makedirs(os.path.join(DATA_SAVE_DIR, split, char), exist_ok=True)
        # Generate the samples
        print("Generating dataset...")
        for char in CHARS:
            # Training samples
            for i in range(TRAIN_NUM):
                img = self._generate_single_img(char)
                img.save(os.path.join(DATA_SAVE_DIR, 'train', char, f'{i}.png'))
            # Test samples
            for i in range(TEST_NUM):
                img = self._generate_single_img(char)
                img.save(os.path.join(DATA_SAVE_DIR, 'test', char, f'{i}.png'))
        print(f"Dataset generated, saved at: {os.path.abspath(DATA_SAVE_DIR)}")
class HanziDataset(Dataset):
    """Dataset loader for the generated character images."""

    def __init__(self, split='train'):
        self.split = split
        self.data_dir = os.path.join(DATA_SAVE_DIR, split)
        self.char_list = CHARS
        self.char2idx = {c: i for i, c in enumerate(self.char_list)}  # character-to-index mapping
        self.images, self.labels = self._load_data()
        self.transform = transforms.ToTensor()  # PIL image to tensor

    def _load_data(self):
        """Collect image paths and their labels."""
        images = []
        labels = []
        for char in self.char_list:
            char_dir = os.path.join(self.data_dir, char)
            for img_name in os.listdir(char_dir):
                images.append(os.path.join(char_dir, img_name))
                labels.append(self.char2idx[char])
        return images, labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return one sample (image tensor, label)."""
        img = Image.open(self.images[idx]).convert('L')  # grayscale
        return self.transform(img), self.labels[idx]
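A quick shape check on the loaded dataset helps catch path or transform mistakes early; a minimal sketch, assuming generate_dataset() has already run:

# Inspect one sample: ToTensor yields a [1, 64, 64] float tensor in [0, 1]
ds = HanziDataset('train')
img, label = ds[0]
print(len(ds), img.shape, label)  # expected: 2000 torch.Size([1, 64, 64]) 0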
class FeatureCNN(nn.Module):
    """Convolutional network for character recognition."""

    def __init__(self, num_classes=10):
        super(FeatureCNN, self).__init__()
        # Feature extraction
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, padding=1),   # 1 input channel, 8 output channels
            nn.ReLU(),
            nn.MaxPool2d(2, 2),                          # 64x64 → 32x32
            nn.Conv2d(8, 16, kernel_size=3, padding=1),  # 8 input channels, 16 output channels
            nn.ReLU(),
            nn.MaxPool2d(2, 2)                           # 32x32 → 16x16
        )
        # Classification head
        self.classifier = nn.Linear(16 * 16 * 16, num_classes)  # 16 channels × 16 × 16 feature map

    def forward(self, x):
        """Forward pass."""
        x = self.features(x)
        x = x.view(-1, 16 * 16 * 16)  # flatten the feature map
        x = self.classifier(x)
        return x
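The 16 * 16 * 16 input size of the classifier has to match what the feature extractor produces (two 2×2 poolings take 64×64 down to 16×16, with 16 channels); a dummy forward pass verifies this:

# Sanity-check the flatten size with a fake one-image batch
model = FeatureCNN(num_classes=len(CHARS))
x = torch.zeros(1, 1, IMG_SIZE, IMG_SIZE)
print(model.features(x).shape)  # torch.Size([1, 16, 16, 16])
print(model(x).shape)           # torch.Size([1, 10])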
def main():
    # 1. Generate the dataset
    generator = HanziDatasetGenerator()
    generator.generate_dataset()
    # 2. Load the dataset
    train_dataset = HanziDataset('train')
    test_dataset = HanziDataset('test')
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    # 3. Initialize the model, loss function, and optimizer
    model = FeatureCNN(num_classes=len(CHARS)).to(device)
    criterion = nn.CrossEntropyLoss()  # cross-entropy loss for multi-class classification
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)  # Adam optimizer
    # 4. Train the model
    print("\nStarting training...")
    best_acc = 0.0  # best test accuracy so far
    for epoch in range(EPOCHS):
        model.train()  # training mode
        total_loss = 0.0
        # Iterate over training batches
        for imgs, labels in train_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            # Forward pass
            optimizer.zero_grad()  # clear accumulated gradients
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            # Backward pass and parameter update
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * imgs.size(0)  # accumulate the summed loss
        # Average training loss over the epoch
        avg_loss = total_loss / len(train_dataset)
        # Evaluate on the test set
        model.eval()  # evaluation mode
        correct = 0
        total = 0
        with torch.no_grad():  # no gradient tracking needed
            for imgs, labels in test_loader:
                imgs, labels = imgs.to(device), labels.to(device)
                outputs = model(imgs)
                _, preds = torch.max(outputs, 1)  # predicted class indices
                total += labels.size(0)
                correct += (preds == labels).sum().item()
        # Test accuracy
        acc = 100 * correct / total
        print(f"Epoch {epoch + 1:2d} | train loss: {avg_loss:.4f} | test accuracy: {acc:.2f}%")
        # Keep the best checkpoint
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), 'best_model.pth')
        # Early stopping once the accuracy target is met
        if acc >= 85:
            print(f"Accuracy target reached ({acc:.2f}%), stopping early")
            break
    # 5. Reload the best checkpoint
    model.load_state_dict(torch.load('best_model.pth'))
    print(f"\nTraining finished, best test accuracy: {best_acc:.2f}%")
    # 6. Interactive recognition
    while True:
        path = input("\nEnter an image path (q to quit): ")
        if path.lower() == 'q':
            break
        if not os.path.exists(path):
            print("Image path does not exist, please try again")
            continue
        try:
            # Preprocess: grayscale, resize to the training resolution
            img = Image.open(path).convert('L').resize((IMG_SIZE, IMG_SIZE))
            img_tensor = transforms.ToTensor()(img).unsqueeze(0).to(device)  # add a batch dimension
            # Predict
            with torch.no_grad():
                output = model(img_tensor)
                pred_idx = torch.argmax(output).item()
                pred_char = CHARS[pred_idx]
                confidence = torch.softmax(output, dim=1).max().item() * 100  # softmax confidence
            print(f"Prediction: {pred_char} | confidence: {confidence:.2f}%")
        except Exception as e:
            print(f"Recognition failed: {e}")
if __name__ == "__main__":
    main()
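Once best_model.pth has been saved, inference can be run in a later session without regenerating data or retraining; a minimal sketch, assuming the checkpoint file from a previous run exists:

# Restore the trained weights into a fresh model for inference only
model = FeatureCNN(num_classes=len(CHARS)).to(device)
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.eval()  # switch to evaluation mode before predicting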
