Complete Tutorial: Python Training Camp --- Day 45
DAY 45: An Introduction to TensorBoard
Knowledge review:
- The history and underlying principles of TensorBoard
- Common TensorBoard operations (a minimal sketch follows this list)
- TensorBoard in practice on CIFAR: MLP and CNN models
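Before diving into the assignment, the common workflow boils down to four steps: create a `SummaryWriter` pointed at a log directory, call `add_*` methods during training, close the writer, and launch the TensorBoard server. Here is a minimal sketch; the `runs/demo` directory and the placeholder loss values are illustrative, not taken from the lesson code:

```python
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/demo")  # event files are written under this directory

for step in range(100):
    placeholder_loss = 1.0 / (step + 1)  # stand-in for a real training loss
    writer.add_scalar("Train/Loss", placeholder_loss, step)  # tag, value, global step

writer.close()  # flush pending events to disk
```

Then run `tensorboard --logdir=runs` and open the printed URL (by default http://localhost:6006) in a browser.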
The resulting TensorBoard dashboards make for polished visuals, well suited for padding out group-meeting report slides.
Assignment: fine-tune ResNet18 on CIFAR-10 and use TensorBoard to monitor the training process.
A complete reference implementation follows, split into two parts: setup and helper functions first, then the training loop and entry point.

```python
import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms, models

# Font setup so matplotlib can render Chinese labels
plt.rcParams["font.family"] = ["SimHei"]
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

# Use the GPU when available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# CIFAR-10 class names, used later to label misclassified samples
classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

# 1. Data preprocessing (augmentation for training, normalization only for testing)
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# 2. Load the CIFAR-10 dataset
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True,
                                 transform=train_transform)
test_dataset = datasets.CIFAR10(root='./data', train=False,
                                transform=test_transform)

# 3. Create the data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 4. Define the ResNet18 model
def create_resnet18(pretrained=True, num_classes=10):
    # Note: torchvision >= 0.13 prefers weights=models.ResNet18_Weights.DEFAULT
    # over the deprecated pretrained= flag
    model = models.resnet18(pretrained=pretrained)
    # Replace the final fully connected layer to output num_classes logits
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)
    return model.to(device)

# 5. Freeze/unfreeze helper.
# This design lets transfer learning keep the pretrained feature extractor
# (the conv layers) and train only the newly added classifier head (the fc layer).
def freeze_model(model, freeze=True):
    """Freeze or unfreeze the model's convolutional parameters."""
    # Toggle every parameter except those of the fc layer
    for name, param in model.named_parameters():
        if 'fc' not in name:  # skip parameters whose name contains "fc" (the classifier head)
            param.requires_grad = not freeze  # requires_grad controls whether a parameter receives gradients

    # Report the freeze status
    frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    if freeze:
        print(f"Froze conv-layer parameters ({frozen_params}/{total_params} parameters)")
    else:
        print(f"Unfroze all parameters ({total_params}/{total_params} trainable)")
    return model
```
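One design note on the freeze strategy: the script builds its optimizer over `model.parameters()` and relies on `requires_grad=False` to stop updates, which works because parameters that never receive gradients are effectively skipped. An alternative, sketched below, is to hand the optimizer only the currently trainable parameters. The `trainable_params` helper is an illustrative name, not part of the original script:

```python
import torch.optim as optim

def trainable_params(model):
    # Collect only the parameters that will receive gradients
    # (e.g. just the fc head while the conv layers are frozen)
    return [p for p in model.parameters() if p.requires_grad]

# Example use, after freeze_model(model, freeze=True):
# optimizer = optim.Adam(trainable_params(model), lr=1e-3, weight_decay=1e-4)
# Caveat: if layers are unfrozen later, the optimizer must be rebuilt (or given
# a new param group), since the frozen conv parameters were never registered with it.
```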
Continuing the same script: the training function integrates TensorBoard logging at the batch, epoch, and test level, and `main()` wires everything together.

```python
# 6. Training function (with TensorBoard logging integrated)
def train_with_freeze_schedule(model, train_loader, test_loader, criterion,
                               optimizer, scheduler, device, epochs, freeze_epochs=5):
    """Freeze the conv layers for the first freeze_epochs epochs, then unfreeze everything."""
    # ======================== TensorBoard core setup ========================
    # A log directory must be chosen before TensorBoard can be used
    log_dir = "runs/cifar10_resnet18_exp"  # base log directory
    if os.path.exists(log_dir):  # if the base directory is already taken,
        version = 1
        while os.path.exists(f"{log_dir}_v{version}"):  # find the next free version number
            version += 1
        log_dir = f"{log_dir}_v{version}"  # write this run to a fresh versioned directory
    writer = SummaryWriter(log_dir)  # initialize the SummaryWriter

    print("Starting ResNet18 training...")
    # First run: cifar10_resnet18_exp; second run: cifar10_resnet18_exp_v1; and so on
    print(f"TensorBoard log directory: {log_dir}")
    print("After training, run: tensorboard --logdir=runs to view the visualizations")

    # (Optional) log the model graph: run one real batch through the forward
    # pass so TensorBoard can trace the computation graph
    dataiter = iter(train_loader)
    images, labels = next(dataiter)
    images = images.to(device)
    writer.add_graph(model, images)

    # (Optional) log a batch of training images to inspect the augmentation pipeline
    img_grid = torchvision.utils.make_grid(images[:8].cpu())  # first 8 images
    writer.add_image('Training images (augmented)', img_grid, global_step=0)

    global_step = 0  # global step counter for batch-level scalars

    # Epoch-level metric history (referenced at the end of each epoch)
    train_loss_history, test_loss_history = [], []
    train_acc_history, test_acc_history = [], []

    # Initially freeze the conv layers
    if freeze_epochs > 0:
        model = freeze_model(model, freeze=True)

    for epoch in range(epochs):
        # Unfreeze all layers once the scheduled epoch is reached
        if epoch == freeze_epochs:
            model = freeze_model(model, freeze=False)
            # (Optional) adjust the optimizer after unfreezing
            optimizer.param_groups[0]['lr'] = 1e-4  # lower the learning rate to curb overfitting

        model.train()  # training mode
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # Accumulate loss and accuracy statistics
            iter_loss = loss.item()  # scalar loss for this batch
            running_loss += iter_loss
            _, predicted = output.max(1)
            total_train += target.size(0)
            correct_train += predicted.eq(target).sum().item()

            # ======================== TensorBoard scalar logging ========================
            # Log per-batch loss, running accuracy, and learning rate
            batch_acc = 100. * correct_train / total_train
            writer.add_scalar('Train/Batch Loss', iter_loss, global_step)
            writer.add_scalar('Train/Batch Accuracy', batch_acc, global_step)
            writer.add_scalar('Train/Learning Rate', optimizer.param_groups[0]['lr'], global_step)

            # Every 200 batches, log parameter histograms (optional, somewhat slow)
            if (batch_idx + 1) % 200 == 0:
                for name, param in model.named_parameters():
                    writer.add_histogram(f'Weights/{name}', param, global_step)
                    if param.grad is not None:
                        writer.add_histogram(f'Gradients/{name}', param.grad, global_step)

            global_step += 1  # advance the global step

        # Epoch-level training metrics
        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_acc = 100. * correct_train / total_train

        # ======================== TensorBoard epoch-level scalars ========================
        writer.add_scalar('Train/Epoch Loss', epoch_train_loss, epoch)
        writer.add_scalar('Train/Epoch Accuracy', epoch_train_acc, epoch)

        # Evaluation phase
        model.eval()
        correct_test = 0
        total_test = 0
        test_loss = 0.0
        wrong_images = []  # misclassified samples, kept for visualization
        wrong_labels = []
        wrong_preds = []

        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += criterion(output, target).item()
                _, predicted = output.max(1)
                total_test += target.size(0)
                correct_test += predicted.eq(target).sum().item()

                # Collect misclassified samples for visualization
                wrong_mask = (predicted != target)
                if wrong_mask.sum() > 0:
                    wrong_images.extend(data[wrong_mask][:8].cpu())    # at most 8 per batch
                    wrong_labels.extend(target[wrong_mask][:8].cpu())
                    wrong_preds.extend(predicted[wrong_mask][:8].cpu())

        # Epoch-level test metrics
        epoch_test_loss = test_loss / len(test_loader)
        epoch_test_acc = 100. * correct_test / total_test

        # ======================== TensorBoard test-set logging ========================
        writer.add_scalar('Test/Epoch Loss', epoch_test_loss, epoch)
        writer.add_scalar('Test/Epoch Accuracy', epoch_test_acc, epoch)

        # (Optional) visualize misclassified samples
        if wrong_images:
            wrong_img_grid = torchvision.utils.make_grid(wrong_images)
            writer.add_image('Misclassified samples', wrong_img_grid, epoch)
            # Log the corresponding labels as text (optional)
            wrong_text = [f"true: {classes[wl]}, predicted: {classes[wp]}"
                          for wl, wp in zip(wrong_labels, wrong_preds)]
            writer.add_text('Misclassified labels', '\n'.join(wrong_text), epoch)

        # Record the history
        train_loss_history.append(epoch_train_loss)
        test_loss_history.append(epoch_test_loss)
        train_acc_history.append(epoch_train_acc)
        test_acc_history.append(epoch_test_acc)

        # Step the learning-rate scheduler on the test loss
        if scheduler is not None:
            scheduler.step(epoch_test_loss)

        # Print the epoch summary
        print(f"Epoch {epoch+1} done | train loss: {epoch_train_loss:.4f} "
              f"| train acc: {epoch_train_acc:.2f}% | test acc: {epoch_test_acc:.2f}%")

    # Close the TensorBoard writer
    writer.close()
    return epoch_test_acc  # final test accuracy


# Main function: train the model
def main():
    # Hyperparameters
    epochs = 40           # total training epochs
    freeze_epochs = 5     # epochs during which the conv layers stay frozen
    learning_rate = 1e-3  # initial learning rate
    weight_decay = 1e-4   # weight decay

    # Create the ResNet18 model (loading pretrained weights)
    model = create_resnet18(pretrained=True, num_classes=10)

    # Optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    # Learning-rate scheduler: halve the LR when the test loss plateaus
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    # Start training (conv layers frozen for the first 5 epochs, then unfrozen)
    final_accuracy = train_with_freeze_schedule(
        model=model,
        train_loader=train_loader,
        test_loader=test_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        epochs=epochs,
        freeze_epochs=freeze_epochs
    )
    print(f"Training finished! Final test accuracy: {final_accuracy:.2f}%")

    # # Save the model
    # torch.save(model.state_dict(), 'resnet18_cifar10_finetuned.pth')
    # print("Model saved to: resnet18_cifar10_finetuned.pth")


if __name__ == "__main__":
    main()
```
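Because each run writes to its own versioned directory under `runs/`, a single `tensorboard --logdir=runs` serves all experiments for side-by-side comparison. If you sweep hyperparameters (say, different `freeze_epochs` or learning rates), one optional addition is `add_hparams`, which populates TensorBoard's HPARAMS tab. A hedged sketch with made-up values, not part of the original script:

```python
from torch.utils.tensorboard import SummaryWriter

# Log this run's hyperparameters alongside its final metric so multiple runs
# can be ranked and filtered in the HPARAMS tab. All values here are placeholders.
with SummaryWriter("runs/hparams_demo") as writer:
    writer.add_hparams(
        {"lr": 1e-3, "freeze_epochs": 5, "batch_size": 64},  # hyperparameters of this run
        {"hparam/final_test_acc": 85.0},                     # final metric (placeholder number)
    )
```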