Complete Tutorial: Python Training Camp, Day 45

DAY 45: An Introduction to Using TensorBoard

Knowledge points review:

  1. The history and underlying principles of TensorBoard
  2. Common TensorBoard operations (see the minimal sketch right after this list)
  3. Hands-on TensorBoard on CIFAR: MLP and CNN models
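
Before the full training script, here is a minimal, self-contained sketch of those common operations. It is illustrative only: the runs/demo directory and the toy/ tag names are placeholders, not part of the lesson's code.

import torch
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/demo")  # event files are written under runs/demo

for step in range(100):
    # add_scalar: one point per step on a named curve
    writer.add_scalar("toy/loss", 1.0 / (step + 1), step)

# add_histogram: the distribution of a tensor, trackable over time
writer.add_histogram("toy/weights", torch.randn(1000), global_step=0)

# add_image: a CxHxW image tensor with values in [0, 1]
writer.add_image("toy/noise", torch.rand(3, 32, 32), global_step=0)

writer.close()  # flush and close the event files
# Then inspect with: tensorboard --logdir=runs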

The resulting dashboards are shown below, and they are great for padding out the pages of a group-meeting report:

Assignment: fine-tune resnet18 on cifar10 (freeze the pretrained backbone first, then unfreeze it) and use tensorboard to monitor the whole training process.

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import os

# Chinese font support for matplotlib (only needed if you keep Chinese labels)
plt.rcParams["font.family"] = ["SimHei"]
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

# Use the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 1. Data preprocessing (augmentation for the training set, normalization only for the test set)
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# 2. Load the CIFAR-10 dataset
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=train_transform
)

test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    transform=test_transform
)

# Class names, used later when logging misclassified samples
classes = train_dataset.classes

# 3. Create the data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 4. Define the ResNet18 model
def create_resnet18(pretrained=True, num_classes=10):
    # torchvision >= 0.13 takes a weights argument instead of the deprecated pretrained flag
    weights = models.ResNet18_Weights.DEFAULT if pretrained else None
    model = models.resnet18(weights=weights)

    # Replace the final fully connected layer
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)

    return model.to(device)

# 5. Helper to freeze/unfreeze model layers
# This lets transfer learning keep the pretrained feature extractor (the convolutional
# layers) fixed while training only the newly added classification head (the fc layer).
def freeze_model(model, freeze=True):
    """Freeze or unfreeze the model's convolutional parameters."""
    # Freeze/unfreeze every parameter except those of the fc layer
    for name, param in model.named_parameters():
        if 'fc' not in name:  # skip parameters whose name contains "fc", i.e. the fully connected head
            param.requires_grad = not freeze  # requires_grad controls whether a parameter receives gradients and updates

    # Report the freezing status
    frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)  # parameters with requires_grad=False
    total_params = sum(p.numel() for p in model.parameters())

    if freeze:
        print(f"Froze the convolutional layers ({frozen_params}/{total_params} parameters)")
    else:
        print(f"Unfroze all layers ({total_params}/{total_params} parameters trainable)")

    return model
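# Side note (an addition, not from the original lesson): main() below builds the
# optimizer over model.parameters() once, before any freezing happens. On a recent
# PyTorch this still behaves correctly, because optimizer.zero_grad() sets gradients
# to None by default and Adam skips any parameter whose .grad is None. To sidestep
# the question entirely, one could instead hand the optimizer only the trainable
# subset, e.g.:
#   optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)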
# 6. Training function (with TensorBoard logging integrated)
def train_with_freeze_schedule(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs, freeze_epochs=5):
    """Freeze the convolutional layers for the first freeze_epochs epochs, then unfreeze all layers."""
    # ======================== TensorBoard core setup ========================
    # A log directory must be chosen before TensorBoard can be used
    log_dir = "runs/cifar10_resnet18_exp"  # where the event files go
    if os.path.exists(log_dir):  # if that directory already exists,
        version = 1
        while os.path.exists(f"{log_dir}_v{version}"):  # keep bumping the version number
            version += 1
        log_dir = f"{log_dir}_v{version}"  # until a fresh, versioned name is found
    writer = SummaryWriter(log_dir)  # initialize the SummaryWriter
    print("Starting ResNet18 training...")
    print(f"TensorBoard log directory: {log_dir}")  # first run: cifar10_resnet18_exp, second run: cifar10_resnet18_exp_v1, ...
    print("After training, run: tensorboard --logdir=runs to view the dashboards")

    # (Optional) log the model graph: push one real batch through the model so TensorBoard can trace it
    dataiter = iter(train_loader)
    images, labels = next(dataiter)
    images = images.to(device)
    writer.add_graph(model, images)  # write the model structure to TensorBoard

    # (Optional) log a batch of training images; train_transform has already run, so these are the augmented inputs
    img_grid = torchvision.utils.make_grid(images[:8].cpu())  # first 8 images
    writer.add_image('Augmented training images', img_grid, global_step=0)

    global_step = 0  # global step counter, the x-axis for batch-level scalars

    # Histories of the epoch-level metrics
    train_loss_history, test_loss_history = [], []
    train_acc_history, test_acc_history = [], []

    # Freeze the convolutional layers at the start
    if freeze_epochs > 0:
        model = freeze_model(model, freeze=True)

    for epoch in range(epochs):
        # Unfreeze control: after the scheduled number of epochs, unfreeze every layer
        if epoch == freeze_epochs:
            model = freeze_model(model, freeze=False)
            # Optionally retune the optimizer after unfreezing
            optimizer.param_groups[0]['lr'] = 1e-4  # lower the learning rate to limit overfitting

        model.train()  # training mode
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # Track loss and accuracy
            iter_loss = loss.item()
            running_loss += iter_loss
            _, predicted = output.max(1)
            total_train += target.size(0)
            correct_train += predicted.eq(target).sum().item()

            # ======================== TensorBoard batch-level scalars ========================
            # Log each batch's loss, accuracy, and learning rate
            batch_acc = 100. * correct_train / total_train
            writer.add_scalar('Train/Batch Loss', iter_loss, global_step)
            writer.add_scalar('Train/Batch Accuracy', batch_acc, global_step)
            writer.add_scalar('Train/Learning Rate', optimizer.param_groups[0]['lr'], global_step)

            # Every 200 batches, log parameter histograms (optional; somewhat expensive)
            if (batch_idx + 1) % 200 == 0:
                for name, param in model.named_parameters():
                    writer.add_histogram(f'Weights/{name}', param, global_step)
                    if param.grad is not None:
                        writer.add_histogram(f'Gradients/{name}', param.grad, global_step)

            global_step += 1  # advance the global step
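        # Note on x-axes (an added remark, not part of the original lesson):
        # the batch-level scalars above use global_step, which keeps growing
        # across epochs, while the epoch-level scalars below use the epoch
        # index. Keeping them under separate tags avoids mixing the two scales
        # on one chart.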
        # Epoch-level training metrics
        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_acc = 100. * correct_train / total_train

        # ======================== TensorBoard epoch-level scalars ========================
        writer.add_scalar('Train/Epoch Loss', epoch_train_loss, epoch)
        writer.add_scalar('Train/Epoch Accuracy', epoch_train_acc, epoch)

        # Evaluation phase
        model.eval()
        correct_test = 0
        total_test = 0
        test_loss = 0.0
        wrong_images = []  # misclassified samples (for visualization)
        wrong_labels = []
        wrong_preds = []

        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += criterion(output, target).item()
                _, predicted = output.max(1)
                total_test += target.size(0)
                correct_test += predicted.eq(target).sum().item()

                # Collect misclassified samples (for visualization)
                wrong_mask = (predicted != target)
                if wrong_mask.sum() > 0:
                    wrong_batch_images = data[wrong_mask][:8].cpu()  # keep at most 8 per batch
                    wrong_batch_labels = target[wrong_mask][:8].cpu()
                    wrong_batch_preds = predicted[wrong_mask][:8].cpu()
                    wrong_images.extend(wrong_batch_images)
                    wrong_labels.extend(wrong_batch_labels)
                    wrong_preds.extend(wrong_batch_preds)

        # Epoch-level test metrics
        epoch_test_loss = test_loss / len(test_loader)
        epoch_test_acc = 100. * correct_test / total_test

        # ======================== TensorBoard test-set records ========================
        writer.add_scalar('Test/Epoch Loss', epoch_test_loss, epoch)
        writer.add_scalar('Test/Epoch Accuracy', epoch_test_acc, epoch)

        # (Optional) visualize misclassified samples, capped at 8 so the grid stays readable
        if wrong_images:
            wrong_img_grid = torchvision.utils.make_grid(wrong_images[:8])
            writer.add_image('Misclassified samples', wrong_img_grid, epoch)
            # Also log the true/predicted labels as text (optional)
            wrong_text = [f"true: {classes[wl]}, predicted: {classes[wp]}"
                          for wl, wp in zip(wrong_labels[:8], wrong_preds[:8])]
            writer.add_text('Misclassified labels', '\n'.join(wrong_text), epoch)

        # Record the history
        train_loss_history.append(epoch_train_loss)
        test_loss_history.append(epoch_test_loss)
        train_acc_history.append(epoch_train_acc)
        test_acc_history.append(epoch_test_acc)

        # Step the learning-rate scheduler
        if scheduler is not None:
            scheduler.step(epoch_test_loss)

        # Print the epoch summary
        print(f"Epoch {epoch+1} done | train loss: {epoch_train_loss:.4f} "
              f"| train acc: {epoch_train_acc:.2f}% | test acc: {epoch_test_acc:.2f}%")

    # Close the TensorBoard writer
    writer.close()
    return epoch_test_acc  # final test accuracy

# Main entry point: train the model
def main():
    # Hyperparameters
    epochs = 40  # total training epochs
    freeze_epochs = 5  # epochs during which the convolutional layers stay frozen
    learning_rate = 1e-3  # initial learning rate
    weight_decay = 1e-4  # weight decay

    # Create the ResNet18 model (loading pretrained weights)
    model = create_resnet18(pretrained=True, num_classes=10)

    # Optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    # Learning-rate scheduler (the deprecated verbose flag is omitted)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    # Start training (convolutional layers frozen for the first 5 epochs, then unfrozen)
    final_accuracy = train_with_freeze_schedule(
        model=model,
        train_loader=train_loader,
        test_loader=test_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        epochs=epochs,
        freeze_epochs=freeze_epochs
    )

    print(f"Training finished! Final test accuracy: {final_accuracy:.2f}%")

    # # Save the model
    # torch.save(model.state_dict(), 'resnet18_cifar10_finetuned.pth')
    # print("Model saved to: resnet18_cifar10_finetuned.pth")

if __name__ == "__main__":
    main()
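
TensorBoard reads the event files directly from the log directory, so the dashboards can be watched while training is still running:

tensorboard --logdir=runs --port=6006

The --port flag is optional (6006 is the default). Each run under runs/ (cifar10_resnet18_exp, cifar10_resnet18_exp_v1, ...) appears as its own color-coded curve, which is also how the MLP and CNN baselines from the lesson can be compared with this fine-tuned ResNet18 on a single chart.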
