Efficient Deep Learning Training with NVIDIA GPUs
Example Program
"""
PyTorch神经网络训练框架 - GPU加速与高级正则化
功能概述:
本程序提供了一个完整的神经网络训练管道,专为利用NVIDIA GPU进行高效深度学习训练而设计。程序实现了从数据准备、模型训练到评估预测的全流程,并集成了多种先进的正则化技术和训练优化策略。
主要特点:
1. GPU加速训练:自动检测并利用CUDA设备,支持混合精度训练,大幅提升训练速度
2. 高级正则化:结合Dropout、批归一化(BatchNorm)和权重衰减,有效防止过拟合
3. 智能训练调度:使用学习率动态调整和早停机制,优化训练过程
4. 全面可视化:实时监控训练损失、验证损失和学习率变化,提供四种分析图表
5. 模型评估:自动计算预测统计指标,评估模型泛化能力和过拟合程度
使用方法:
1. 确保已安装PyTorch和CUDA工具包(可选,用于GPU加速)
2. 直接运行程序,自动检测并使用可用硬件资源
3. 程序会自动生成模拟数据,训练神经网络并输出结果
4. 训练完成后,查看生成的图表和分析结果,模型将保存为.pth文件
评价:
本程序适合深度学习初学者和研究者使用,提供了工业级的训练框架和最佳实践。
通过合理的默认参数配置,即使不调整超参数也能获得良好结果。
程序结构清晰,注释完整,易于修改和扩展以适应特定任务需求。
硬件要求:
- 支持CUDA的NVIDIA GPU(可选,推荐用于最佳性能)
- 或标准CPU(兼容但速度较慢)
输出结果:
- 训练过程可视化图表
- 模型预测结果和统计信息
- 最佳模型和最终模型文件(.pth)
- 训练配置和结果的JSON文件
"""
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import matplotlib.pyplot as plt
import numpy as np
import time
import json
# Use an English font to avoid CJK rendering issues
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = True
# A neural network model with moderate regularization
class RegularizedModel(nn.Module):
def __init__(self, input_size=10, hidden_size=256, output_size=1):
super(RegularizedModel, self).__init__()
self.fc1 = nn.Linear(input_size, hidden_size)
self.bn1 = nn.BatchNorm1d(hidden_size)
self.fc2 = nn.Linear(hidden_size, hidden_size//2)
self.bn2 = nn.BatchNorm1d(hidden_size//2)
self.fc3 = nn.Linear(hidden_size//2, hidden_size//4)
self.bn3 = nn.BatchNorm1d(hidden_size//4)
self.fc4 = nn.Linear(hidden_size//4, output_size)
self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)  # moderate dropout rate
def forward(self, x):
x = self.relu(self.bn1(self.fc1(x)))
x = self.dropout(x)
x = self.relu(self.bn2(self.fc2(x)))
x = self.dropout(x)
x = self.relu(self.bn3(self.fc3(x)))
x = self.fc4(x)
return x
# Check whether CUDA is available
cuda_available = torch.cuda.is_available()
print(f"CUDA Available: {cuda_available}")
if cuda_available:
print(f"GPU Device: {torch.cuda.get_device_name(0)}")
print(f"CUDA Version: {torch.version.cuda}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
# Generate a larger synthetic dataset
x = torch.randn(15000, 10)
y = torch.randn(15000, 1)
# Split the dataset into training and validation sets
dataset_size = len(x)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size
train_dataset, val_dataset = random_split(
list(zip(x, y)), [train_size, val_size]
)
# Unpack the splits back into tensors
train_x = torch.stack([item[0] for item in train_dataset])
train_y = torch.stack([item[1] for item in train_dataset])
val_x = torch.stack([item[0] for item in val_dataset])
val_y = torch.stack([item[1] for item in val_dataset])
# Create data loaders
train_loader = DataLoader(
list(zip(train_x, train_y)), batch_size=128, shuffle=True
)
val_loader = DataLoader(
list(zip(val_x, val_y)), batch_size=128, shuffle=False
)
# Create the model, loss, and optimizer
model = RegularizedModel()
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)  # light weight decay
# Learning-rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=15  # increased patience
)
# Select the device; the model (and later each batch) is moved to it
device = torch.device('cuda' if cuda_available else 'cpu')
model = model.to(device)
if cuda_available:
    print("Using GPU acceleration")
# Mixed-precision training; the scaler is a no-op when CUDA is unavailable
scaler = torch.amp.GradScaler('cuda', enabled=cuda_available)
# Training loop
train_losses = []
val_losses = []
learning_rates = []
start_time = time.time()
best_val_loss = float('inf')
best_model_state = None
patience_counter = 0
patience = 30  # early-stopping patience
for epoch in range(300):
    # Training phase
    model.train()
    epoch_train_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        # Automatic mixed precision (disabled on CPU)
        with torch.amp.autocast(device.type, enabled=cuda_available):
            output = model(batch_x)
            loss = criterion(output, batch_y)
        # Backward pass with gradient scaling
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        epoch_train_loss += loss.item()
avg_train_loss = epoch_train_loss / len(train_loader)
train_losses.append(avg_train_loss)
    # Validation phase
    model.eval()
    epoch_val_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            with torch.amp.autocast(device.type, enabled=cuda_available):
                output = model(batch_x)
                loss = criterion(output, batch_y)
            epoch_val_loss += loss.item()
avg_val_loss = epoch_val_loss / len(val_loader)
val_losses.append(avg_val_loss)
    # Update the learning rate
scheduler.step(avg_val_loss)
current_lr = optimizer.param_groups[0]['lr']
learning_rates.append(current_lr)
    # Save the best model; clone the tensors so later training updates don't overwrite the snapshot
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        torch.save(model.state_dict(), 'best_model.pth')
        patience_counter = 0
    else:
        patience_counter += 1
    # Early-stopping check
if patience_counter >= patience:
print(f"Early stopping triggered at epoch {epoch+1}")
break
if (epoch + 1) % 50 == 0:
print(f'Epoch [{epoch+1}/300], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, LR: {current_lr:.6f}')
training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")
print(f"Best validation loss: {best_val_loss:.4f}")
# Save the final model
torch.save(model.state_dict(), 'final_model.pth')
print("Final model saved as final_model.pth")
# Visualize the training process
plt.figure(figsize=(15, 10))
# Plot training and validation loss curves
plt.subplot(2, 2, 1)
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
# Plot the learning-rate schedule
plt.subplot(2, 2, 2)
plt.plot(learning_rates)
plt.title('Learning Rate Schedule')
plt.xlabel('Epochs')
plt.ylabel('Learning Rate')
plt.grid(True)
plt.yscale('log')
# Plot the last 50 loss values
plt.subplot(2, 2, 3)
plt.plot(train_losses[-50:], label='Training Loss')
plt.plot(val_losses[-50:], label='Validation Loss')
plt.title('Last 50 Epochs Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
# Plot loss versus learning rate
plt.subplot(2, 2, 4)
plt.scatter(learning_rates, train_losses, alpha=0.5, label='Training Loss')
plt.scatter(learning_rates, val_losses, alpha=0.5, label='Validation Loss')
plt.title('Loss vs Learning Rate')
plt.xlabel('Learning Rate')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.xscale('log')
plt.tight_layout()
# Add a runtime-info box to the figure (GPU name only when CUDA is present)
info_text = f"CUDA Available: {cuda_available}"
if cuda_available:
    info_text += f"\nGPU: {torch.cuda.get_device_name(0)}"
info_text += f"\nTraining Time: {training_time:.2f} seconds"
info_text += f"\nFinal Training Loss: {train_losses[-1]:.4f}"
info_text += f"\nFinal Validation Loss: {val_losses[-1]:.4f}"
info_text += f"\nBest Validation Loss: {best_val_loss:.4f}"
plt.figtext(0.5, 0.01, info_text, ha="center", fontsize=10,
bbox={"facecolor":"orange", "alpha":0.2, "pad":5})
plt.show()
# Load the best model
if best_model_state is not None:
model.load_state_dict(best_model_state)
# Test model performance
model.eval()
with torch.no_grad():
    test_x = torch.randn(20, 10).to(device)
    # Mixed-precision inference
    with torch.amp.autocast(device.type, enabled=cuda_available):
        predictions = model(test_x)
print(f"\nModel Predictions (Best Model):")
for i, pred in enumerate(predictions):
print(f"Sample {i+1}: {pred.item():.4f}")
# Compute prediction statistics
predictions_cpu = predictions.cpu().numpy()
print("\nPrediction Statistics:")
print(f"Mean: {np.mean(predictions_cpu):.4f}")
print(f"Std: {np.std(predictions_cpu):.4f}")
print(f"Min: {np.min(predictions_cpu):.4f}")
print(f"Max: {np.max(predictions_cpu):.4f}")
# Create a test set
test_x = torch.randn(1000, 10)
test_y = torch.randn(1000, 1)
test_dataset = TensorDataset(test_x, test_y)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)
# Evaluate the model on the held-out test set
def evaluate_model(model, test_loader, criterion, device=device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            with torch.amp.autocast(device.type, enabled=device.type == 'cuda'):
                output = model(batch_x)
                loss = criterion(output, batch_y)
            total_loss += loss.item()
    return total_loss / len(test_loader)
test_loss = evaluate_model(model, test_loader, criterion)
print(f"\nTest Loss: {test_loss:.4f}")
# Compare training, validation, and test loss
print(f"Training Loss: {train_losses[-1]:.4f}")
print(f"Validation Loss: {val_losses[-1]:.4f}")
print(f"Test Loss: {test_loss:.4f}")
print(f"Overfitting Degree: {abs(test_loss - train_losses[-1]):.4f}")
# Save model architecture and training info
model_info = {
'input_size': 10,
'hidden_size': 256,
'output_size': 1,
'num_layers': 4,
'activation': 'ReLU',
'dropout_rate': 0.3,
'optimizer': 'AdamW',
'learning_rate': 0.001,
'weight_decay': 1e-5,
'scheduler': 'ReduceLROnPlateau',
'batch_size': 128,
'num_epochs': epoch + 1,
'final_train_loss': train_losses[-1],
'final_val_loss': val_losses[-1],
'best_val_loss': best_val_loss,
'test_loss': test_loss,
'overfitting_degree': abs(test_loss - train_losses[-1])
}
# Write the model info to JSON
with open('model_info.json', 'w') as f:
json.dump(model_info, f, indent=4)
print("Model info saved as model_info.json")
Core Steps
1. Device Detection and Setup
import torch
# Detect GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
# With multiple GPUs, a specific device can be selected
if torch.cuda.device_count() > 1:
    print(f"Found {torch.cuda.device_count()} GPUs")
    device = torch.device('cuda:0')  # use the first GPU
2. Moving the Model to the GPU
import torch.nn as nn
# Create the model (YourModelClass stands in for your own nn.Module)
model = YourModelClass()
# Move the model to the GPU
model = model.to(device)
# With multiple GPUs, data parallelism is available
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
    print("Using multi-GPU parallel training")
3. Moving Data to the GPU
# For a single sample
data = data.to(device)
target = target.to(device)
# Inside the data-loading loop
for batch_idx, (data, target) in enumerate(train_loader):
    data, target = data.to(device), target.to(device)
    # training code...
Efficient Training Techniques
1. Mixed-Precision Training
import torch
# Create the gradient scaler (torch.amp supersedes the deprecated torch.cuda.amp)
scaler = torch.amp.GradScaler('cuda')
# Training loop
for data, target in train_loader:
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    # Forward pass under mixed precision
    with torch.amp.autocast('cuda'):
        output = model(data)
        loss = criterion(output, target)
    # Backward pass with gradient scaling
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
2. Optimized Data Loading
from torch.utils.data import DataLoader
# Use multi-process data loading
train_loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,   # four worker processes
    pin_memory=True  # page-locked memory speeds up CPU-to-GPU transfers
)
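pin_memory=True pays off mainly when paired with asynchronous host-to-device copies. A minimal sketch of that pairing (the dataset here is a synthetic placeholder):

import torch
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = TensorDataset(torch.randn(1024, 10), torch.randn(1024, 1))
loader = DataLoader(dataset, batch_size=64, num_workers=4, pin_memory=True)

for data, target in loader:
    # non_blocking=True lets the copy overlap with GPU work;
    # it is only asynchronous when the source tensor is pinned
    data = data.to(device, non_blocking=True)
    target = target.to(device, non_blocking=True)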
3. Fused CUDA Optimizers
# Choose an optimizer suited to the GPU
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Or use the fused Adam implementation (requires CUDA)
try:
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, fused=True)
except (RuntimeError, TypeError):  # fused Adam unsupported on this device or PyTorch version
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
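With fused=True, the parameter update runs as fused CUDA kernels across all parameters instead of one kernel launch per parameter tensor, reducing launch overhead; it requires the model parameters to be floating-point CUDA tensors.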
Complete Training Example
import torch
import torch.nn as nn
import torch.optim as optim
import time
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=25):
    # Device setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    # Mixed-precision training (automatically disabled on CPU)
    scaler = torch.amp.GradScaler('cuda', enabled=device.type == 'cuda')
    # Training history
    history = {'train_loss': [], 'val_loss': []}
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        start_time = time.time()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            # Mixed-precision forward pass
            with torch.amp.autocast(device.type, enabled=device.type == 'cuda'):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            # Gradient scaling and backward pass
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            running_loss += loss.item() * inputs.size(0)
        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)
        # Average losses for the epoch
        epoch_train_loss = running_loss / len(train_loader.dataset)
        epoch_val_loss = val_loss / len(val_loader.dataset)
        history['train_loss'].append(epoch_train_loss)
        history['val_loss'].append(epoch_val_loss)
epoch_time = time.time() - start_time
print(f'Epoch {epoch+1}/{num_epochs} | '
f'Train Loss: {epoch_train_loss:.4f} | '
f'Val Loss: {epoch_val_loss:.4f} | '
f'Time: {epoch_time:.2f}s')
return history, model
Performance Monitoring and Optimization
1. GPU Memory Monitoring
# Monitor GPU memory usage
def print_gpu_memory():
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            allocated = torch.cuda.memory_allocated(i) / 1024**3
            cached = torch.cuda.memory_reserved(i) / 1024**3
            print(f'GPU {i}: allocated {allocated:.2f} GB, reserved {cached:.2f} GB')

# Release cached GPU memory back to the driver
torch.cuda.empty_cache()
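A quick, hypothetical demonstration of the difference between allocated and reserved memory (only meaningful on a CUDA machine):

if torch.cuda.is_available():
    print_gpu_memory()                          # baseline
    x = torch.randn(4096, 4096, device='cuda')  # ~64 MB of float32
    print_gpu_memory()                          # allocated and reserved rise
    del x
    print_gpu_memory()                          # allocated drops; reserved stays cached
    torch.cuda.empty_cache()
    print_gpu_memory()                          # reserved memory returned to the driver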
2. Benchmarking
def benchmark_training(model, dataloader, criterion, optimizer, num_iterations=100):
    # CUDA events give accurate GPU timings, so this benchmark requires a GPU
    assert torch.cuda.is_available(), "benchmark_training requires a CUDA device"
    device = torch.device('cuda')
    model = model.to(device)
    model.train()
    starter = torch.cuda.Event(enable_timing=True)
    ender = torch.cuda.Event(enable_timing=True)
    timings = []
    for i, (inputs, labels) in enumerate(dataloader):
        if i >= num_iterations:
            break
        inputs, labels = inputs.to(device), labels.to(device)
        starter.record()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        ender.record()
        torch.cuda.synchronize()
        timings.append(starter.elapsed_time(ender))
    print(f'Average step time: {sum(timings)/len(timings):.2f} ms')
    return timings
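A hypothetical invocation, with a toy model and random data standing in for a real workload:

from torch.utils.data import DataLoader, TensorDataset

model = torch.nn.Sequential(torch.nn.Linear(10, 64), torch.nn.ReLU(), torch.nn.Linear(64, 1))
dataset = TensorDataset(torch.randn(12800, 10), torch.randn(12800, 1))
loader = DataLoader(dataset, batch_size=128)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
timings = benchmark_training(model, loader, criterion, optimizer, num_iterations=50)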
Troubleshooting Common Problems
1. Running Out of GPU Memory
# Reduce the batch size
train_loader = DataLoader(dataset, batch_size=32)  # reduced from 64 to 32
# Use gradient accumulation
accumulation_steps = 4
optimizer.zero_grad()
for i, (inputs, labels) in enumerate(train_loader):
    inputs, labels = inputs.to(device), labels.to(device)
    with torch.amp.autocast(device.type):
        outputs = model(inputs)
        loss = criterion(outputs, labels) / accumulation_steps
    scaler.scale(loss).backward()
    if (i + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
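Dividing the loss by accumulation_steps keeps gradient magnitudes comparable to a single large batch: with batch_size=32 and accumulation_steps=4, the optimizer effectively steps on a batch of 32 × 4 = 128 samples while only ever holding 32 in memory at once.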
2. Multi-GPU Training
# Multi-GPU training with DistributedDataParallel
import os
import torch.distributed as dist
import torch.multiprocessing as mp

def setup(rank, world_size):
    # Rendezvous settings for single-machine training (example values)
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def cleanup():
    dist.destroy_process_group()

def train_distributed(rank, world_size):
    setup(rank, world_size)
    # Create the model and move it to this process's GPU
    model = YourModel().to(rank)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[rank])
    # training code...
    cleanup()

# Launch one process per GPU
if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    mp.spawn(train_distributed, args=(world_size,), nprocs=world_size, join=True)
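One piece the sketch above leaves out is sharding the data: each process should see a different slice of the dataset, which DistributedSampler provides. A minimal sketch, assuming dataset and num_epochs come from your training setup:

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
loader = DataLoader(dataset, batch_size=64, sampler=sampler)
for epoch in range(num_epochs):
    sampler.set_epoch(epoch)  # gives each epoch a different shuffle
    for inputs, labels in loader:
        pass  # training step as usual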
Summary of Best Practices
- Always check GPU availability with torch.cuda.is_available()
- Use .to(device) to manage model and data placement consistently
- Enable mixed-precision training with autocast and a GradScaler
- Optimize data loading with num_workers and pin_memory=True
- Monitor GPU memory regularly to catch leaks before they crash training
- Use the largest batch size that fits in memory
- Clear the cache periodically with torch.cuda.empty_cache(); the sketch below combines these practices into one training step
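A minimal, self-contained sketch tying the checklist together; the model and data here are synthetic placeholders, not part of the original program:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
use_amp = device.type == 'cuda'

# Synthetic stand-ins for a real model and dataset
model = nn.Sequential(nn.Linear(10, 64), nn.ReLU(), nn.Linear(64, 1)).to(device)
dataset = TensorDataset(torch.randn(4096, 10), torch.randn(4096, 1))
loader = DataLoader(dataset, batch_size=128, shuffle=True,
                    num_workers=4, pin_memory=use_amp)

criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scaler = torch.amp.GradScaler('cuda', enabled=use_amp)

model.train()
for inputs, labels in loader:
    inputs = inputs.to(device, non_blocking=True)
    labels = labels.to(device, non_blocking=True)
    optimizer.zero_grad()
    with torch.amp.autocast(device.type, enabled=use_amp):
        loss = criterion(model(inputs), labels)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()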