# PyTorch GPU performance test script

import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
import time
import numpy as np
import os

# Seed the PyTorch and NumPy random number generators so runs are reproducible
torch.manual_seed(42)
np.random.seed(42)


# A simple CNN model used as the benchmark workload
class SimpleCNN(nn.Module):
    """Small three-stage convolutional classifier.

    Each stage is a 3x3 convolution (padding 1, so the spatial size is kept),
    a ReLU and a 2x2 max-pool that halves height and width.  The first linear
    layer expects ``256 * 4 * 4`` features, so the network only accepts
    inputs of shape ``(N, 3, 32, 32)`` (32 / 2 / 2 / 2 == 4).  The output has
    shape ``(N, 10)`` and contains raw, un-normalised class scores.
    """

    def __init__(self):
        super().__init__()
        # Layers with parameters are created in a fixed order so that weight
        # initialisation consumes the seeded RNG identically on every run.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(256 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, 10)
        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()

    def forward(self, x):
        """Return class scores of shape ``(N, 10)`` for a ``(N, 3, 32, 32)`` batch."""
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x


def train_model(device, use_amp=False, batch_size=256, epochs=5):
    """Train a fresh SimpleCNN on random data and return the mean epoch time.

    Args:
        device: ``torch.device`` (or device string) to train on.
        use_amp: When true, run the forward pass under CUDA autocast and
            scale the loss with ``GradScaler``.
        batch_size: Mini-batch size.  The synthetic dataset always contains
            ``batch_size * 10`` samples, so every epoch runs 10 batches.
        epochs: Number of timed passes over the dataset.

    Returns:
        The mean wall-clock duration of one epoch, in seconds.
    """
    # Random tensors stand in for a real dataset; only throughput matters here.
    sample_count = batch_size * 10
    train_data = torch.randn(sample_count, 3, 32, 32)
    train_labels = torch.randint(0, 10, (sample_count,))
    train_dataset = torch.utils.data.TensorDataset(train_data, train_labels)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)

    model = SimpleCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scaler = GradScaler(enabled=use_amp)

    # CUDA kernels are launched asynchronously, so the host clock must only be
    # read after the device has finished all queued work.
    on_cuda = torch.device(device).type == "cuda"

    times = []
    for epoch in range(epochs):
        if on_cuda:
            torch.cuda.synchronize(device)
        # perf_counter is monotonic and higher-resolution than time.time().
        start_time = time.perf_counter()
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # Mixed-precision forward pass (a no-op context when use_amp is False)
            with autocast(enabled=use_amp):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            # With enabled=False the scaler is a pass-through: scale() returns
            # the loss unchanged, step() calls optimizer.step() and update()
            # does nothing, so one code path serves both precisions.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        if on_cuda:
            torch.cuda.synchronize(device)
        epoch_time = time.perf_counter() - start_time
        times.append(epoch_time)
        print(f"设备: {device}, 混合精度: {use_amp}, 轮次 {epoch + 1}/{epochs}, 耗时: {epoch_time:.2f}秒")

    return np.mean(times)


def print_gpu_info():
    """Print the name, memory size and CUDA/cuDNN versions of GPU 0.

    Only the device with index 0 is queried.  When CUDA is unavailable a
    single notice is printed instead.
    """
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        # total_memory is reported in bytes; convert to GiB for display.
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024 ** 3
        print(f"检测到GPU: {gpu_name}")
        print(f"GPU内存: {gpu_memory:.2f} GB")
        print(f"CUDA版本: {torch.version.cuda}")
        print(f"cuDNN版本: {torch.backends.cudnn.version()}")
    else:
        print("未检测到GPU,将只在CPU上运行测试")


def main():
    """Benchmark SimpleCNN training on the CPU and, when available, the GPU.

    Prints device information, runs one short GPU warm-up epoch, then times
    CPU, GPU FP32 and GPU mixed-precision training and prints the mean epoch
    times together with the resulting speed-up ratios.
    """
    # Every timed run uses the same batch size, and therefore the same
    # batch_size * 10 synthetic samples per epoch.  Timing the CPU on a
    # smaller workload than the GPU would make the speed-up ratios meaningless.
    bench_batch_size = 256
    bench_epochs = 3

    print("===== PyTorch RTX 4070 性能测试 =====")
    print_gpu_info()

    has_gpu = torch.cuda.is_available()

    # Warm up the GPU so one-time start-up costs are not charged to the
    # timed runs below; the result of this run is discarded.
    if has_gpu:
        print("\n=== GPU预热 ===")
        train_model(torch.device("cuda"), use_amp=False, batch_size=32, epochs=1)
        torch.cuda.empty_cache()

    print("\n=== CPU 基准测试 ===")
    cpu_time = train_model(
        torch.device("cpu"), use_amp=False, batch_size=bench_batch_size, epochs=bench_epochs
    )

    if not has_gpu:
        print("\n===== 性能结果 =====")
        print(f"CPU 平均每轮耗时: {cpu_time:.2f}秒")
        print("未检测到GPU,无法进行GPU性能对比")
        return

    print("\n=== GPU FP32 测试 ===")
    gpu_fp32_time = train_model(
        torch.device("cuda"), use_amp=False, batch_size=bench_batch_size, epochs=bench_epochs
    )

    print("\n=== GPU FP16 (混合精度) 测试 ===")
    gpu_fp16_time = train_model(
        torch.device("cuda"), use_amp=True, batch_size=bench_batch_size, epochs=bench_epochs
    )

    # Speed-up ratios: values above 1 mean the denominator configuration is faster.
    fp32_speedup = cpu_time / gpu_fp32_time
    fp16_speedup = cpu_time / gpu_fp16_time
    mixed_vs_full = gpu_fp32_time / gpu_fp16_time

    print("\n===== 性能对比 =====")
    print(f"CPU 平均每轮耗时: {cpu_time:.2f}秒")
    print(f"GPU FP32 平均每轮耗时: {gpu_fp32_time:.2f}秒 ({fp32_speedup:.2f}x CPU速度)")
    print(
        f"GPU FP16 平均每轮耗时: {gpu_fp16_time:.2f}秒 ({fp16_speedup:.2f}x CPU速度, {mixed_vs_full:.2f}x FP32速度)")

    # Point the user at a platform-appropriate tool for watching GPU utilisation.
    if os.name == 'nt':  # Windows
        print("\n提示: 您可以通过任务管理器查看GPU利用率和显存使用情况")
    elif os.name == 'posix':  # Linux and other POSIX systems
        print("\n提示: 您可以使用以下命令监控GPU:")
        print(" - nvidia-smi (实时GPU状态)")
        print(" - watch -n 1 nvidia-smi (每秒更新一次)")


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# Source: blog post by m516606428, posted 2025-06-18 01:38 (views: 180, comments: 0)