分布式训练神经网络
DistributedDataParallel
PyTorch 中的 DistributedDataParallel 和 Elastic 是用于支持分布式训练的两个相关组件,常结合使用。
- DistributedDataParallel 用于在多个 GPU 上实现数据并行的分布式训练。其在每个 GPU 上运行模型的一个副本,各副本独立进行前向和反向传播。梯度在反向传播过程中通过 all-reduce 在各进程间同步(取平均),从而确保所有模型副本的参数保持一致。
- Elastic 用于实现分布式训练的弹性调度和故障恢复。其允许在分布式训练过程中动态地加减节点,并能够容忍节点故障后的自动恢复。
模型定义
import os

import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class SimpleModel(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear -> ReLU -> Linear.

    The last layer is a plain ``nn.Linear``, so ``forward`` returns raw,
    un-normalized outputs of width ``output_dim``.
    """

    def __init__(self, input_dim: int = 128, hidden_dim: int = 256, output_dim: int = 10):
        super().__init__()
        # Layers are created in forward order and kept in a single Sequential.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        out = self.net(x)
        return out
初始化分布式进程组
import torch.distributed as dist
# Process topology, read from the environment with single-process fallbacks.
world_size = int(os.environ.get("WORLD_SIZE", 1))  # total number of processes
rank = int(os.environ.get("RANK", 0))  # global rank of this process
local_rank = int(os.environ.get("LOCAL_RANK", 0))  # rank of this process on its node

# Make the GPU matching local_rank the default CUDA device, and keep a
# torch.device handle for it: the later snippets move the model and the
# batches with `.to(device)`.
th.cuda.set_device(local_rank)
device = th.device(f"cuda:{local_rank}")

# Initialize the distributed process group
dist.init_process_group(
    backend='nccl',
    init_method='env://',
    world_size=world_size,
    rank=rank,
    device_id=device,
)
数据加载
使用 DistributedSampler 将数据集根据 GPU 数划分成若干子集。
from torch.utils.data import TensorDataset, DistributedSampler, DataLoader
def create_dummy_dataset(num_samples = 1000, input_dim = 128, output_dim = 10):
    """Build a synthetic classification dataset from random tensors.

    Features come from ``th.randn`` with shape ``(num_samples, input_dim)``;
    labels come from ``th.randint`` over ``[0, output_dim)`` with shape
    ``(num_samples,)``. Both are wrapped in a ``TensorDataset``.
    """
    features = th.randn(num_samples, input_dim)
    labels = th.randint(low=0, high=output_dim, size=(num_samples,))
    return TensorDataset(features, labels)
# Synthetic dataset shared by every process
dataset = create_dummy_dataset(num_samples=1000, input_dim=128, output_dim=10)

# Sampler: splits the dataset across the `world_size` processes; this process
# takes the share belonging to `rank`
sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True, seed=42)

# Data loader: draws its batches through the sampler above
dataloader = DataLoader(dataset, batch_size=32, sampler=sampler, num_workers=0)
创建模型
使用 DistributedDataParallel 创建同步模型。
from torch.nn.parallel import DistributedDataParallel as DDP
# Build the model, move it to this process's GPU, and wrap it in DDP
model = DDP(
    SimpleModel(input_dim=128, hidden_dim=256, output_dim=10).to(device),
    device_ids=[local_rank],
)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
训练
def train_epoch(model, dataloader, optimizer, criterion, device, rank):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module to train; it is switched to training mode first.
        dataloader: Iterable yielding ``(X, y)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable taking ``(logits, y)`` and returning the loss.
        device: Device each batch is moved to before the forward pass.
        rank: Rank of this process; only rank 0 prints progress.

    Returns:
        The mean of the per-batch loss values, or ``0.0`` when the
        dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch_idx, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(X)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        # Convert to a Python float once; reused for the total and the log line.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1
        # Log on rank 0 only
        if rank == 0 and batch_idx % 10 == 0:
            print(f" Batch {batch_idx}, Loss: {batch_loss:.4f}")
    # An empty dataloader would otherwise divide by zero.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches
# Training loop
epochs = 3
for epoch in range(epochs):
    # Hand the epoch number to the sampler so each epoch shuffles differently
    sampler.set_epoch(epoch)

    if rank == 0:
        print(f"\nEpoch {epoch + 1}/{epochs}")

    avg_loss = train_epoch(model, dataloader, optimizer, criterion, device, rank)

    # Sum the per-process average losses across all processes, then divide by
    # the process count to get the global average
    loss_tensor = th.tensor([avg_loss], device=device)
    dist.all_reduce(loss_tensor, op=dist.ReduceOp.SUM)
    global_avg_loss = loss_tensor.item() / world_size

    if rank == 0:
        print(f" Avg Loss: {global_avg_loss:.4f}")

if rank == 0:
    print("\nTraining completed!")
    print("=" * 60)

# Tear down the distributed process group before exiting
dist.destroy_process_group()
Elastic 分布式调度
可以使用 torchrun 命令启动分布式训练:
# Launch main.py with torchrun on a single node (--nnodes 1), using the c10d
# rendezvous backend at localhost:0; the per-node process count is "auto".
torchrun \
--nnodes 1 \
--nproc-per-node auto \
--rdzv-id 0 \
--rdzv-backend c10d \
--rdzv-endpoint localhost:0 \
main.py
- --nnodes:节点数
- --nproc-per-node:当前节点上的进程数
- --rdzv-id:作业号
- --rdzv-backend:rendezvous(节点会合)后端
- --rdzv-endpoint:rendezvous 端点(host:port)
如果使用多节点训练,需要在每个节点上执行 torchrun 命令,或者使用 Slurm 等集群作业管理工具。
参考:
- Getting Started with Distributed Data Parallel | PyTorch Tutorials
- Multinode Training | PyTorch Tutorials
- PyTorch Elastic Quickstart | PyTorch documentation
- torchrun (Elastic Launch) | PyTorch documentation
Accelerate(推荐)
Accelerate 是 Hugging Face 对 PyTorch DDP 的包装,使 DDP 更好用。
pip install accelerate # 安装 accelerate
训练
使用 accelerate 实现分布式训练很简单,只需对原有代码进行以下调整:
+from accelerate import Accelerator
-world_size = int(os.environ.get("WORLD_SIZE", 1))
-rank = int(os.environ.get("RANK", 0))
-local_rank = int(os.environ.get("LOCAL_RANK", 0))
-th.cuda.set_device(local_rank)
-device = th.device(f"cuda:{local_rank}")
-dist.init_process_group(backend="nccl", ...)
-sampler = DistributedSampler(dataset, ...)
-model = DDP(model, device_ids=[local_rank])
+accelerator = Accelerator()
+model, optimizer, dataloader, scheduler = accelerator.prepare(
+    model, optimizer, dataloader, scheduler
+)
 for epoch in range(args.epochs):
-    sampler.set_epoch(epoch)
     for samples, labels in dataloader:
-        samples = samples.to(device)
-        labels = labels.to(device)
         outputs = model(samples)
         loss = criterion(outputs, labels)
         optimizer.zero_grad()
-        loss.backward()
+        accelerator.backward(loss)
         optimizer.step()
         scheduler.step()
-dist.destroy_process_group()
启动分布式训练:
# Launch train.py with Accelerate: 1 machine, 8 processes, bf16 mixed
# precision, Dynamo backend "no"; main process address 127.0.0.1:29500.
accelerate launch \
--multi_gpu \
--main_process_ip '127.0.0.1' \
--main_process_port 29500 \
--num_machines 1 \
--num_processes 8 \
--machine_rank=0 \
--dynamo_backend no \
--mixed_precision bf16 \
train.py
- --multi_gpu:启用分布式训练框架
- --main_process_ip:主进程地址
- --main_process_port:主进程端口
- --num_machines:节点数
- --num_processes:进程数
- --machine_rank:当前节点序号
- --dynamo_backend:Dynamo 后端,默认为 no
- --mixed_precision:混合精度模式,可选值 {no, fp16, bf16, fp8}
因为 Accelerate 是对 PyTorch DDP 的封装,因此也可以使用 torchrun 启动分布式训练。
参考:Accelerate | Hugging Face Docs
Trainer
Trainer 是最高层次的抽象,我们连训练流程代码都不用写了,只需设置训练参数即可。Trainer 会为我们料理一切:
from transformers import Trainer, TrainingArguments
# Model to train, built with its default dimensions
model = SimpleModel()

# Training hyperparameters
# NOTE(review): newer transformers releases appear to have renamed
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="basic-trainer",
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    remove_unused_columns=False,
)
def collate_fn(examples):
    """Collate ``(x, label)`` examples into a batch dict.

    Returns a dict whose ``"x"`` entry stacks every example's first element
    and whose ``"labels"`` entry is a tensor of every second element.
    """
    features = []
    targets = []
    for example in examples:
        features.append(example[0])
        targets.append(example[1])
    return {"x": th.stack(features), "labels": th.tensor(targets)}
class SimpleTrainer(Trainer):
    """Trainer subclass with a custom loss for a model that takes one tensor input."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the classification loss for one batch.

        Args:
            model: Model called on ``inputs["x"]``.
            inputs: Batch dict with ``"x"`` (features) and ``"labels"`` (targets).
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted so the override also works with
                Trainer versions that pass this keyword; not used here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        outputs = model(inputs["x"])
        targets = inputs["labels"]
        # SimpleModel ends in a plain Linear layer, so `outputs` are raw logits.
        # cross_entropy applies log-softmax itself; nll_loss would expect
        # log-probabilities and give a meaningless loss here.
        loss = F.cross_entropy(outputs, targets)
        return (loss, outputs) if return_outputs else loss
# NOTE(review): train_dset / test_dset are not defined in this snippet —
# presumably built elsewhere (e.g. with create_dummy_dataset); confirm.
trainer = SimpleTrainer(
    model,
    training_args,
    train_dataset=train_dset,
    eval_dataset=test_dset,
    data_collator=collate_fn,
)
# Trainer's entry point for training is train(); it has no fit() method.
trainer.train()
参考:从 PyTorch DDP 到 Accelerate 到 Trainer,轻松掌握分布式训练 | Hugging Face

浙公网安备 33010602011771号