# 2 异步SGD

## 2.1 算法描述与实现

def run(rank, world_size):
    """Per-process entry point: rank 0 becomes the parameter server ("ps"),
    every other rank becomes a trainer that waits for RPCs from the ps.

    Args:
        rank: index of this process in [0, world_size).
        world_size: total number of RPC participants (1 ps + n trainers).
    """
    # NOTE(review): MASTER_ADDR was evidently lost in extraction; rpc.init_rpc
    # requires both the address and the port of the rank-0 process.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.TensorPipeRpcBackendOptions(
        rpc_timeout=0  # infinite timeout
    )
    if rank == 0:
        rpc.init_rpc(
            "ps",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        # The ps drives the whole training job.
        run_ps([f"trainer{r}" for r in range(1, world_size)])
    else:
        rpc.init_rpc(
            f"trainer{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        # trainer passively waiting for ps to kick off training iterations

    # block until all rpcs finish
    rpc.shutdown()

if __name__ == "__main__":
    # One process per trainer plus one for the parameter server.
    world_size = n_workers + 1
    mp.spawn(run, args=(world_size, ), nprocs=world_size, join=True)


def run_trainer(ps_rref, train_dataset):
    """Executed on a trainer node via RPC: build a Trainer bound to the
    parameter server's RRef and train on this node's local data shard."""
    trainer = Trainer(ps_rref)
    trainer.train(train_dataset)

def run_ps(trainers):
    """Runs on the parameter server: load MNIST, split it into per-trainer
    shards, launch training on every trainer asynchronously, wait for all of
    them to finish, then evaluate the final global model.

    Args:
        trainers: list of trainer worker names (e.g. ["trainer1", ...]).
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    # NOTE(review): the dataset constructor head was lost in extraction;
    # reconstructed symmetrically to the test-set loader used by the ps.
    train_dataset = datasets.MNIST('../data', train=True, download=True,
                                   transform=transform)
    local_train_datasets = dataset_split(train_dataset, n_workers)

    print(f"{datetime.now().strftime('%H:%M:%S')} Start training")
    ps = ParameterServer()
    ps_rref = rpc.RRef(ps)
    futs = []
    for idx, trainer in enumerate(trainers):
        futs.append(
            rpc.rpc_async(trainer, run_trainer, args=(ps_rref, local_train_datasets[idx]))
        )

    # Block until every trainer has finished all of its epochs.
    torch.futures.wait_all(futs)
    print(f"{datetime.now().strftime('%H:%M:%S')} Finish training")
    ps.evaluation()


class CustomSubset(Subset):
'''A custom subset class with customizable data transformation'''
def __init__(self, dataset, indices, subset_transform=None):
super().__init__(dataset, indices)
self.subset_transform = subset_transform

def __getitem__(self, idx):
x, y = self.dataset[self.indices[idx]]
if self.subset_transform:
x = self.subset_transform(x)
return x, y

def __len__(self):
return len(self.indices)

def dataset_split(dataset, n_workers):
    """Partition `dataset` into `n_workers` contiguous shards.

    Each worker gets ``len(dataset) // n_workers`` samples; the last worker
    additionally absorbs the remainder, so every sample is assigned exactly
    once.

    Returns:
        list of CustomSubset, one per worker.
    """
    n_samples = len(dataset)
    n_sample_per_workers = n_samples // n_workers
    local_datasets = []
    for w_id in range(n_workers):
        start = w_id * n_sample_per_workers
        if w_id < n_workers - 1:
            end = (w_id + 1) * n_sample_per_workers
        else:
            end = n_samples  # last shard takes the remainder
        local_datasets.append(CustomSubset(dataset, range(start, end)))
    return local_datasets


class ParameterServer(object):
    """Holds the global model; trainers push gradients and fetch updates.

    Gradients from trainers are accumulated into ``model``'s ``.grad``
    buffers; once ``n_workers`` pushes have arrived, one optimizer step is
    taken and the shared future is resolved with the new model.

    NOTE(review): several lines of this class were lost in extraction
    (the lock, grad initialization, test_loader head, the
    ``update_and_fetch_model`` signature and parts of its body, and the
    evaluation loop). They are reconstructed from the surviving fragments
    following the official PyTorch batch-update parameter-server tutorial
    that this code mirrors — confirm against the original post.
    """

    def __init__(self, n_workers=n_workers):
        import threading  # local import: protects shared update state below
        self.model = Net().to(device)
        # Guards curr_update_size / future_model (used in update_and_fetch_model).
        self.lock = threading.Lock()
        # Future resolved with the refreshed model after each global update.
        self.future_model = torch.futures.Future()
        self.n_workers = n_workers
        self.curr_update_size = 0
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.001, momentum=0.9)
        # Pre-allocate .grad so trainer gradients can be accumulated with +=.
        for p in self.model.parameters():
            p.grad = torch.zeros_like(p)
        self.test_loader = torch.utils.data.DataLoader(
            datasets.MNIST('../data', train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=32, shuffle=True)

    def get_model(self):
        # TensorPipe RPC backend only supports CPU tensors,
        # so we move the tensors to CPU before sending them over RPC.
        # NOTE(review): Module.to() is in-place — this also leaves the ps's
        # own copy on CPU until evaluation() moves it back.
        return self.model.to("cpu")

    @staticmethod
    @rpc.functions.async_execution
    def update_and_fetch_model(ps_rref, grads):
        """RPC target: accumulate one trainer's gradients; when all
        n_workers have reported, apply one SGD step and release the new
        model to every waiting trainer via the shared future."""
        self = ps_rref.local_value()
        # Accumulate this trainer's gradients into the global grad buffers.
        for p, g in zip(self.model.parameters(), grads):
            p.grad += g
        with self.lock:
            self.curr_update_size += 1
            fut = self.future_model

            if self.curr_update_size >= self.n_workers:
                # Average the accumulated gradients over all contributors.
                for p in self.model.parameters():
                    p.grad /= self.n_workers
                self.curr_update_size = 0
                self.optimizer.step()
                # Reset (not None) so += accumulation keeps working next round.
                self.optimizer.zero_grad(set_to_none=False)
                fut.set_result(self.model)
                self.future_model = torch.futures.Future()

        return fut

    def evaluation(self):
        """Evaluate the final global model on the MNIST test set."""
        self.model.eval()
        self.model = self.model.to(device)  # get_model() may have moved it to CPU
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in self.test_loader:
                output = self.model(data.to(device))
                test_loss += F.nll_loss(output, target.to(device), reduction='sum').item()  # sum up batch loss
                pred = output.max(1)[1]  # get the index of the max log-probability
                correct += pred.eq(target.to(device)).sum().item()

        print('\nTest result - Accuracy: {}/{} ({:.0f}%)\n'.format(
            correct, len(self.test_loader.dataset),
            100. * correct / len(self.test_loader.dataset)))


（若忘记在发送前将模型移到 CPU，RPC 层会抛出类似下面的错误，这正是 `get_model` 中调用 `to("cpu")` 的原因：）

```
TensorPipe RPC backend only supports CPU tensors, please move your tensors to CPU before sending them over RPC.
Found tensor on device: cuda:0
```


class Trainer(object):
    """Runs on a trainer node: repeatedly pulls the global model from the
    parameter server, computes gradients on local data, and pushes them
    back, receiving the refreshed model in return."""

    def __init__(self, ps_rref):
        self.ps_rref = ps_rref
        self.model = Net().to(device)

    def train(self, train_dataset):
        # Fetch the initial global model (sent over RPC on CPU, moved to GPU).
        model = self.ps_rref.rpc_sync().get_model().cuda()
        pid = os.getpid()
        # NOTE(review): the DataLoader construction was lost in extraction;
        # reconstructed using the globally configured batch size — confirm
        # against the original post.
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True)
        for epoch in range(epochs):
            for batch_idx, (data, target) in enumerate(train_loader):
                output = model(data.to(device))
                loss = F.nll_loss(output, target.to(device))
                loss.backward()
                # Push gradients to the ps and block until it hands back the
                # updated global model (resolved via async_execution future).
                model = rpc.rpc_sync(
                    self.ps_rref.owner(),
                    ParameterServer.update_and_fetch_model,
                    args=(self.ps_rref, [p.grad for p in model.cpu().parameters()]),
                ).cuda()
                if batch_idx % log_interval == 0:
                    print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        pid, epoch + 1, batch_idx * len(data), len(train_loader.dataset),
                        100. * batch_idx / len(train_loader), loss.item()))


Test result - Accuracy: 9696/10000 (97%)


## 2.2 收敛性分析

ASGD避开了同步开销，但会给模型更新增加一些延迟。我们下面将ASGD的工作流程用下图加以剖析来解释这一点。用$$\text{worker}(k)$$来代表第$$k$$个工作节点，用$$w^t$$来代表第$$t$$轮迭代时服务端的全局模型。按照时间顺序，首先$$\text{worker}(k)$$先从参数服务器获取全局模型$$w^t$$，再根据本地数据计算模型梯度$$g(w^t)$$并将其发往参数服务器。一段时间后，$$\text{worker}(k')$$也从参数服务器取回当时的全局模型$$w^{t+1}$$，并同样依据它的本地数据计算模型的梯度$$g(w^{t+1})$$。注意，在$$\text{worker}(k')$$取回参数并进行计算的过程中，其它工作节点（比如$$\text{worker}(k)$$）可能已经将它的梯度提交给服务器并进行更新了。所以当$$\text{worker}(k')$$将其梯度$$g(w^{t+1})$$发给服务器时，全局模型已经不再是$$w^{t+1}$$，而已经是被更新过的版本——这份梯度因而是“延迟”的。

# 3 Hogwild!算法

## 3.1 算法描述与实现

Hogwild！算法[3]为了提高训练过程中的数据吞吐量，选择了无锁的全局模型访问，其工作逻辑如下所示：

from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
from torchvision import datasets, transforms
import os
import torch
import torch.optim as optim
import torch.nn.functional as F

# --- Hyperparameters / runtime configuration for Hogwild! training ---
batch_size = 64 # input batch size for training
test_batch_size = 1000 # input batch size for testing
epochs = 10 # number of global epochs to train
lr = 0.01 # learning rate
momentum = 0.5 # SGD momentum
seed = 1 # random seed
log_interval = 10 # how many batches to wait before logging training status
n_workers = 4 # how many training processes to use
cuda = True # enables CUDA training
mps = False # enables macOS GPU training
dry_run = False # quickly check a single pass

def train(rank, model, device, dataset, dataloader_kwargs):
    """Hogwild! worker loop: all workers share `model`'s memory (via
    model.share_memory()) and update it concurrently without locks.

    Args:
        rank: worker index, used to decorrelate RNG streams.
        model: the shared Net instance.
        device: torch.device to run on.
        dataset: this worker's training dataset.
        dataloader_kwargs: kwargs forwarded to DataLoader.
    """
    # Distinct seed per worker so workers draw different batch orders.
    torch.manual_seed(seed + rank)
    # NOTE(review): DataLoader creation was lost in extraction; reconstructed
    # from the call site, which passes `dataset` and `dataloader_kwargs`.
    train_loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    for epoch in range(1, epochs + 1):
        model.train()
        pid = os.getpid()
        for batch_idx, (data, target) in enumerate(train_loader):
            # Was missing in the garbled source: without zero_grad the
            # gradients would accumulate across batches.
            optimizer.zero_grad()
            output = model(data.to(device))
            loss = F.nll_loss(output, target.to(device))
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    pid, epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
            if dry_run:
                break

def test(model, device, dataset, dataloader_kwargs):
    """Evaluate the shared model on the test set and print loss/accuracy.

    NOTE(review): the ``def`` header, DataLoader construction, and the
    evaluation loop head were lost in extraction; reconstructed from the
    call site ``test(model, device, test_dataset, kwargs)`` and the printed
    output format shown below the code.
    """
    torch.manual_seed(seed)
    test_loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():  # no gradients needed during evaluation
        for data, target in test_loader:
            output = model(data.to(device))
            test_loss += F.nll_loss(output, target.to(device), reduction='sum').item()  # sum up batch loss
            pred = output.max(1)[1]  # get the index of the max log-probability
            correct += pred.eq(target.to(device)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Global loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

class Net(nn.Module):
    """Small CNN for MNIST: two conv+maxpool stages followed by two fully
    connected layers; outputs per-class log-probabilities."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        # Input: (N, 1, 28, 28) -> (N, 10, 12, 12) -> (N, 20, 4, 4)
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)  # flatten: 20 * 4 * 4 = 320
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

if __name__ == '__main__':
    use_cuda = cuda and torch.cuda.is_available()
    use_mps = mps and torch.backends.mps.is_available()
    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    print(device)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    # NOTE(review): the train-set constructor head was lost in extraction;
    # reconstructed symmetrically to the test set below.
    train_dataset = datasets.MNIST('../data', train=True, download=True,
                                   transform=transform)
    test_dataset = datasets.MNIST('../data', train=False,
                                  transform=transform)
    kwargs = {'batch_size': batch_size,
              'shuffle': True}
    if use_cuda:
        kwargs.update({'num_workers': 1,
                       'pin_memory': True,
                       })

    torch.manual_seed(seed)
    # 'spawn' is required for sharing CUDA tensors across processes.
    mp.set_start_method('spawn', force=True)

    model = Net().to(device)
    model.share_memory()  # gradients are allocated lazily, so they are not shared here

    # We first train the model across n_workers processes.
    processes = []
    for rank in range(n_workers):
        p = mp.Process(target=train, args=(rank, model, device,
                                           train_dataset, kwargs))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # Once training is complete, we can test the model.
    test(model, device, test_dataset, kwargs)


Test set: Global loss: 0.0325, Accuracy: 9898/10000 (99%)


## 3.2 收敛性分析

$l(w) = \sum_{e\in E}f_e(w_e)$

$\Omega:=\max_{e\in E}|e|\\ \Delta:=\frac{\underset{1\leqslant v \leqslant n}{\max}|\{e\in E: v\in e\}|}{|E|}\\ \rho:=\frac{\underset{e\in E}{\max}|\{\hat{e}\in E: \hat{e}\cap e \neq \emptyset \}|}{|E|}$

# 参考

posted @ 2023-02-13 22:07  orion-orion  阅读(727)  评论(0编辑  收藏  举报