PyTorch single-machine multi-GPU training


1. Single-machine multi-GPU

1.1 DataParallel

DataParallel (DP) follows the parameter-server pattern: one GPU acts as the reducer, and enabling it is extremely simple, just one line of code. Its unacceptable drawback is that, because DataParallel is a parameter-server style algorithm, all outputs are gathered and the loss is computed on the master GPU, so the load is badly imbalanced. With a larger model (e.g. bert-large) the master GPU can be completely full while the other GPUs are less than half utilized, which wastes resources. Example code:

# coding=utf-8
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        return output


input_size = 5
output_size = 2
batch_size = 30
data_size = 30

dataset = RandomDataset(input_size, data_size)
rand_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

model = Model(input_size, output_size)
if torch.cuda.is_available():
    model.cuda()
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)  # key line: wrap the model with DataParallel

for data in rand_loader:
    if torch.cuda.is_available():
        input_var = data.cuda()
    else:
        input_var = data
    output = model(input_var)
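To see how DataParallel actually splits the work, a minimal sketch (assuming at least two visible GPUs; EchoModel is a hypothetical toy module, not from the original post) can print tensor shapes inside forward: each replica receives a slice of the batch, and the outputs are gathered back on GPU 0.

# Minimal sketch (assumes >= 2 visible GPUs): print shapes inside forward
# to observe DataParallel scattering the batch and gathering the outputs.
import torch
import torch.nn as nn


class EchoModel(nn.Module):  # hypothetical toy module for illustration
    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.fc(x)
        # Runs once per GPU replica, each on its own slice of the batch.
        print(f"device={x.device} in={tuple(x.shape)} out={tuple(out.shape)}")
        return out


model = nn.DataParallel(EchoModel(5, 2).cuda())
inp = torch.randn(30, 5).cuda()                    # full batch lives on GPU 0
out = model(inp)                                   # scattered to replicas, gathered back
print("gathered output:", out.shape, out.device)   # (30, 2) on cuda:0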

1.2 DistributedDataParallel

Yes, you read that right: this module was designed for distributed training. Even on a single machine with multiple GPUs, however, the official recommendation is to use the newer DistributedDataParallel, which synchronizes gradients with an all-reduce algorithm instead of a parameter server.

(1) Initialize the backend

torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
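With init_method='env://', the process group reads MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE from environment variables that the launcher sets for every spawned process. A small sketch (to be run after the two lines above) that inspects the resulting setup with standard torch.distributed calls:

# Sketch: inspect the process group created from the launcher's env variables.
import os
import torch.distributed as dist

rank = dist.get_rank()               # global rank of this process
world_size = dist.get_world_size()   # total number of processes (one per GPU)
print(f"rank {rank}/{world_size}, "
      f"MASTER_ADDR={os.environ.get('MASTER_ADDR')}, "
      f"MASTER_PORT={os.environ.get('MASTER_PORT')}")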

(2) Wrap the model. This step is also simple: just wrap the model with DistributedDataParallel:

model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)

(3) Parallelize the data. Note that when a sampler is passed, shuffle must be set to False (the DistributedSampler takes care of shuffling), and the DataLoader's num_workers is set independently per process, i.e. per GPU.

dataset = RandomDataset(input_size, data_size)
sampler = torch.utils.data.distributed.DistributedSampler(dataset)
rand_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, sampler=sampler)
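When training for several epochs, DistributedSampler also needs set_epoch() called at the start of each epoch so that the shuffling order changes between epochs. A minimal sketch (num_epochs and num_workers are placeholder values, not from the original post):

# Sketch: per-epoch reshuffling with DistributedSampler.
rand_loader = DataLoader(dataset=dataset,
                         batch_size=batch_size,
                         shuffle=False,   # the sampler shuffles instead
                         sampler=sampler,
                         num_workers=4)   # workers are per process/GPU

num_epochs = 10
for epoch in range(num_epochs):
    sampler.set_epoch(epoch)   # makes the shuffle differ from epoch to epoch
    for batch in rand_loader:
        ...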

(4) Launch script

python -m torch.distributed.launch --nproc_per_node=8 train_face_troch.py
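torch.distributed.launch spawns one process per GPU and passes --local_rank to each of them, which is why the script parses that argument. On newer PyTorch versions (roughly 1.10 and later) this launcher is deprecated in favor of torchrun, which exports a LOCAL_RANK environment variable instead of passing a flag. A sketch of the torchrun variant, assuming the same script name:

# Sketch for newer PyTorch, where torchrun replaces torch.distributed.launch.
# Launch with:
#   torchrun --nproc_per_node=8 train_face_troch.py
import os
import torch

local_rank = int(os.environ["LOCAL_RANK"])   # set by torchrun for each process
torch.cuda.set_device(local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')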

Full code example:

# coding=utf-8
import argparse

import torch
import torch.distributed
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)
        # class index per sample, so it is a valid CrossEntropyLoss target
        self.label = (torch.mean(self.data, dim=-1) > 0).long()

    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return self.len


class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        return output


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', default=0, type=int)
    args = parser.parse_args()
    return args


input_size = 5
output_size = 2
batch_size = 30
data_size = 30

args = parse_args()
local_rank = args.local_rank
torch.cuda.set_device(local_rank)  # set the default CUDA device; different for each rank
torch.distributed.init_process_group(backend='nccl', init_method='env://')

dataset = RandomDataset(input_size, data_size)
sampler = torch.utils.data.distributed.DistributedSampler(dataset)
rand_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, sampler=sampler)

model = Model(input_size, output_size)
if torch.cuda.is_available():
    model.cuda()
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)

optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss()

for data, label in rand_loader:
    data = data.cuda()
    label = label.cuda()
    output = model(data)
    loss = criterion(output, label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
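Two practical notes on this script: DistributedSampler gives each process a disjoint shard of the dataset, so the effective global batch size is batch_size * world_size, and anything that should happen only once (printing, logging) is usually guarded by a rank check, since every GPU runs its own copy of the script. A short sketch of such a guard:

# Sketch: run one-off side effects on a single process only.
import torch.distributed as dist

if dist.get_rank() == 0:   # rank 0 is conventionally the "main" process
    print(f"world_size={dist.get_world_size()}, "
          f"global batch size={batch_size * dist.get_world_size()}")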

1.3 DistributedDataParallel + apex

For large-scale training, mixed-precision training is a must; once you have tried the speedup you will not go back. Reference: https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed

Only two places need to change: the amp.initialize call and the backward pass.

# coding=utf-8
import argparse

import torch
import torch.distributed
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from apex import amp


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)
        # class index per sample, so it is a valid CrossEntropyLoss target
        self.label = (torch.mean(self.data, dim=-1) > 0).long()

    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return self.len


class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        return output


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', default=0, type=int)
    args = parser.parse_args()
    return args


input_size = 5
output_size = 2
batch_size = 30
data_size = 30

args = parse_args()
local_rank = args.local_rank

# initialization
torch.cuda.set_device(local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')

dataset = RandomDataset(input_size, data_size)
sampler = torch.utils.data.distributed.DistributedSampler(dataset)
rand_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, sampler=sampler)

model = Model(input_size, output_size).cuda()  # move to this rank's GPU before amp/DDP
optimizer = torch.optim.Adam(model.parameters())
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')  # the letter O, not zero
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)

criterion = torch.nn.CrossEntropyLoss()

for data, label in rand_loader:
    data = data.cuda()
    label = label.cuda()
    output = model(data)
    loss = criterion(output, label)
    optimizer.zero_grad()
    # loss.backward() is replaced by the scaled-loss version below
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
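Note that apex's amp module has since been deprecated in favor of the mixed-precision support built into PyTorch itself (torch.cuda.amp, available from PyTorch 1.6). On a recent version the same training step can be written without apex; a minimal sketch of the loop body, assuming model, optimizer, criterion and rand_loader are defined as above:

# Sketch using native PyTorch mixed precision (torch.cuda.amp) instead of apex.
scaler = torch.cuda.amp.GradScaler()

for data, label in rand_loader:
    data = data.cuda()
    label = label.cuda()
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():   # run forward + loss in mixed precision
        output = model(data)
        loss = criterion(output, label)
    scaler.scale(loss).backward()     # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)            # unscales gradients, then optimizer.step()
    scaler.update()                   # adjust the scale factor for the next step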

1.4 Other issues

By default, torch.load restores tensors to the GPU recorded when the parameters were saved, but under DistributedDataParallel each process needs to load them onto its own GPU. You can therefore pass a map_location to torch.load like this:

torch.load(params_path, map_location=lambda storage, loc: storage.cuda(self.local_rank))
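Putting the save and load sides together, a common pattern is to save the unwrapped state dict from rank 0 only and then have every rank load it onto its own device. A sketch, where 'checkpoint.pt' is a placeholder path and model is the DDP-wrapped module from the scripts above:

# Sketch: rank-0 save, then every rank loads onto its own GPU.
import torch.distributed as dist

if dist.get_rank() == 0:
    # unwrap the DDP container so keys have no 'module.' prefix
    torch.save(model.module.state_dict(), 'checkpoint.pt')
dist.barrier()   # make sure the file exists before other ranks read it

state_dict = torch.load('checkpoint.pt',
                        map_location=lambda storage, loc: storage.cuda(local_rank))
model.module.load_state_dict(state_dict)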

2. Multi-machine multi-GPU

Coming soon.