I am trying to train a simple PyTorch model in parallel across two GPU servers. I built PyTorch from source. When the process runs on the second server, the program fails with "RuntimeError: No route to host". How can I fix this?
I have tried both the Anaconda and from-source installations of PyTorch, CUDA and NCCL.
I copied the following code from https://yangkky.github.io/2019/07/08/distributed-pytorch-tutorial.html:

import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
#from apex.parallel import DistributedDataParallel as DDP
#from apex import amp


class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.fc = nn.Linear(7*7*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=1, type=int, help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int, help='ranking within the nodes')
    parser.add_argument('--epochs', default=2, type=int, metavar='N', help='number of total epochs to run')
    args = parser.parse_args()
    args.world_size = args.gpus * args.nodes
    os.environ['MASTER_ADDR'] = '192.168.0.238'
    os.environ['MASTER_PORT'] = '8888'
    mp.spawn(train, nprocs=args.gpus, args=(args,))
    #train(0, args)


def train(gpu, args):
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=args.world_size,
        rank=rank
    )
    model = ConvNet()
    print('gpu:', gpu)
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=args.world_size,
        rank=rank
    )
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            outputs = model(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1,
                    args.epochs,
                    i + 1,
                    total_step,
                    loss.item()
                ))
    if gpu == 0:
        print("Training complete in:" + str(datetime.now() - start))


if __name__ == '__main__':
    main()
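For completeness, with the argument parser above a two-node run would be launched with one command per server, differing only in -nr (the script name train.py is a placeholder here, and I assume one GPU per node with -g 1):

python train.py -n 2 -g 1 -nr 0 --epochs 2    # on the master node (192.168.0.238)
python train.py -n 2 -g 1 -nr 1 --epochs 2    # on the second server

Every process has to reach MASTER_ADDR on MASTER_PORT during init_process_group, and that connection is typically where "No route to host" surfaces when the nodes cannot reach each other.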
Answer 0 (score: 0)
This may be caused by a firewall.
Follow https://superuser.com/a/1359965 to open the port.
1. Install netcat to help us check the network.
For Ubuntu:
apt-get install netcat
2. Check whether your master process is reachable.
On node0 (the master node):
nc -vv localhost <port>
Output like Connection to localhost <port> port [tcp/tproxy] succeeded! means your master process is running correctly. Otherwise, check whether the program on the master node is running properly.
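To also test the path between the two machines (not just localhost), the same check can be run from the second server against the master address and port used in the code above, for example:

nc -vv 192.168.0.238 8888

If this times out or reports "No route to host" while the localhost check on node0 succeeds, traffic is being blocked between the nodes, which points to the firewall issue in the next step.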
3. Close the firewall.
If the program on the master node is running, and the two nodes are otherwise connected properly, this is probably a firewall problem. Check https://superuser.com/a/1359965 to open your ports; this will likely fix the problem.
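As a concrete sketch of opening the port (assuming Ubuntu with ufw; adapt the port to your MASTER_PORT, and note that NCCL may open additional connections on other ports):

sudo ufw allow 8888/tcp    # allow the master port used in the code above
sudo ufw status            # confirm the rule is active

On distributions using firewalld, the equivalent is firewall-cmd --add-port=8888/tcp --permanent followed by firewall-cmd --reload. Temporarily disabling the firewall on both nodes is another way to confirm the diagnosis before opening specific ports.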