I am trying to apply DataParallel to an RNN model.
Here is part of my code:
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    # Wrap both models so the batch (dim 0) is split across the GPUs
    encoder = nn.DataParallel(encoder, dim=0)
    decoder = nn.DataParallel(decoder, dim=0)
class EncoderRNN(nn.Module):
    def __init__(self, vocal_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocal_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, input_batch, input_batch_length, hidden):
        print(input_batch)
        print(input_batch_length)
        print(hidden)
        embedded = self.embedding(input_batch)
        # Pack the padded batch so the GRU skips the padding positions
        packed_input = nn.utils.rnn.pack_padded_sequence(
            embedded, input_batch_length.cpu().numpy(), batch_first=True)
        output, hidden = self.gru(packed_input, hidden)
        return output, hidden

    def init_hidden(self, batch_size):
        # Initial hidden state of shape (num_layers, batch, hidden_size)
        result = torch.autograd.Variable(torch.zeros(1, batch_size, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result
I can guarantee that all of my inputs are CUDA tensors, but I still get the error below. It looks as if my inputs are not all on CUDA (see the device check sketched at the end of this post).
Traceback (most recent call last):
  File "train.py", line 156, in <module>
    train_iteration(encoder, decoder, fileDataSet)
  File "train.py", line 122, in train_iteration
    target_indices, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
  File "train.py", line 49, in train
    encoder_output, encoder_hidden = encoder(input_batch, input_batch_length, encoder_hidden)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 357, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 74, in forward
    return self.gather(outputs, self.output_device)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 86, in gather
    return gather(outputs, output_device, dim=self.dim)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 65, in gather
    return gather_map(outputs)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 60, in gather_map
    return type(out)(map(gather_map, zip(*outputs)))
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 60, in gather_map
    return type(out)(map(gather_map, zip(*outputs)))
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/utils/rnn.py", line 39, in __new__
    return super(PackedSequence, cls).__new__(cls, *args[0])
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 57, in gather_map
    return Gather.apply(target_device, dim, *outputs)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 58, in forward
    assert all(map(lambda i: i.is_cuda, inputs))
AssertionError
This code works fine on a single GPU, but I have not found any similar issue on Google.
Contents of input_batch:
2.0000e+00 6.2900e+02 5.4000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00
2.0000e+00 1.6759e+04 6.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00
2.0000e+00 7.2000e+01 3.3500e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00
2.0000e+00 5.4000e+01 1.2900e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00
[torch.cuda.LongTensor of size (4,2687) (GPU 0)]
input_batch_length:
1844
1507
1219
1021
[torch.cuda.LongTensor of size (4,) (GPU 0)]
hidden:
( 0 ,.,.) =
0 0 0 ... 0 0 0
0 0 0 ... 0 0 0
0 0 0 ... 0 0 0
0 0 0 ... 0 0 0
[torch.cuda.FloatTensor of size (1,4,256) (GPU 0)]
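
To make "all inputs are cuda" concrete, this is roughly the check I have in mind. It is a standalone sketch rather than my actual train.py: it takes the three tensors printed above plus the encoder's embedding layer (assumed to already be on the GPU), rebuilds the PackedSequence the same way forward() does, and reports where each piece lives. The helper name report_devices is just for illustration.

import torch
import torch.nn as nn

# Hypothetical sanity-check helper, not part of the real training script.
def report_devices(embedding, input_batch, input_batch_length, hidden):
    # The three arguments after `embedding` are the tensors printed above.
    embedded = embedding(input_batch)
    # Same packing call as in EncoderRNN.forward()
    packed = nn.utils.rnn.pack_padded_sequence(
        embedded, input_batch_length.cpu().numpy(), batch_first=True)
    pieces = [
        ('input_batch', input_batch),
        ('input_batch_length', input_batch_length),
        ('hidden', hidden),
        ('packed.data', packed.data),
        ('packed.batch_sizes', packed.batch_sizes),
    ]
    for name, value in pieces:
        # batch_sizes may be a CPU tensor or a plain list depending on the PyTorch version,
        # so fall back to 'n/a' if it has no is_cuda attribute
        print(name, getattr(value, 'is_cuda', 'n/a'))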