I can't get my PyTorch model for machine translation to train on the GPU in Google Colab.
This is the error:
RuntimeError: Expected object of backend CPU but got backend CUDA for argument #4 'mat1'
It appears right after this line:
self.attn_weights = F.softmax(self.attn(self.attn_weights), dim=1).to(device)
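If I understand the message correctly, it means a layer whose weights are still on the CPU is being fed a CUDA tensor ('mat1' is the input to the underlying matrix multiply). A toy snippet like the one below (not from my notebook, just to illustrate the mismatch) raises the same kind of error:

import torch
import torch.nn as nn

device = torch.device("cuda")            # same idea as the device variable in my notebook
layer = nn.Linear(256, 256)              # never moved with .to(device), so it stays on the CPU
x = torch.randn(64, 256, device=device)  # input lives on the GPU
layer(x)                                 # raises a CPU/CUDA backend mismatch error like the one above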
The problem is that I have already moved both the Attention class and the model to the device. I also tried using the device variable inside Attention.forward(), where the error occurs, but it didn't help. Here is the code that defines the model:
input_dim = len(SRC.vocab)
output_dim = len(TRG.vocab)
src_embd_dim = tgt_embd_dim = 256
#hidden_dim = 512
hidden_dim = 256
#num_layers = 2
num_layers = 1
dropout_prob = 0.2
batch_size = 64
PAD_IDX = TRG.vocab.stoi['<pad>']
iterators = BucketIterator.splits((train_data, valid_data, test_data),
                                  batch_size=batch_size, device=device)
train_iterator, valid_iterator, test_iterator = iterators
attention = Attention(batch_size, hidden_dim, "dot").to(device)
enc = Encoder(input_dim, src_embd_dim, hidden_dim, num_layers, dropout_prob)
dec = DecoderAttn(output_dim, tgt_embd_dim, hidden_dim, num_layers, attention, dropout_prob)
model = Seq2Seq(enc, dec, device).to(device)
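To double-check that the weights really end up on the GPU, a generic check like this can be run after building the model (standard PyTorch, not part of the assignment code):

# print the device of every registered parameter; I expect cuda:0 for all of them
for name, p in model.named_parameters():
    print(name, p.device)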
Here are the Attention class and the decoder with attention, in case they are useful:
class Attention(nn.Module):
    def __init__(self, batch_size, hidden_size, method="cat"): # add parameters needed for your type of attention
        super(Attention, self).__init__()
        self.method = method # attention method you'll use. e.g. "cat", "one-layer-net", "dot", ...
        #<YOUR CODE HERE>
        self.batch_size = batch_size
        self.hidden_size = hidden_size

    def forward(self, embedded, last_hidden, encoder_outputs, seq_len=None):
        self.max_length = encoder_outputs.shape[0]
        if self.method == "cat":
            self.attn = nn.Linear(self.hidden_size * 2, self.hidden_size)
        if self.method == "dot":
            self.attn = nn.Linear(self.hidden_size, self.hidden_size)
        #self.attn_combine = nn.Linear(self.batch_size * 2, self.hidden_size)
        if self.method == "cat" or self.method == "dot":
            print(device)
            if self.method == "cat":
                self.attn_weights = torch.cat((embedded[0], last_hidden[0]), 1).to(device)
            elif self.method == "dot":
                self.attn_weights = torch.mul(embedded[0], last_hidden[0]).to(device)
            '''print(attn_weights.unsqueeze(1).shape, encoder_outputs.transpose(0, 1).shape)
            attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                     encoder_outputs.transpose(0, 1))
            print(embedded.shape, attn_applied.shape)
            output = torch.cat((embedded.transpose(0, 1),
                                attn_applied), 1)
            print(output.shape)
            output = self.attn_combine(output)
            return output'''
            self.attn_weights = F.softmax(self.attn(self.attn_weights), dim=1).to(device)
            self.attn_weights = self.attn_weights.unsqueeze(0)
            return self.attn_weights
        else:
            raise NotImplementedError
class DecoderAttn(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, attention, dropout=0.1):
        super(DecoderAttn, self).__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.attention = attention # instance of Attention class
        self.dropout = dropout
        # define layers
        self.embedding = nn.Embedding(self.output_dim, self.emb_dim)
        #self.rnn = nn.LSTM(self.hid_dim, self.hid_dim, self.n_layers) # (lstm: embd, hid, layers, dropout)
        self.rnn = nn.GRU(self.hid_dim, self.hid_dim)
        self.out = nn.Linear(self.hid_dim, self.output_dim) # projection: hid_dim x output_dim
        self.dropout = nn.Dropout(dropout)
        # more layers you'll need for attention
        #<YOUR CODE HERE>

    def forward(self, input_, hidden, encoder_output):
        # make decoder with attention
        # use code from seminar notebook as base and add attention to it
        #<YOUR CODE HERE>
        input_ = input_.unsqueeze(0)
        # (1 x batch_size x emb_dim)
        embedded = self.embedding(input_) # embed the input and apply dropout
        embedded = self.dropout(embedded)
        output = self.attention.forward(embedded, hidden, encoder_output)
        output = F.relu(output)
        output, hidden = self.rnn(output, hidden)
        prediction = F.log_softmax(self.out(output[0]), dim=1)
        #prediction = self.out(output.squeeze(0)) # project the rnn output onto the output dim
        return prediction, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
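If it helps with debugging, here is a standalone call into Attention.forward with dummy CUDA tensors (sizes made up from the hyperparameters above, reusing the same device variable); it goes through the same path as the decoder and should reproduce the same RuntimeError:

# minimal repro: feed dummy GPU tensors straight into the attention module
attn = Attention(batch_size, hidden_dim, "dot").to(device)
embedded = torch.randn(1, batch_size, hidden_dim, device=device)          # (1, batch, hidden)
last_hidden = torch.randn(1, batch_size, hidden_dim, device=device)       # (1, batch, hidden)
encoder_outputs = torch.randn(10, batch_size, hidden_dim, device=device)  # 10 is a dummy source length
attn_weights = attn(embedded, last_hidden, encoder_outputs)               # fails on the softmax line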
So here is my notebook on Colab (you can open it if you have a Google account): https://colab.research.google.com/drive/18CQ0jHGsiK8ValtgI3EIKAQDRF_-y-cA