为什么在PyTorch中发生此CUDA错误?

时间:2020-07-15 09:38:05

标签: python deep-learning pytorch

在构建RNN模型的过程中,遇到以下错误。以下是我的代码的一部分:

class RNN(nn.Module):
    """Single-layer bidirectional LSTM classifier over token-id sequences.

    The vocabulary size, embedding width and dropout probability are read
    from the module-level names ``emb_num``, ``emb_size`` and
    ``dropout_rate`` when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(emb_num, emb_size)
        self.dropout1 = nn.Dropout(dropout_rate)
        # The LSTM consumes the embedding output, so its input size has to
        # follow emb_size instead of a hard-coded width.
        self.LSTM = nn.LSTM(emb_size, 128, 1, bidirectional=True)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.full_connect = nn.Linear(256, 5)  # 2 directions * 128 hidden units

    def forward(self, x):
        """Return class logits of shape ``(batch, 5)``.

        Args:
            x: integer token ids of shape ``(batch, seq_len)``.
        """
        x = self.embedding(x)
        # nn.LSTM defaults to batch_first=False, so move the sequence axis first.
        x = x.permute(1, 0, 2)
        x = self.dropout1(x)
        _, (hn, _) = self.LSTM(x)
        out = self.dropout2(hn)  # (2, batch, 128): one final state per direction
        # Concatenate the forward and backward final states -> (batch, 256).
        out = torch.cat([out[0], out[1]], 1)
        # No squeeze() here: the tensor is already 2-D, and squeezing would
        # drop the batch axis for a batch of one sample, breaking the loss
        # and argmax(dim=1) calls that expect (batch, classes).
        return self.full_connect(out)

def train():
    """Train the module-level ``model`` and checkpoint it along the way.

    Reads ``model``, ``learning_rate``, ``epochs``, ``trainloader``,
    ``validloader`` and ``device`` from module scope. After every epoch the
    model is evaluated with ``valid(validloader)``. Checkpoints written:

    * ``'epoch50.pt'``        - state after the 50th epoch,
    * ``'max_acc model.pt'``  - state with the best validation accuracy so far,
    * ``'final model.pt'``    - state after the last epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.001)
    criterion = nn.CrossEntropyLoss()
    # Best validation accuracy seen across ALL epochs. It must be initialised
    # once, outside the epoch loop; resetting it every epoch would overwrite
    # the "best" checkpoint whenever the accuracy is merely non-zero.
    max_acc = 0

    for epoch in range(epochs):
        model.train()
        print('epoch:{}'.format(epoch + 1))
        for X_train, y_train in trainloader:
            optimizer.zero_grad()
            X_train = X_train.long().to(device)
            y_train = y_train.long().to(device)
            output = model(X_train)
            loss = criterion(output, y_train)
            loss.backward()
            optimizer.step()
        # Reports the loss of the last batch of the epoch only.
        print('loss:{:3f}'.format(loss))
        model.eval()
        acc = valid(validloader)
        print('epoch:{} acc:{}'.format(epoch+1, acc))
        if epoch + 1 == 50:
            torch.save(model.state_dict(), 'epoch50.pt')
        if acc > max_acc:
            max_acc = acc
            torch.save(model.state_dict(), 'max_acc model.pt')
    torch.save(model.state_dict(), 'final model.pt')

def valid(dataloader):
    """Return the accuracy of the module-level ``model`` on ``dataloader``.

    Args:
        dataloader: iterable of ``(inputs, labels)`` batches; both are cast
            to ``long`` and moved to the module-level ``device``.

    Returns:
        Fraction of correctly classified samples, or ``0.0`` when the
        dataloader yields no samples.
    """
    correct = 0
    total = 0
    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.long().to(device)
            y_batch = y_batch.long().to(device)
            output = model(X_batch)
            # The predicted class is the index of the largest logit per sample.
            correct += (torch.argmax(output, dim=1) == y_batch).sum().item()
            total += y_batch.shape[0]
    if total == 0:
        # Avoid ZeroDivisionError on an empty dataloader.
        return 0.0
    return correct / total

在上面的代码中,我创建了一个验证集(devset),用于在训练过程中评估模型。但是在训练了 4 个或更多个 epoch 之后,出现了以下错误:

Traceback (most recent call last):
  File "c:\Users\hhhh\Desktop\NLP-beginner\task2\task2.py", line 287, in <module>
    train()
  File "c:\Users\hhhh\Desktop\NLP-beginner\task2\task2.py", line 185, in train
    acc = valid(validloader)
  File "c:\Users\hhhh\Desktop\NLP-beginner\task2\task2.py", line 207, in valid
    correct += (torch.argmax(output, dim = 1) == y_train).sum().item()
RuntimeError: CUDA error: unspecified launch failure

我尝试切换到 CPU 设备来训练模型,但训练速度非常慢,连 1 个 epoch 都要跑很久。是因为我的计算机配置不足以运行它吗?

1 个答案:

答案 0 :(得分:1)

要检查您的系统是否具有CUDA:

from torch.cuda import is_available


def main():
    """Build an ``RNN`` on CUDA when allowed and available, otherwise on CPU.

    CUDA is selected only if ``args.no_cuda`` is falsy and
    ``torch.cuda.is_available()`` reports true.
    NOTE(review): ``args``, ``device`` and ``RNN`` are not defined in this
    snippet; presumably ``device`` is ``torch.device`` and ``args`` is a
    parsed command line - confirm against the full script.
    """
    if args.no_cuda or not is_available():
        target = "cpu"
    else:
        target = "cuda"
    dev = device(target)
    model = RNN().to(device=dev)
    # Call train and test methods below


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()