PyTorch - THCudaCheck FAIL file=..\aten\src\THC\THCCachingHostAllocator.cpp line=278 error=719 : unspecified launch failure after a few runs?

Time: 2019-11-14 20:26:31

Tags: pytorch lstm torch cudnn

After running my LSTM model twice for a few epochs each, I get the error THCudaCheck FAIL file=..\aten\src\THC\THCCachingHostAllocator.cpp line=278 error=719 : unspecified launch failure. The stack trace points at the line out, hidden = self.rnn(x, hidden) in the forward function as the cause of the error.
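
Because CUDA reports kernel errors asynchronously, the line the traceback blames is not necessarily where the failure actually happened. A minimal first diagnostic (a general CUDA debugging technique, not a confirmed fix for this error) is to force synchronous launches before torch initializes CUDA, so the trace lands on the real failing call:

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # must be set before CUDA is initialized

import torch  # import torch only after setting the environment variable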

Here is my network model:

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import numpy as np
import pandas as pd
from time import time


class SignalNet(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, num_layers):
        super(SignalNet, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.rnn = nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        batch_size = x.size(0)

        hidden = self.init_hidden(batch_size)

        out, hidden = self.rnn(x, hidden)

        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        # note: `device` is the module-level global set in main()
        return (torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device),
                torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device))


def main():
    global device  # module-level global; SignalNet.init_hidden reads it directly
    is_cuda = torch.cuda.is_available()

    if is_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    input_dim = 8
    batch_size = 1024  # was 32
    output_dim = 1
    num_layers = 5
    hidden_dim = 10
    learning_rate = 0.1
    num_epochs = 5

    model = SignalNet(input_size=input_dim, output_size=output_dim, hidden_dim=hidden_dim, num_layers=num_layers)
    model.to(device)

    loss_fn = torch.nn.MSELoss(reduction='sum')
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    df = pd.read_csv('data\\emg2.csv', sep=',')

    split_frac = 0.8  # 80% train, 20% test
    split_id = int(split_frac * len(df))

    train_data, train_labels = df.iloc[:split_id, :-1], df.iloc[:split_id, -1]
    test_data, test_labels = df.iloc[split_id:len(df) * 9 // 10, :-1], df.iloc[split_id:len(df) * 9 // 10, -1]
    val_data, val_labels = df.iloc[len(df) * 9 // 10:, :-1], df.iloc[len(df) * 9 // 10:, -1]

    # LSTM starts HERE
    train_dataset = TensorDataset(torch.from_numpy(train_data.values).float(),
                                  torch.from_numpy(train_labels.values).float())
    val_dataset = TensorDataset(torch.from_numpy(val_data.values).float(),
                                torch.from_numpy(val_labels.values).float())
    test_dataset = TensorDataset(torch.from_numpy(test_data.values).float(),
                                 torch.from_numpy(test_labels.values).float())

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=6)
    val_loader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size, num_workers=6)
    test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, num_workers=6)

    # Training Run
    model.train()
    for epoch in range(1, num_epochs + 1):
        epoch_loss = 0
        index = 0
        for batch_idx, (X_train, y_train) in enumerate(train_loader):
            optimizer.zero_grad()  # Clears gradients accumulated in the previous step
            X_train, y_train = X_train.to(device), y_train.to(device)
            # model.init_hidden(batch_size)
            # leading axis -> shape (1, batch_size, 8); with batch_first=True this is
            # a single sample whose sequence length equals the batch size
            output, hidden = model(X_train[np.newaxis, ...])
            # loss = loss_fn(output, y_train.view(-1).long())
            loss = loss_fn(output.view(-1), y_train.view(-1))
            epoch_loss += loss.item()
            loss.backward()  # Does backpropagation and calculates gradients
            optimizer.step()  # Updates the weights accordingly

            if batch_idx % 50 == 0:
                print(f'Epoch: {epoch}/{num_epochs} Batch #{batch_idx + 1}/{len(train_loader)}.............', end=' ')
                print("Loss: {:.4f}".format(loss.item()))
            index = batch_idx

        print(f'Epoch #{epoch}: Avg. loss: {epoch_loss / (index + 1)}')  # index + 1 == number of batches

    # TEST
    test_losses = []
    num_correct = 0
    h = model.init_hidden(batch_size)  # note: h is never actually passed to the model below

    model.eval()
    for inputs, labels in test_loader:
        h = tuple([each.data for each in h])  # detaches h, but forward() builds its own fresh state anyway
        inputs, labels = inputs.to(device), labels.to(device)
        output, h = model(inputs[np.newaxis, ...])
        test_loss = loss_fn(output.squeeze(), labels.float())
        test_losses.append(test_loss.item())
        pred = torch.round(output.squeeze())  # Rounds the output to 0/1
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)

    print("Test loss: {:.3f}".format(np.mean(test_losses)))
    test_acc = num_correct / len(test_loader.dataset)
    print("Test accuracy: {:.3f}%".format(test_acc * 100))


if __name__ == '__main__':
    main()
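
As an aside on the test loop: h is initialized and detached on every batch but never handed to the model, since forward() always builds a fresh hidden state for the incoming batch. A hypothetical variant of forward() that would let the caller thread the state through looks like this (a sketch of the apparent intent, not a verified fix for the crash):

    def forward(self, x, hidden=None):
        # reuse the caller's state if given, otherwise start from zeros
        if hidden is None:
            hidden = self.init_hidden(x.size(0))
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out.contiguous().view(-1, self.hidden_dim))
        return out, hidden

With that signature, the test loop could call output, h = model(inputs[np.newaxis, ...], h) after detaching h between batches.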

Here is the stack trace of the error:

Epoch: 1/5 Batch #1/235............. Loss: 5569.8594
Epoch: 1/5 Batch #51/235............. Loss: 3521.5000
Epoch: 1/5 Batch #101/235............. Loss: 3525.8384
Epoch: 1/5 Batch #151/235............. Loss: 3854.3398
Epoch: 1/5 Batch #201/235............. Loss: 3402.1775
Epoch #1: Avg. loss: 3645.545551528279
Epoch: 2/5 Batch #1/235............. Loss: 3649.2971
Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm 2018.1.2\helpers\pydev\pydevd.py", line 1415, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm 2018.1.2\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "D:/PyCharmProjects/signal_processing/run_5.py", line 149, in <module>
    main()
  File "D:/PyCharmProjects/signal_processing/run_5.py", line 111, in main
    output, hidden = model(X_train[np.newaxis, ...])
  File "D:\.virtualenvs\signal_processing-0P6HK7Nb\lib\site-packages\torch\nn\modules\module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "D:/PyCharmProjects/signal_processing/run_5.py", line 32, in forward
    out, hidden = self.rnn(x, hidden)
  File "D:\.virtualenvs\signal_processing-0P6HK7Nb\lib\site-packages\torch\nn\modules\module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "D:\.virtualenvs\signal_processing-0P6HK7Nb\lib\site-packages\torch\nn\modules\rnn.py", line 564, in forward
    return self.forward_tensor(input, hx)
  File "D:\.virtualenvs\signal_processing-0P6HK7Nb\lib\site-packages\torch\nn\modules\rnn.py", line 543, in forward_tensor
    output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
  File "D:\.virtualenvs\signal_processing-0P6HK7Nb\lib\site-packages\torch\nn\modules\rnn.py", line 526, in forward_impl
    self.dropout, self.training, self.bidirectional, self.batch_first)
RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
THCudaCheck FAIL file=..\aten\src\THC\THCCachingHostAllocator.cpp line=278 error=719 : unspecified launch failure
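
Since the failure surfaces inside the cuDNN LSTM path (rnn.py raising CUDNN_STATUS_INTERNAL_ERROR), one way to narrow it down (a standard isolation step, not a guaranteed fix) is to disable cuDNN and rerun on PyTorch's native kernels:

import torch
torch.backends.cudnn.enabled = False  # fall back to the non-cuDNN LSTM implementation (slower)

If the error disappears, the problem is specific to cuDNN (for example a driver or version mismatch); if it persists, it points at the wider CUDA setup rather than cuDNN itself.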

0 Answers:

No answers