After a couple of epochs of training my LSTM model, I get the error

THCudaCheck FAIL file=..\aten\src\THC\THCCachingHostAllocator.cpp line=278 error=719 : unspecified launch failure

The stack trace points to the line out, hidden = self.rnn(x, hidden) in the forward function as the cause.

Here is my network model:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import numpy as np
import pandas as pd
from time import time


class SignalNet(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, num_layers):
        super(SignalNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.rnn = nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(x, hidden)
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        return (torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device),
                torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device))


def main():
    global device
    is_cuda = torch.cuda.is_available()
    if is_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    input_dim = 8
    batch_size = 1024  # was 32
    output_dim = 1
    num_layers = 5
    hidden_dim = 10
    learning_rate = 0.1
    num_epochs = 5

    model = SignalNet(input_size=input_dim, output_size=output_dim, hidden_dim=hidden_dim, num_layers=num_layers)
    model.to(device)

    loss_fn = torch.nn.MSELoss(reduction='sum')
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    df = pd.read_csv('data\\emg2.csv', sep=',')

    split_frac = 0.8  # 80% train, 20% test
    split_id = int(split_frac * len(df))
    train_data, train_labels = df.iloc[:split_id, :-1], df.iloc[:split_id, -1]
    test_data, test_labels = df.iloc[split_id:len(df) * 9 // 10, :-1], df.iloc[split_id:len(df) * 9 // 10, -1]
    val_data, val_labels = df.iloc[len(df) * 9 // 10:, :-1], df.iloc[len(df) * 9 // 10:, -1]

    # LSTM starts HERE
    train_dataset = TensorDataset(torch.from_numpy(train_data.values).float(),
                                  torch.from_numpy(train_labels.values).float())
    val_dataset = TensorDataset(torch.from_numpy(val_data.values).float(),
                                torch.from_numpy(val_labels.values).float())
    test_dataset = TensorDataset(torch.from_numpy(test_data.values).float(),
                                 torch.from_numpy(test_labels.values).float())

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=6)
    val_loader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size, num_workers=6)
    test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, num_workers=6)

    # Training Run
    model.train()
    for epoch in range(1, num_epochs + 1):
        epoch_loss = 0
        index = 0
        for batch_idx, (X_train, y_train) in enumerate(train_loader):
            optimizer.zero_grad()  # Clears existing gradients from previous epoch
            X_train, y_train = X_train.to(device), y_train.to(device)
            # model.init_hidden(batch_size)
            output, hidden = model(X_train[np.newaxis, ...])
            # loss = loss_fn(output, y_train.view(-1).long())
            loss = loss_fn(output.view(-1), y_train.view(-1))
            epoch_loss += loss.item()
            loss.backward()  # Does backpropagation and calculates gradients
            optimizer.step()  # Updates the weights accordingly
            if batch_idx % 50 == 0:
                print(f'Epoch: {epoch}/{num_epochs} Batch #{batch_idx + 1}/{len(train_loader)}.............', end=' ')
                print("Loss: {:.4f}".format(loss.item()))
            index = batch_idx
        print(f'Epoch #{epoch}: Avg. loss: {epoch_loss / (index + 1)}')
    # TEST
    test_losses = []
    num_correct = 0
    h = model.init_hidden(batch_size)

    model.eval()
    for inputs, labels in test_loader:
        h = tuple([each.data for each in h])
        inputs, labels = inputs.to(device), labels.to(device)
        output, h = model(inputs[np.newaxis, ...])
        test_loss = loss_fn(output.squeeze(), labels.float())
        test_losses.append(test_loss.item())

        pred = torch.round(output.squeeze())  # Rounds the output to 0/1
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)

    print("Test loss: {:.3f}".format(np.mean(test_losses)))
    test_acc = num_correct / len(test_loader.dataset)
    print("Test accuracy: {:.3f}%".format(test_acc * 100))


if __name__ == '__main__':
    main()
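For reference, here is a minimal sketch of the call that fails, reduced to just the tensor shapes involved (this is my own summary, assuming the hyperparameters from main() above; the tensors are random stand-ins for my data):

import torch

# Stand-in for X_train[np.newaxis, ...]: shape (1, batch_size, input_dim) = (1, 1024, 8),
# so with batch_first=True the LSTM sees batch=1 and seq_len=1024.
x = torch.randn(1, 1024, 8, device='cuda')
# init_hidden(x.size(0)) builds (num_layers, batch, hidden_dim) = (5, 1, 10) states.
h0 = torch.zeros(5, 1, 10, device='cuda')
c0 = torch.zeros(5, 1, 10, device='cuda')

rnn = torch.nn.LSTM(input_size=8, hidden_size=10, num_layers=5, batch_first=True).cuda()
out, (hn, cn) = rnn(x, (h0, c0))  # the call that eventually fails in my training loop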
Here is the error stack trace:
Epoch: 1/5 Batch #1/235............. Loss: 5569.8594
Epoch: 1/5 Batch #51/235............. Loss: 3521.5000
Epoch: 1/5 Batch #101/235............. Loss: 3525.8384
Epoch: 1/5 Batch #151/235............. Loss: 3854.3398
Epoch: 1/5 Batch #201/235............. Loss: 3402.1775
Epoch #1: Avg. loss: 3645.545551528279
Epoch: 2/5 Batch #1/235............. Loss: 3649.2971
Traceback (most recent call last):
File "C:\Program Files\JetBrains\PyCharm 2018.1.2\helpers\pydev\pydevd.py", line 1415, in _exec
pydev_imports.execfile(file, globals, locals) # execute the script
File "C:\Program Files\JetBrains\PyCharm 2018.1.2\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "D:/PyCharmProjects/signal_processing/run_5.py", line 149, in <module>
main()
File "D:/PyCharmProjects/signal_processing/run_5.py", line 111, in main
output, hidden = model(X_train[np.newaxis, ...])
File "D:\.virtualenvs\signal_processing-0P6HK7Nb\lib\site-packages\torch\nn\modules\module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File "D:/PyCharmProjects/signal_processing/run_5.py", line 32, in forward
out, hidden = self.rnn(x, hidden)
File "D:\.virtualenvs\signal_processing-0P6HK7Nb\lib\site-packages\torch\nn\modules\module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File "D:\.virtualenvs\signal_processing-0P6HK7Nb\lib\site-packages\torch\nn\modules\rnn.py", line 564, in forward
return self.forward_tensor(input, hx)
File "D:\.virtualenvs\signal_processing-0P6HK7Nb\lib\site-packages\torch\nn\modules\rnn.py", line 543, in forward_tensor
output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
File "D:\.virtualenvs\signal_processing-0P6HK7Nb\lib\site-packages\torch\nn\modules\rnn.py", line 526, in forward_impl
self.dropout, self.training, self.bidirectional, self.batch_first)
RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
THCudaCheck FAIL file=..\aten\src\THC\THCCachingHostAllocator.cpp line=278 error=719 : unspecified launch failure