I am trying to train my model on a GPU.
When I run it, I get the error below:
File "main.py", line 95, in train loss.backward()
File "/opt/conda/lib/python3.7/site-packages/torch/tensor.py", line 198, in backward torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/__init__.py", line 100, in backward allow_unreachable=True) # allow_unreachable flag
RuntimeError: transform: failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered
# Added indentation for better readability
This error occurs whether I use a single GPU or multiple GPUs.
How can I deal with this problem? Thanks.
Here is the code of the BERT model (no segment input, no pre-training); cf. the problem does not occur when I use other models.
"""code"""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
class HateClassification(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.bert = BERT(self.config)
        # classifier
        self.projection_cls = nn.Linear(self.config.d_hidn, self.config.n_output, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        inputs = torch.transpose(inputs, 0, 1)
        # (bs, n_enc_seq, d_hidn), (bs, d_hidn), [(bs, n_head, n_enc_seq, n_enc_seq)]
        outputs, outputs_cls, attn_probs = self.bert(inputs)
        # (bs, n_output)
        logits_cls = self.sigmoid(self.projection_cls(outputs_cls))
        # (bs, n_output)
        return logits_cls.squeeze()
""" bert """
class BERT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.encoder = Encoder(self.config)
        self.linear = nn.Linear(config.d_hidn, config.d_hidn)
        self.activation = torch.tanh

    def forward(self, inputs):
        # (bs, n_seq, d_hidn), [(bs, n_head, n_enc_seq, n_enc_seq)]
        outputs, self_attn_probs = self.encoder(inputs)
        # (bs, d_hidn)
        outputs_cls = outputs[:, 0].contiguous()
        outputs_cls = self.linear(outputs_cls)
        outputs_cls = self.activation(outputs_cls)
        # (bs, n_enc_seq, d_hidn), (bs, d_hidn), [(bs, n_head, n_enc_seq, n_enc_seq)]
        return outputs, outputs_cls, self_attn_probs

    def save(self, epoch, loss, path):
        torch.save({
            "epoch": epoch,
            "loss": loss,
            "state_dict": self.state_dict()
        }, path)

    def load(self, path):
        save = torch.load(path)
        self.load_state_dict(save["state_dict"])
        return save["epoch"], save["loss"]
""" encoder """
class Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.enc_emb = nn.Embedding(self.config.n_enc_vocab, self.config.d_hidn)
        self.pos_emb = nn.Embedding(self.config.n_enc_seq + 1, self.config.d_hidn)
        self.layers = nn.ModuleList([EncoderLayer(self.config) for _ in range(self.config.n_layer)])

    def forward(self, inputs):
        positions = torch.arange(inputs.size(1), device=inputs.device, dtype=inputs.dtype).expand(inputs.size(0), inputs.size(1)).contiguous() + 1
        pos_mask = inputs.eq(self.config.i_pad)
        positions.masked_fill_(pos_mask, 0)
        # (bs, n_enc_seq, d_hidn)
        outputs = self.enc_emb(inputs) + self.pos_emb(positions)
        # (bs, n_enc_seq, n_enc_seq)
        attn_mask = get_attn_pad_mask(inputs, inputs, self.config.i_pad)
        attn_probs = []
        for layer in self.layers:
            # (bs, n_enc_seq, d_hidn), (bs, n_head, n_enc_seq, n_enc_seq)
            outputs, attn_prob = layer(outputs, attn_mask)
            attn_probs.append(attn_prob)
        # (bs, n_enc_seq, d_hidn), [(bs, n_head, n_enc_seq, n_enc_seq)]
        return outputs, attn_probs
""" encoder layer """
class EncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.self_attn = MultiHeadAttention(self.config)
        self.layer_norm1 = nn.LayerNorm(self.config.d_hidn, eps=self.config.layer_norm_epsilon)
        self.pos_ffn = PoswiseFeedForwardNet(self.config)
        self.layer_norm2 = nn.LayerNorm(self.config.d_hidn, eps=self.config.layer_norm_epsilon)

    def forward(self, inputs, attn_mask):
        # (bs, n_enc_seq, d_hidn), (bs, n_head, n_enc_seq, n_enc_seq)
        att_outputs, attn_prob = self.self_attn(inputs, inputs, inputs, attn_mask)
        att_outputs = self.layer_norm1(inputs + att_outputs)
        # (bs, n_enc_seq, d_hidn)
        ffn_outputs = self.pos_ffn(att_outputs)
        ffn_outputs = self.layer_norm2(ffn_outputs + att_outputs)
        # (bs, n_enc_seq, d_hidn), (bs, n_head, n_enc_seq, n_enc_seq)
        return ffn_outputs, attn_prob
""" attention pad mask """
def get_attn_pad_mask(seq_q, seq_k, i_pad):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    pad_attn_mask = seq_k.data.eq(i_pad)
    pad_attn_mask = pad_attn_mask.unsqueeze(1).expand(batch_size, len_q, len_k)
    return pad_attn_mask
""" multi head attention """
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.W_Q = nn.Linear(self.config.d_hidn, self.config.n_head * self.config.d_head)
        self.W_K = nn.Linear(self.config.d_hidn, self.config.n_head * self.config.d_head)
        self.W_V = nn.Linear(self.config.d_hidn, self.config.n_head * self.config.d_head)
        self.scaled_dot_attn = ScaledDotProductAttention(self.config)
        self.linear = nn.Linear(self.config.n_head * self.config.d_head, self.config.d_hidn)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, Q, K, V, attn_mask):
        batch_size = Q.size(0)
        # (bs, n_head, n_q_seq, d_head)
        q_s = self.W_Q(Q).view(batch_size, -1, self.config.n_head, self.config.d_head).transpose(1, 2)
        # (bs, n_head, n_k_seq, d_head)
        k_s = self.W_K(K).view(batch_size, -1, self.config.n_head, self.config.d_head).transpose(1, 2)
        # (bs, n_head, n_v_seq, d_head)
        v_s = self.W_V(V).view(batch_size, -1, self.config.n_head, self.config.d_head).transpose(1, 2)
        # (bs, n_head, n_q_seq, n_k_seq)
        attn_mask = attn_mask.unsqueeze(1).repeat(1, self.config.n_head, 1, 1)
        # (bs, n_head, n_q_seq, d_head), (bs, n_head, n_q_seq, n_k_seq)
        context, attn_prob = self.scaled_dot_attn(q_s, k_s, v_s, attn_mask)
        # (bs, n_q_seq, n_head * d_head)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.config.n_head * self.config.d_head)
        # (bs, n_q_seq, d_hidn)
        output = self.linear(context)
        output = self.dropout(output)
        # (bs, n_q_seq, d_hidn), (bs, n_head, n_q_seq, n_k_seq)
        return output, attn_prob
""" feed forward """
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.conv1 = nn.Conv1d(in_channels=self.config.d_hidn, out_channels=self.config.d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=self.config.d_ff, out_channels=self.config.d_hidn, kernel_size=1)
        self.active = F.gelu
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, inputs):
        # (bs, d_ff, n_seq)
        output = self.conv1(inputs.transpose(1, 2))
        output = self.active(output)
        # (bs, n_seq, d_hidn)
        output = self.conv2(output).transpose(1, 2)
        output = self.dropout(output)
        # (bs, n_seq, d_hidn)
        return output
""" scale dot product attention """
class ScaledDotProductAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.dropout)
        self.scale = 1 / (self.config.d_head ** 0.5)

    def forward(self, Q, K, V, attn_mask):
        # (bs, n_head, n_q_seq, n_k_seq)
        scores = torch.matmul(Q, K.transpose(-1, -2))
        scores = scores.mul_(self.scale)
        scores.masked_fill_(attn_mask, -1e9)
        # (bs, n_head, n_q_seq, n_k_seq)
        attn_prob = nn.Softmax(dim=-1)(scores)
        attn_prob = self.dropout(attn_prob)
        # (bs, n_head, n_q_seq, d_v)
        context = torch.matmul(attn_prob, V)
        # (bs, n_head, n_q_seq, d_v), (bs, n_head, n_q_seq, n_v_seq)
        return context, attn_prob
Answer 0 (score: 0)
In the line below, what is your inputs.device?
positions = torch.arange(inputs.size(1), device=inputs.device, dtype=inputs.dtype).expand(inputs.size(0), inputs.size(1)).contiguous() + 1
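If that device does not match the model's device, or if any index fed to an embedding is out of range, the kernel that eventually fails can be far from the real culprit. As a debugging sketch (the checks below are my own additions, imagined inside Encoder.forward just before the embedding lookup, and are not part of your code), you could print the device and verify that every token and position index fits inside its embedding table; an out-of-range index into nn.Embedding on the GPU is a classic source of cudaErrorIllegalAddress that only surfaces later, for example in loss.backward():

# Hypothetical sanity checks inside Encoder.forward, right before self.enc_emb(inputs)
print("inputs.device:", inputs.device, "inputs.dtype:", inputs.dtype)
# enc_emb has n_enc_vocab rows and pos_emb has n_enc_seq + 1 rows, so indices must stay below those sizes
assert 0 <= inputs.min().item() and inputs.max().item() < self.config.n_enc_vocab, \
    "token id out of range for enc_emb"
assert positions.max().item() <= self.config.n_enc_seq, \
    "position index out of range for pos_emb (input longer than n_enc_seq?)"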
You can set a specific GPU and try:
torch.cuda.set_device(device_num)
torch.cuda.set_device(0)  # for GPU id '0'
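If you want to pin both the model and the data to one device explicitly, a minimal sketch could look like this (config, loader and the label names are placeholders standing in for whatever main.py actually uses, not code from your post):

device = torch.device("cuda:0")                  # pick GPU 0 explicitly
model = HateClassification(config).to(device)    # `config` stands in for your config object
for inputs, labels in loader:                    # `loader` stands in for your DataLoader
    inputs = inputs.to(device)                   # keep the batch on the same device as the model
    labels = labels.to(device)
    logits = model(inputs)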
I would also suggest running your code with the CUDA_LAUNCH_BLOCKING=1 environment variable. CUDA_LAUNCH_BLOCKING makes CUDA report the error at the point where it actually occurs.
CUDA_LAUNCH_BLOCKING=1 python your_code.py
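Without that variable, CUDA kernels run asynchronously, so the stack trace points at whatever call happened to synchronize (here loss.backward()) rather than the kernel that crashed. If changing the launch command is inconvenient, a sketch like the following sets the variable from inside the script instead (it must run before anything touches the GPU; this is my suggestion, not code from your post). Running one batch entirely on the CPU is another way to turn the illegal access into a readable Python error:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"   # must be set before any CUDA work is done
import torch                               # import torch only after setting the variable
# Alternatively: model.cpu() plus CPU tensors for one batch usually turns a bad
# embedding index into a plain IndexError that points at the offending line.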