我的目标是训练跨度预测模型
可以预测BERT输出序列中的位置
我输入的形状是(batch_size,max_sequence_len(512),embeddding_size(768))
输出的形状将为(batch_size,max_sequence_len,1),第三个暗角代表某种可能性,然后将输出形状调整为(batch_size,max_sequence_len)
我的标签的形状为(batch_size,max_sequence_len),在max_sequence_len(512)中,只有一个位置将为1,其他位置将为零
我已经检查过了
(batch_size is 2)
start_pos_labels.sum(dim=1)
output >>
tensor([1.0000, 1.0000], device='cuda:0', dtype=torch.float64)
start_pred.sum(dim=1)
tensor([1., 1.], device='cuda:0', dtype=torch.float64, grad_fn=<SumBackward1>)
但是当我使用nn.KLDivLoss()时,输出仍然为负,我真的不知道为什么
有人可以帮我吗?谢谢!
这是我的代码 型号代码
class posClassfication_new(nn.Module):
def __init__(self):
super(posClassfication_new, self).__init__()
self.start_task = nn.Sequential(
nn.Linear(768, 1),
# nn.ReLU(),
# nn.Linear(256, 128),
# nn.ReLU(),
# nn.Linear(128, 1)
)
self.end_task = nn.Sequential(
nn.Linear(768, 1),
# nn.ReLU(),
# nn.Linear(256, 128),
# nn.ReLU(),
# nn.Linear(128, 1)
)
#
def forward(self, start_x,end_x):
start_x = start_x.double()
end_x = end_x.double()
start_out = self.start_task(start_x)
end_out = self.end_task(end_x)
return start_out,end_out
培训代码
BATCH_SIZE = 8
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
class PosTrainDataset(Dataset):
def __init__(self, x, start_y,end_y):
self.x = x
self.start_y = start_y
self.end_y = end_y
def __getitem__(self,idx):
x = self.x[idx]
start_y = self.start_y[idx]
end_y = self.end_y[idx]
return x, start_y, end_y
def __len__(self):
return len(self.x)
trainset = PosTrainDataset(pos_train_x , start_pos_labels_train , end_pos_labels_train)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE)
pos_model = posClassfication_new()
pos_model = pos_model.to(device)
pos_model = pos_model.double()
pos_model.train()
pos_loss = nn.KLDivLoss()
# pos_loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(pos_model.parameters(), lr=1e-5)
EPOCHS = 5
for epoch in range(EPOCHS):
running_loss = 0.0
for data in trainloader:
x, start_pos_labels, end_pos_labels = [t.to(device) for t in data]
mini_batch = x.size()[0]
optimizer.zero_grad()
start_pred , end_pred = pos_model(x,x)
start_pred = start_pred.reshape((mini_batch,512))
end_pred = end_pred.reshape((mini_batch,512))
start_pred = torch.nn.functional.softmax(start_pred,dim=1)
end_pred = torch.nn.functional.softmax(end_pred,dim=1)
start_pos_labels = start_pos_labels + 0.0001
start_pos_labels = torch.nn.functional.softmax(start_pos_labels,dim=1)
end_pos_labels = end_pos_labels + 0.0001
end_pos_labels = torch.nn.functional.softmax(end_pos_labels,dim=1)
# start_pos_labels = torch.argmax(start_pos_labels,dim=1)
# end_pos_labels = torch.argmax(end_pos_labels,dim=1)
start_loss = pos_loss(start_pred,start_pos_labels)
end_loss = pos_loss(end_pred,end_pos_labels)
loss = start_loss + end_loss
loss.backward()
optimizer.step()
running_loss += loss.item()
torch.save(pos_model,'pos_model_single_task.pkl')
print('[epoch %d] loss: %.3f' %(epoch + 1, running_loss))
答案 0 :(得分:0)
nn.KLDivLoss
期望输入为对数概率。
从文档中:
与
NLLLoss
一样,给定的输入应包含 log-概率,并且不限于2D张量。目标以概率的形式给出(即,不取对数)。
您可以将nn.functional.log_softmax
应用于您的预测以获得对数概率。
start_pred = torch.nn.functional.log_softmax(start_pred,dim=1)
end_pred = torch.nn.functional.log_softmax(end_pred,dim=1)