BERT fine-tuning: high loss and low accuracy in multi-class classification

Date: 2020-07-02 20:26:57

Tags: pytorch huggingface-transformers pytorch-lightning

Binary classification with a fine-tuned BERT works well for me, but I am stuck on multi-class classification. My dataset (German news articles in 10 classes) contains about 10,000 samples, yet the training loss and the average validation loss both stay at around 2.2. (For reference, a uniform prediction over 10 classes gives a cross-entropy of ln(10) ≈ 2.30, so the model seems to be barely better than chance.)

Some NLP configuration variables:

# imports used throughout the snippets below
import torch as th
import transformers
import pytorch_lightning as pl

DEBUG = True
VERSION = 1
MAX_LEN = 200  # maximum sequence length, set according to the length diagram (not shown here)
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-4
MOMENTUM = 0.9
TRAIN_SIZE = 0.7
NUM_LABELS = len(df_data.Labels.unique())
MODEL_NAME = "dbmdz/bert-base-german-cased"
params = {
    "debug": DEBUG, "max_len": MAX_LEN, "batch_size": BATCH_SIZE,
    "epochs": EPOCHS, "lr": LEARNING_RATE, "momentum": MOMENTUM,
    "model": MODEL_NAME, "loss": "CrossEntropyLoss", "optimizer": "AdamW",
}
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
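
As a quick sanity check of these tokenizer settings, here is an illustrative snippet of my own (assuming transformers >= 3.0, where the tokenizer object is callable, as it is in the Dataset below):

enc = tokenizer("Ein kurzer Beispielsatz.", max_length=MAX_LEN,
                truncation=True, padding='max_length')
print(len(enc['input_ids']))  # 200: every sample is padded/truncated to MAX_LEN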

Dataset (wrapped by the DataLoaders further down):

class NLPDataset(th.utils.data.Dataset):

  def __init__(self, dataframe, tokenizer, max_len):
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.data = dataframe
    self.text = dataframe.Text
    self.targets = dataframe.Labels
    self.len = len(self.text)

  def __getitem__(self,idx):
    text = str(self.text[idx])
    text = " ".join(text.split())

    inputs = self.tokenizer(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        truncation=True,
        padding='max_length',   # replaces the deprecated pad_to_max_length=True
        return_token_type_ids=True,
    )
    input_ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]

    return {
        'input_ids': th.tensor(input_ids, dtype=th.long),
        'mask': th.tensor(mask, dtype=th.float),
        'token_type_ids': th.tensor(token_type_ids, dtype=th.long),
        'targets': th.tensor(self.targets[idx], dtype=th.long)
    }
  
  def __len__(self):
    return self.len
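
To double-check what the Dataset returns, a tiny standalone example (df_demo is made up for illustration):

import pandas as pd

df_demo = pd.DataFrame({"Text": ["Ein kurzer Testsatz.", "Noch ein Beispiel."],
                        "Labels": [0, 3]})
demo_ds = NLPDataset(df_demo, tokenizer, MAX_LEN)
sample = demo_ds[0]
print(sample['input_ids'].shape)  # torch.Size([200]), padded to MAX_LEN
print(sample['targets'])          # tensor(0), a class index, as CrossEntropyLoss expects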

PyTorch Lightning module:

class NLPClassifier(pl.LightningModule):

  def __init__(self):
      super().__init__()

      # change the configuration to 10 labels instead of 2
      config = transformers.AutoConfig.from_pretrained(MODEL_NAME)
      config.num_labels = NUM_LABELS
      # note: from_config builds the architecture with randomly initialized weights;
      # it does not load the pretrained checkpoint
      self.model = transformers.AutoModelForSequenceClassification.from_config(config)
      self.loss = th.nn.CrossEntropyLoss(reduction="none")

  def prepare_data(self):
      # train/val split
      train_dataset = df_data.sample(frac=TRAIN_SIZE)
      val_dataset=df_data.drop(train_dataset.index).sample(frac=1).reset_index(drop=True)
      train_dataset = train_dataset.reset_index(drop=True)

      # Assign CustomDataset Class
      train_set = NLPDataset(train_dataset, tokenizer, MAX_LEN)
      val_set = NLPDataset(val_dataset, tokenizer, MAX_LEN)

      print("FULL Dataset: {}".format(df_data.shape))
      print("TRAIN Dataset: {}".format(train_dataset.shape))
      print("VAL Dataset: {}".format(val_dataset.shape))

      # assign to use in dataloaders
      self.train_ds = train_set
      self.val_ds = val_set
      #self.test_dataset = mnist_test  # TODO
  
  def forward(self, input_ids, mask):
      # the model returns a tuple, so the trailing comma unpacks the logits
      logits, = self.model(input_ids, mask)
      # logits.shape: (batch_size, num_labels), e.g. (16, 10)
      return logits
  
  def training_step(self, batch, batch_idx):
    logits = self.forward(batch['input_ids'], batch['mask']).squeeze()
    loss = self.loss(logits, batch['targets']).mean()
    run.log(name='train_loss', value=loss.tolist())
    return {'loss': loss, 'log': {'train_loss': loss}}
  
  def validation_step(self,batch, batch_idx):
    logits = self.forward(batch['input_ids'], batch['mask']).squeeze()
    print(logits.shape)
    acc = (logits.argmax(-1) == batch['targets']).float()
    loss = self.loss(logits, batch['targets'])
    run.log_list('loss', loss.tolist())
    run.log_list('acc', acc.tolist())
    return {'loss': loss, 'acc': acc}

  def validation_epoch_end(self, outputs):
    loss = th.cat([o['loss'] for o in outputs], 0).mean()
    acc = th.cat([o['acc'] for o in outputs], 0).mean()
    out = {'val_loss': loss, 'val_acc': acc}
    run.log('val_loss', loss.tolist())
    run.log('val_acc', acc.tolist())
    return {**out, 'log': {'val_loss': loss, 'val_acc': acc}}
    
  def train_dataloader(self):
      return th.utils.data.DataLoader(
          self.train_ds,
          batch_size=BATCH_SIZE,
          num_workers=8,
          drop_last=True,
          shuffle=False,  # note: the training data is not shuffled
      )
  
  def val_dataloader(self):
      return th.utils.data.DataLoader(
          self.val_ds,
          batch_size=BATCH_SIZE,
          num_workers=8,
          drop_last=False,
          shuffle=False,
      )

  
  def configure_optimizers(self):
      return transformers.AdamW(
            self.model.parameters(),
            lr=LEARNING_RATE,
            #momentum=MOMENTUM,
        )
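
To convince myself that the optimizer updates weights at all (relevant to question 2 below), here is a standalone toy check, unrelated to the actual BERT model:

# does transformers.AdamW change the parameters after one step?
toy = th.nn.Linear(4, 2)
opt = transformers.AdamW(toy.parameters(), lr=LEARNING_RATE)
before = toy.weight.detach().clone()
toy(th.randn(8, 4)).sum().backward()
opt.step()
print((toy.weight - before).abs().max())  # > 0 means the weights did move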

Trainer:

model = NLPClassifier()
trainer = pl.Trainer(
   gpus=(1 if th.cuda.is_available() else 0),
   default_root_dir = f"./models/version_{VERSION}",
   max_epochs=EPOCHS,
   fast_dev_run=DEBUG,
   limit_train_batches=1.0,
   val_check_interval=0.5,
   limit_val_batches=1.0,
   profiler=True,
   #logger=wandb_logger
   )

trainer.fit(model)

Here is a sample loss curve:

[Image: sample loss curve]

My main questions are:

  1. Am I using CrossEntropyLoss correctly? (See my shape check right after this list.)
  2. Is the optimizer working at all? The predictions quickly become identical for every sample.
  3. Changing the learning rate did not fix it; I tried values ranging from 1e-2 to 1e-6.
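
For question 1, this is my understanding of the shapes CrossEntropyLoss expects (a self-contained check on random tensors):

loss_fn = th.nn.CrossEntropyLoss(reduction="none")
logits = th.randn(16, 10)            # (batch_size, num_classes), raw unnormalized scores
targets = th.randint(0, 10, (16,))   # class indices in [0, 10), dtype long, not one-hot
print(loss_fn(logits, targets).shape)  # torch.Size([16]), one loss per sample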

Thanks for your help. :)
