BERT model does not learn a new task

Asked: 2019-06-26 09:53:34

Tags: python python-3.x tensorflow bert-language-model

I am trying to fine-tune a pretrained BERT model on an Amazon review dataset. To do this, I extended the run_classifier file with the following processor:

class AmazonProcessor(DataProcessor):
  """Processor for the Amazon data set."""

  def get_train_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1", "2"]

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      # header
      if i == 0:
        continue
      guid = "%s-%s" % (set_type, i)
      text_a = tokenization.convert_to_unicode(line[13])
      label = tokenization.convert_to_unicode(line[7])
      # only train on 3 labels instead of 5
      if int(label) <= 2:
        label = "0"
      elif int(label) == 3:
        label = "1"
      else:
        label = "2"
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    # return outside the loop so that every row is kept
    return examples
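
As a quick sanity check (a minimal sketch; it assumes the AmazonProcessor class above is in scope and reuses the data directory from the notebook below), the processor can be run on its own to confirm it yields one example per data row:

processor = AmazonProcessor()
train_examples = processor.get_train_examples("drive/My Drive/csv_dataset")
# should equal the number of rows in train.tsv minus the header
print(len(train_examples))
# spot-check the first converted example
print(train_examples[0].text_a[:80], train_examples[0].label)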

I am training on a GPU in a Colab notebook, so I also adapted the main method to my needs:

processors = {
  "cola": run_classifier.ColaProcessor,
  "mnli": run_classifier.MnliProcessor,
  "mrpc": run_classifier.MrpcProcessor,
  "xnli": run_classifier.XnliProcessor,
  "amazon": run_classifier.AmazonProcessor,
}

bert_config_file = os.path.join(BERT_FOLDER, "bert_config.json")
max_seq_length = 128
output_dir = "drive/My Drive/model"
task_name = "amazon"
vocab_file = os.path.join(BERT_FOLDER, "vocab.txt")
do_lower_case = False
master = None
tpu_cluster_resolver = None
save_checkpoints_steps = 1000
iterations_per_loop = 1000
use_tpu = False
data_dir  = "drive/My Drive/csv_dataset"
learning_rate = 5e-5
warmup_proportion = 0.1
train_batch_size = 16
eval_batch_size = 1
predict_batch_size = 1
num_train_epochs = 10.0
num_train_steps = 10000  # note: overwritten below before training
num_tpu_cores = 8
#init_checkpoint = os.path.join(BERT_FOLDER, "bert_model.ckpt")
init_checkpoint = "drive/My Drive/model2/model.ckpt-41000"

do_train = True
do_eval = True

tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)


bert_config = modeling.BertConfig.from_json_file(bert_config_file)
print(bert_config)

task_name = task_name.lower()

processor = processors[task_name]()

label_list = processor.get_labels()

tokenizer = tokenization.FullTokenizer(
  vocab_file=vocab_file, do_lower_case=do_lower_case)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
  cluster=tpu_cluster_resolver,
  master=master,
  model_dir=output_dir,
  save_checkpoints_steps=save_checkpoints_steps,
  tpu_config=tf.contrib.tpu.TPUConfig(
      iterations_per_loop=iterations_per_loop,
      num_shards=num_tpu_cores,
      per_host_input_for_training=is_per_host))

train_examples = None
num_train_steps = None
num_warmup_steps = None
if do_train:
  train_examples = processor.get_train_examples(data_dir)
  num_train_steps = int(
      len(train_examples) / train_batch_size * num_train_epochs)
  num_warmup_steps = int(num_train_steps * warmup_proportion)

model_fn = run_classifier.model_fn_builder(
  bert_config=bert_config,
  num_labels=len(label_list),
  init_checkpoint=init_checkpoint,
  learning_rate=learning_rate,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=use_tpu,
  use_one_hot_embeddings=use_tpu)

estimator = tf.contrib.tpu.TPUEstimator(
  use_tpu=use_tpu,
  model_fn=model_fn,
  config=run_config,
  train_batch_size=train_batch_size,
  eval_batch_size=eval_batch_size,
  predict_batch_size=predict_batch_size)

if do_train:
  train_file = os.path.join(output_dir, "train.tf_record")
  run_classifier.file_based_convert_examples_to_features(
      train_examples, label_list, max_seq_length, tokenizer, train_file)
  tf.logging.info("***** Running training *****")
  tf.logging.info("  Num examples = %d", len(train_examples))
  tf.logging.info("  Batch size = %d", train_batch_size)
  tf.logging.info("  Num steps = %d", num_train_steps)
  train_input_fn = run_classifier.file_based_input_fn_builder(
      input_file=train_file,
      seq_length=max_seq_length,
      is_training=True,
      drop_remainder=True)
  estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

if do_eval:
  eval_examples = processor.get_test_examples(data_dir)
  num_actual_eval_examples = len(eval_examples)
  if use_tpu:
    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on. These do NOT count towards the metric (all tf.metrics
    # support a per-instance weight, and these get a weight of 0.0).
    while len(eval_examples) % eval_batch_size != 0:
      eval_examples.append(run_classifier.PaddingInputExample())

  eval_file = os.path.join(output_dir, "eval.tf_record")
  run_classifier.file_based_convert_examples_to_features(
      eval_examples, label_list, max_seq_length, tokenizer, eval_file)

  tf.logging.info("***** Running evaluation *****")
  tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                  len(eval_examples), num_actual_eval_examples,
                  len(eval_examples) - num_actual_eval_examples)
  tf.logging.info("  Batch size = %d", eval_batch_size)

  # This tells the estimator to run through the entire set.
  eval_steps = None
  # However, if running eval on the TPU, you will need to specify the
  # number of steps.
  if use_tpu:
    assert len(eval_examples) % eval_batch_size == 0
    eval_steps = int(len(eval_examples) // eval_batch_size)

  eval_drop_remainder = True if use_tpu else False
  eval_input_fn = run_classifier.file_based_input_fn_builder(
      input_file=eval_file,
      seq_length=max_seq_length,
      is_training=False,
      drop_remainder=eval_drop_remainder)

  result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

  output_eval_file = os.path.join(output_dir, "eval_results.txt")
  with tf.gfile.GFile(output_eval_file, "w") as writer:
    tf.logging.info("***** Eval results *****")
    for key in sorted(result.keys()):
      tf.logging.info("  %s = %s", key, str(result[key]))
      writer.write("%s = %s\n" % (key, str(result[key])))

I know this is a lot of code, but since I cannot pinpoint the error myself, I want to show all of it.

Note that most of the logging output looks perfectly reasonable:

For example, here is a converted example:

INFO:tensorflow:tokens: [CLS] Ich habe schon viele Klavier ##kon ##zer ##te gehört , aber was Frau Martha Ar ##geri ##ch hier spielt lässt einem ge ##wis ##ser ##ma ##ßen den At ##em stock ##en . So geni ##al habe ich diese 2 Klavier ##kon ##zer ##te von Ra ##ch ##mani ##no ##ff und T ##sch ##aik ##ov ##sky noch nie gehört . Sie ent ##fes ##selt einen regel ##rechte ##n Feuer ##stu ##rm an Vir ##tu ##osi ##tät . [SEP]
INFO:tensorflow:input_ids: 101 21023 21404 16363 18602 48021 17423 14210 10216 16706 117 11566 10134 16783 26904 18484 68462 10269 13329 28508 25758 10745 46503 83648 12754 10369 20284 10140 11699 10451 20511 10136 119 12882 107282 10415 21404 12979 12750 123 48021 17423 14210 10216 10166 38571 10269 31124 10343 13820 10130 157 12044 106333 11024 16116 11230 11058 16706 119 11583 61047 58058 26063 10897 46578 55663 10115 68686 19987 19341 10151 106433 10991 20316 24308 119 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: 2 (id = 2)

Or the model being loaded from the checkpoint file:

INFO:tensorflow:  name = output_weights:0, shape = (3, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = output_bias:0, shape = (3,), *INIT_FROM_CKPT*
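
To double-check what the restored checkpoint actually contains, the stored variables can be listed directly (a small sketch using the standard TF 1.x tf.train.list_variables utility and the init_checkpoint path from above):

import tensorflow as tf

# print every variable stored in the checkpoint together with its shape
for name, shape in tf.train.list_variables("drive/My Drive/model2/model.ckpt-41000"):
  print(name, shape)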

But in the end, eval_accuracy always stays exactly the same:

I0625 15:46:41.328946   eval_accuracy = 0.3338616
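
Since 0.3338 is roughly 1/3, i.e. chance level for three classes, one thing worth checking (a sketch, reusing the processor and data_dir defined above) is whether the test labels are balanced and the model has simply collapsed to predicting a single class:

from collections import Counter

# frequency of each of the three labels in the test set
labels = [ex.label for ex in processor.get_test_examples(data_dir)]
print(Counter(labels))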

The full repository can be found here: https://github.com/joroGER/bert/

And here is the gist of the notebook: https://colab.research.google.com/gist/joroGER/75c1c9c6383f0199bb54ce7b63d412d0/untitled4.ipynb

0 Answers:

No answers yet