I am trying to fine-tune BERT for multi-label classification. I have my own data processor, and I am using a pre-trained BERT on top of which I add a fine-tuning layer for my task. I have a create_model function that adds that fine-tuning layer on top of the existing BERT:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # In the demo, we are doing a simple classification task on the entire
  # segment.
  #
  # If you want to use the token-level output, use model.get_sequence_output()
  # instead.
  output_layer = model.get_pooled_output()
  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    # Multi-label classification: labels are multi-hot, so apply a sigmoid
    # to turn each logit into an independent probability.
    probabilities = tf.nn.sigmoid(logits)
    labels = tf.cast(labels, tf.float32)

    # For single-label classification you would use softmax instead:
    # probabilities = tf.nn.softmax(logits, axis=-1)
    # log_probs = tf.nn.log_softmax(logits, axis=-1)
    # one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    # per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)  # cross-entropy

    tf.logging.debug("num_labels = %s; logits = %s; labels = %s" %
                     (num_labels, logits, labels))

    per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                               logits=logits)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, logits, probabilities)
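For context, here is a minimal standalone sketch of the same multi-label head with made-up shapes (batch of 2, hidden size 4, 3 labels; the BERT pooled output is faked with a random tensor), just to show what this function computes:

import numpy as np
import tensorflow as tf  # TF 1.x

pooled = tf.constant(np.random.randn(2, 4), dtype=tf.float32)  # stand-in for model.get_pooled_output()
output_weights = tf.constant(np.random.randn(3, 4), dtype=tf.float32)  # [num_labels, hidden_size]
output_bias = tf.zeros([3])
labels = tf.constant([[1., 0., 1.], [0., 1., 0.]])  # multi-hot targets

logits = tf.nn.bias_add(tf.matmul(pooled, output_weights, transpose_b=True),
                        output_bias)
probabilities = tf.nn.sigmoid(logits)  # one independent probability per label
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                           logits=logits)
loss = tf.reduce_mean(per_example_loss)

with tf.Session() as sess:
    # logits may be negative; probabilities never are
    print(sess.run([logits, probabilities, loss]))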
I use it in model_fn_builder, where the returned logits are consumed by the estimator:
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
  """Returns `model_fn` closure for TPUEstimator."""

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    is_real_example = None
    if "is_real_example" in features:
      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, per_example_loss, logits, probabilities) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids,
        label_ids, num_labels, use_one_hot_embeddings)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:
        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()
        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps,
          use_tpu)
      logging_hook = tf.train.LoggingTensorHook(
          {"loss": total_loss, "precision": t_precision, "recall": t_recall},
          every_n_iter=10)
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          training_hooks=[logging_hook],
          scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        # logits: (?, num_classes); label_ids: (?, num_classes)
        print(logits)
        logits_split = tf.split(logits, args.num_classes,
                                axis=-1)  # a list of length num_classes
        label_ids_split = tf.split(logits, args.num_classes,
                                   axis=-1)  # a list of length num_classes
        accuracy = tf.constant(0.0, dtype=tf.float64)
        for j, logits in enumerate(logits_split):
          label_id_ = tf.cast(label_ids_split[j], dtype=tf.int32)
          current_auc, update_op_auc = tf.metrics.auc(label_id_, logits)
          prec, prec_op = precision(label_id_, logits)
          rec, rec_op = recall(label_id_, logits)
          f_1 = f1(label_id_, logits)
        eval_loss = tf.metrics.mean(values=per_example_loss)
        return {
            "eval_precision": (prec, prec_op),
            "eval_recall": (rec_op, rec_op),
            "eval_auc": (current_auc, update_op_auc),
            "eval_loss": eval_loss,
        }

      eval_metrics = (metric_fn,
                      [per_example_loss, label_ids, logits, is_real_example])
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          predictions={"probabilities": probabilities},
          scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn
In my model_fn, when the estimator is in eval mode, the logits are used to compute the various metrics defined in metric_fn (inside model_fn_builder). I get the following error in the traceback:
ERROR:tensorflow:Error recorded from evaluation_loop: 2 root error(s) found.
(0) Invalid argument: assertion failed: [`predictions` contains negative values] [Condition x >= 0 did not hold element-wise:] [x (Reshape:0) = ] [0 -1 -2...]
[[node confusion_matrix/assert_non_negative_1/assert_less_equal/Assert/AssertGuard/Assert (defined at /home/aditya_vartak/virtualenvs/anaconda3/envs/tf1/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]
[[confusion_matrix_2/ones_like/_1429]]
(1) Invalid argument: assertion failed: [`predictions` contains negative values] [Condition x >= 0 did not hold element-wise:] [x (Reshape:0) = ] [0 -1 -2...]
[[node confusion_matrix/assert_non_negative_1/assert_less_equal/Assert/AssertGuard/Assert (defined at /home/aditya_vartak/virtualenvs/anaconda3/envs/tf1/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]
0 successful operations.
0 derived errors ignored.
I understand that the error is caused by negative values in the logits. My question is: why do negative values end up there, and what is the fix?
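The assertion itself is easy to reproduce in isolation. Judging from the confusion_matrix/assert_non_negative node in the traceback, the failing op is the non-negativity check that tf.confusion_matrix applies to its inputs; a minimal sketch, assuming TF 1.x (the values are made up to mirror the [0 -1 -2...] in the message):

import tensorflow as tf  # TF 1.x

labels = tf.constant([0, 1, 2])
predictions = tf.constant([0, -1, -2])  # negative values where class ids are expected
cm = tf.confusion_matrix(labels, predictions)

with tf.Session() as sess:
    # Raises InvalidArgumentError:
    # assertion failed: [`predictions` contains negative values]
    sess.run(cm)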
EDIT 1: In case the question is unclear, let me add: I apply a sigmoid activation to the weighted sum of the last layer of the pre-trained BERT (weights of shape [hidden_dimension, num_classes]), store the result in probabilities, and then apply sigmoid_cross_entropy_with_logits (as shown in create_model()). According to the documentation, sigmoid returns a value between 0 and 1 for every input. So how can the probabilities contain negative values? I feel the problem is in metric_fn(), but I don't know what exactly.
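For reference, a quick sanity check of that reasoning, again assuming TF 1.x: sigmoid output stays strictly inside (0, 1) even for negative logits, whereas the raw logits tensor is unbounded. Note that in eval_metrics above it is logits, not probabilities, that gets handed to metric_fn.

import tensorflow as tf  # TF 1.x

logits = tf.constant([[-3.2, 0.0, 4.1]])  # unbounded; can be negative
probabilities = tf.nn.sigmoid(logits)     # strictly in (0, 1)

with tf.Session() as sess:
    print(sess.run(probabilities))  # approx. [[0.0392 0.5 0.9837]]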