I got the result below. BERT cannot predict the label for this ## subword token; the X should be DRUG.
I am using the pytorch_pretrained_bert library. I adapted the code from here:
https://github.com/Louis-udm/NER-BERT-CRF
Word in BERT layer | Initial word : Predicted NER-tag
-------------------------------------------------------------
holy | holy : O
shit | shit : O
that | that : O
##one | trazodone : X
actually | actually : O
knocked | knocked : B-ADR
me | me : I-ADR
the | the : I-ADR
fuck | fuck : I-ADR
out | out : I-ADR
and | and : O
took | took : O
me | me : O
for | for : O
a | a : O
ride | ride : O
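
For reference, the row tagged X corresponds to a WordPiece subtoken rather than a full word. Below is a minimal sketch, assuming the usual pytorch_pretrained_bert BertTokenizer with bert-base-uncased (the exact subword pieces depend on the vocabulary, and the model actually used is not shown here), of how the drug name gets split:

from pytorch_pretrained_bert import BertTokenizer

# Illustrative only: the vocabulary and casing here are assumptions, not taken from the post.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
print(tokenizer.tokenize("trazodone"))  # a list of WordPiece subtokens, e.g. ending in '##one'

Because convert_single_example below keeps only one piece per original word through orig_to_tok_map (the last piece), the position for "trazodone" is represented by a subtoken such as "##one", which is what appears in the output above.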
Answer 0 (score: 0)
The code is as follows:
import numpy as np
import torch
from tqdm import tqdm_notebook

# InputExample, tokenizer, model, tag2int, int2tag, sentence_ini and
# max_seq_length are defined elsewhere in the notebook (following the
# NER-BERT-CRF repository linked above).

class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.
    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.
    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors."""

def convert_text_to_examples(texts, labels):
    """Create InputExamples."""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, words=text, labels=label)
        )
    return InputExamples
def convert_examples_to_features(tokenizer, examples, max_seq_length=66):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in tqdm_notebook(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels),
    )
def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label_ids = [0] * max_seq_length
        return input_ids, input_mask, segment_ids, label_ids

    # pdb.set_trace()
    tokens_a = example.words
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    # Token map will be an int -> int mapping between the `orig_tokens` index and
    # the `bert_tokens` index, e.g.
    # bert_tokens == ["[CLS]", "john", "johan", "##son", "'", "s", "house", "[SEP]"]
    # orig_to_tok_map == [1, 2, 4, 6]
    # i.e. each original word is mapped to the index of its *last* WordPiece.
    orig_to_tok_map = []
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    orig_to_tok_map.append(len(tokens) - 1)
    # print(len(tokens_a))
    for token in tokens_a:
        tokens.extend(tokenizer.tokenize(token))
        orig_to_tok_map.append(len(tokens) - 1)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    orig_to_tok_map.append(len(tokens) - 1)

    # Only the mapped positions (one WordPiece per original word, plus the
    # special tokens) are fed to the model.
    input_ids = tokenizer.convert_tokens_to_ids([tokens[i] for i in orig_to_tok_map])
    # print(len(orig_to_tok_map), len(tokens), len(input_ids), len(segment_ids))  # for debugging

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # One label per original word, bracketed by dummy labels for [CLS] and [SEP].
    label_ids = []
    labels = example.labels
    label_ids.append(0)
    label_ids.extend([tag2int[label] for label in labels])
    label_ids.append(0)
    # print(len(label_ids))  # for debugging

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, label_ids
test_example = convert_text_to_examples([sentence_ini], [['O'] * len(sentence_ini)])
(input_ids, input_masks, segment_ids, _) = convert_examples_to_features(
    tokenizer, test_example, max_seq_length
)
input_ids = input_ids[0]
input_masks = input_masks[0]
segment_ids = segment_ids[0]

input_ids = torch.tensor([input_ids])
input_masks = torch.tensor([input_masks])
segment_ids = torch.tensor([segment_ids])

model.eval()
with torch.no_grad():
    # Predict hidden states features for each layer
    predictions = model(input_ids, segment_ids, input_masks)

_, predicted = torch.max(predictions[0], -1)

print("\n{:20}| {:15}: {:15}".format("Word in BERT layer", "Initial word", "Predicted NER-tag"))
print(61 * '-')
k = 0
for i, pred in enumerate(predicted):
    # `tokens` and `orig_to_tok_map` are not returned by convert_single_example,
    # so they must already exist at notebook level for the current sentence.
    # The bare except silently skips positions where the indexing fails
    # (e.g. the padding positions past the end of the sentence).
    try:
        if pred.item() != 1:
            print("{:20}| {:15}: {:15}".format(
                [tokens[i] for i in orig_to_tok_map][i],
                sentence_ini[i - 1],
                int2tag[pred.item()],
            ))
            k += 1
    except:
        pass
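
As a quick sanity check (hypothetical, since tag2int / int2tag come from the training setup that is not shown here), printing the tag mapping shows which integer the filter pred.item() != 1 skips and which indices the X and DRUG tags map to:

# Assumes int2tag is a plain dict {tag index: tag string}, as the print loop above implies.
for idx in sorted(int2tag):
    print(idx, int2tag[idx])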