我正在尝试训练 spaCy 的 en_core_web_lg 模型。
我从官方文档中获得了用于训练新模型的代码,但我想在 en_core_web_lg 模型的基础上继续训练。
代码如下:
from __future__ import unicode_literals, print_function
import sys
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
# training data Start
# Each example is (text, {"entities": [(start_char, end_char, label), ...]}).
# Both entities of the sentence are annotated in a single example: listing the
# sentence twice with one span each would tell the model, in each copy, that
# the other span is NOT an entity, giving it contradictory supervision.
TRAIN_DATA = [
    (
        "The model of machine is PC-234w and its serial number is 322424-AGX.",
        {"entities": [(24, 31, "PRODUCT"), (57, 67, "PRODUCT")]},
    ),
]
# Train data End
def main(model="en_core_web_lg", output_dir=None, n_iter=100):
    """Update the entity recognizer on TRAIN_DATA and optionally save it.

    Args:
        model: Name or path of an existing spaCy model to update. Pass
            ``None`` to start from a blank English pipeline instead.
        output_dir: Directory the trained pipeline is written to. Nothing is
            saved (and the reload check is skipped) when it is ``None``.
        n_iter: Number of passes over TRAIN_DATA. Coerced with ``int`` so a
            value supplied as a string on the command line also works.
    """
    n_iter = int(n_iter)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # Reset and initialize the weights randomly, but only when training
        # a new model: calling begin_training() on a loaded model would
        # throw away the pretrained weights we want to build on.
        if model is None:
            nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model: load it back from output_dir. Loading the
        # stock "en_core_web_lg" package here would ignore the training above.
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])


if __name__ == "__main__":
    plac.call(main)
这是输出
Entities [('PC-234w', 'PRODUCT'), ('322424-AGX', 'PRODUCT')]
Tokens [('The', '', 2), ('model', '', 2), ('of', '', 2), ('machine', '', 2), ('is', '', 2), ('PC-234w', 'PRODUCT', 3), ('and', '', 2), ('its', '', 2), ('serial', '', 2), ('number', '', 2), ('is', '', 2), ('322424-AGX', 'PRODUCT', 3), ('.', '', 2)]
但是当我在另一个脚本中加载同一个模型时,得到的NER结果却不一样。
代码:
import sys

import spacy

# Pass the directory the training script saved the pipeline to (its
# output_dir) as the first command-line argument to use the fine-tuned model.
# Loading the "en_core_web_lg" package name always returns the stock,
# unmodified model, which is kept as the fallback when no argument is given.
model_path = sys.argv[1] if len(sys.argv) > 1 else "en_core_web_lg"
nlp = spacy.load(model_path)
doc = nlp(u"The model of machine is PC-234w and its serial number is 322424-AGX.")
for ent in doc.ents:
    print(ent.text, ent.label_)
输出:
PC-234w ORG
我肯定做错了什么,但我无法弄清楚是什么。