I am using spaCy's named entity recognition model. The "Updating the Named Entity Recognizer" documentation gave me this code for updating an existing model; it only asks me to specify the model to use as the base, the location where the updated model will be stored, and the number of iterations.
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
# training data
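# each example is (raw_text, {"entities": [(start_char, end_char, label), ...]});
# e.g. (0, 30, "ORG") below spans "Flying Saucer Draught Emporium" in the last text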
TRAIN_DATA = [
("', 137 S Wilmington Street Raleigh, NC 27601 919.239.4070\t, • Server: Brian 20/1 Guests: 8\t10/01/2018 1:11 PM 20014, L Chicken Arti Pizza\t10.99, Subtotal Tax\t10.99 0.91, Total\t11.90, Balance Due\t11.90, Gratuity Suggestions To Help:, 20% = 2.20 18% = 1.2L,, 115% = 1.65 |f ,9., '?", {"entities": [(3, 19, "ORG")]}),
("Carolina Ale House, G1enwood, 0516 Table 23 #Party 1 JORDYN M SvrCk: 27 7:42p 09/30/18, Separate checks: 3-of-7\t, 2 Carolina Hurrlca\t15.50, 1 Smoked Cheddar Burger\t9.79, Sub Total:\t25.29, Tax:\t2.08, Sub Total:\t27.37, 20X GRATUIT\t5.06, 09/30 10:36pTO TAI : 32\t, D i d you enjoy Every delicious Bite’? Come back to See us and bring your friends*, You are always Welcome at our, House>", {"entities": [(8, 18, "ORG")]}),
(", P~ l-LMl NG *, PRIME STEAKHOUSE 8, WINE BAR, Kalelyh, nr 27612 919-571-6200, Sgj*1® IABIE 51\t6, UlER1 H SvrCk: 5 8:04p 10/02/18, 1\tBlueheny Lemon Drop, ^ Corona, 2\tCraft Beer 2 2 120 Tomahawk 1 Pork Chop 1 Scottish Salmon 4 Prime Dessert, 13.00, 35.00 14.50, 240.00, 40.00, 44.00 0.00, Sub Total: 386.50 „\tTax:\t31.89, 10/02 9:59pTOTAL :\t418.39, www.F1emingsSteakhouse.com ) rials'., Dine Rewards account not attached, Not a Dine Rewards member?, Join now at DINE-REWARDS.COM, ", {"entities": [(17, 35, "ORG")]}),
("Flying Saucer Draught Emporium, 328 Morgan Raleigh, NC, Server: Hope 10/30/7 Guests: 0, 10/04/2018 8:26 PM 20068, L10- Cocktail, 8.00, L10- Classic Daiquiri 1/2 Nacho Libre-r L10- Liqueur, L10- Baily’s Irish Cream L10- Rocks, Subtotal, Tax, Total, 5.50, 8.00, 21.50, 0.45, 21.95, Balance Due\t21., T»p: 3,zT., If you pay with debit card, your bank may hold additional funds temporarily. This is not a charge from Flying Saucer, www. beerknurd .com Taxi Taxi - 919.333.3333", {"entities": [(0, 30, "ORG")]}),
]
@plac.annotations(
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int),
)
def main(model='en_core_web_sm', output_dir=None, n_iter=100):
"""Load the model, set up the pipeline and train the entity recognizer."""
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if "ner" not in nlp.pipe_names:
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
ner = nlp.get_pipe("ner")
# add labels
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
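    # note: the receipt examples above only use the "ORG" label, which the
    # pretrained en_core_web_sm model already includes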
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
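            # compounding(4.0, 32.0, 1.001) yields batch sizes that grow
            # gradually from 4 towards a cap of 32 (multiplied by 1.001 each step)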
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


if __name__ == "__main__":
    plac.call(main('en_core_web_sm', Path.cwd(), 100))
After running the code it shows me the error below, for which I could not find any reference, even though a new model is generated. And when I try the new model, it only recognizes the entities it was trained on (TRAIN_DATA); it should also still recognize the entities that the base model 'en_core_web_sm' already recognized on its own.
Traceback (most recent call last):
  File "train.py", line 105, in <module>
    plac.call(main('en_core_web_sm', Path.cwd(), 100))
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 324, in call
    parser = parser_from(obj)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 133, in parser_from
    parser.populate_from(obj)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 248, in populate_from
    self._set_func_argspec(func)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 240, in _set_func_argspec
    self.argspec = getargspec(obj)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 38, in getargspec
    str(callableobj))
TypeError: Could not determine the signature of None
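A note on the second part of the question, that the updated model only finds the TRAIN_DATA entities: this looks like the "catastrophic forgetting" effect that spaCy's training documentation warns about when a pretrained pipeline is fine-tuned on a small, narrow set of examples. The mitigation usually suggested there is to mix examples of the entity types you still want to keep into the training data, for instance by letting the untouched base model annotate some extra texts first. A rough sketch of that idea, with a made-up example text and no claim that it removes the problem entirely:

# rehearsal sketch: add examples labelled by the original model to TRAIN_DATA
base_nlp = spacy.load('en_core_web_sm')   # untouched base model
extra_texts = ["Apple is looking at buying a U.K. startup for $1 billion."]
for text in extra_texts:
    doc = base_nlp(text)
    ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
    TRAIN_DATA.append((text, {"entities": ents}))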
Answer 0 (score: 0)
I solved the problem.
if __name__ == "__main__":
    main('en_core_web_sm', Path.cwd(), 100)
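This works because plac.call expects a callable: in the original script, main('en_core_web_sm', Path.cwd(), 100) was executed first, so plac.call received its return value, None, and could not inspect a signature. If the -m / -o / -n command-line options declared in @plac.annotations are wanted instead, a sketch of the usual plac invocation is to pass the function object itself rather than its result:

if __name__ == "__main__":
    # pass the function object, not its result; plac builds the command line
    # from the @plac.annotations decorator and parses sys.argv
    plac.call(main)

Run that way, the script could be invoked as, for example, python train.py -m en_core_web_sm -o ./model -n 100.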