Question

我是 spaCy 和 Python 的新手，正在使用 Python 和 nltk 训练自己的 spaCy 模型。下面是我的代码、训练数据和测试数据。如果测试数据与训练用的文本数据相同，输出是正确的；但我无法识别两个以上的标签，而且每次运行代码，识别出的标签都不一样，也不正确。我已经查阅过 spaCy 官网，但没有找到解决办法。请帮帮我！

from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy


# new entity label

# training data
# Note: If you're using an existing model, make sure to mix in examples of
# other entity types that spaCy correctly recognized before. Otherwise, your
# model might learn the new type, but "forget" what it previously knew.
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
# Each example is (text, {'entities': [(start, end, label), ...]}), where
# text[start:end] is the annotated span (e.g. "Duck" or "Dog").
TRAIN_DATA = [
    (
        "Duck quacks, Dog barks",
        {'entities': [(0, 4, 'Bird'), (13, 16, 'Animal')]},
    ),
    (
        "Duck eats fish, Dog eats meat",
        {'entities': [(0, 4, 'Bird'), (16, 19, 'Animal')]},
    ),
    (
        "Duck eats fish, Dog eats meat",
        {'entities': [(0, 4, 'Bird'), (16, 19, 'Animal')]},
    ),
]


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, new_model_name='Animal', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity.

    Loads ``model`` (or creates a blank 'en' pipeline when it is None),
    registers the labels found in TRAIN_DATA with the 'ner' pipe, runs
    ``n_iter`` passes over TRAIN_DATA, prints the entities found in a fixed
    test sentence and, when ``output_dir`` is given, saves the pipeline there
    under the meta name ``new_model_name`` and reloads it for the same check.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    # add new entity label to entity recognizer
    # NOTE(review): this walks every annotation of every example, so
    # add_label() and the print below run once per entity occurrence and
    # repeat 'Bird' / 'Animal' for each training example.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
            print("Label '%s'" % ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): begin_training() is called whether or not an existing
        # model was loaded above; presumably this re-initializes the weights
        # of a loaded model -- confirm against the spaCy docs.
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            # shuffles the module-level TRAIN_DATA list in place
            random.shuffle(TRAIN_DATA)
            losses = {}  # filled in by nlp.update() during this pass
            # one update call per (text, annotations) pair, dropout 0.35
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    # ('Nippot' does not occur anywhere in TRAIN_DATA)
    test_text = 'Duck eats Nippot, Dog eats meat'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)


# When executed as a script, hand main() to plac.call (main's options are
# declared in its @plac.annotations decorator).
if __name__ == '__main__':
    plac.call(main)

Answer 1

改动在于脚本中创建优化器（optimizer）的那部分代码。另外，为了避免重复添加标签，可以先创建一个标签列表，然后通过 ner.add_label 逐个添加。

TRAIN_DATA = [
    ("Duck quacks, Dog barks", {
        'entities': [(0, 4, 'Bird'), (13, 16, 'Animal')]
    }),
    ("Duck eats fish, Dog eats meat", {
        'entities': [(0, 4, 'Bird'), (16, 19, 'Animal')]
    }),
    ("Duck eats fish, Dog eats meat", {
        'entities': [(0, 4, 'Bird'), (16, 19, 'Animal')]
    }),
]

# Every entity label used in TRAIN_DATA, listed once so that each label is
# registered with the recognizer a single time.
label_ = ['Bird', 'Animal']


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, new_model_name='Animal', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entities.

    Args:
        model: Name of an existing spaCy model to load; None creates a blank
            'en' pipeline.
        new_model_name: Value stored in ``nlp.meta['name']`` before saving.
        output_dir: Directory the trained pipeline is written to; None skips
            saving and the reload check.
        n_iter: Number of passes over TRAIN_DATA.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add the entity recognizer if the pipeline does not have one yet;
    # otherwise fetch the existing one so labels can be added to it.
    # nlp.create_pipe works for built-ins that are registered with spaCy.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    # Register each label exactly once. Walking TRAIN_DATA instead would call
    # add_label() once per annotated entity and repeat the same labels.
    for LABEL in label_:
        ner.add_label(LABEL)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        # 'begin_training' initializes the models, so it would zero out the
        # entity types an existing model already knows; only create an
        # optimizer in that case.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # The optimizer is created above rather than here.
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'Duck and Dog eats Nippot, Dog eats meat'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

输出：

 Entities in 'Duck and Dog eats Nippot, Dog eats meat'
    Bird Duck
    Animal Dog
    Animal Dog

无法使用python识别spacy中的两个或多个标签

1 个答案: