spaCy ValueError: [T001] Max length currently 10 for phrase matching

Asked: 2018-07-02 15:03:06

Tags: python-3.x annotations nlp spacy

Is it possible to change the maximum length of matched phrases to a larger number? The error above is raised even when max_length is set to a higher threshold. I could not find a practical solution in previous threads. There are more than 9,000 terms to tag.

import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

class EntityMatcher(object):
    name = 'entity_matcher'

    def __init__(self, nlp, terms, label):
        patterns = [nlp(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab, max_length=20000)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
            doc.ents = list(doc.ents) + [span]
        return doc


nlp = spacy.blank('en')
text = open('/Users/Desktop/9000drugs.txt').read()
drugs = text.splitlines()
print(drugs)

entity_matcher = EntityMatcher(nlp, drugs, 'DRUG')

nlp.add_pipe(entity_matcher)
text = open('/Users/Desktop/extract_text.txt', 'r').read() # open a document
doc = nlp(text)


ents = [( e.start_char, e.end_char, e.label_) for e in doc.ents]

a = '{}'.format(doc.text)
annotation = (a, {'entities': ents})

print(annotation)
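
For reference, a minimal sketch of how max_length appears to work in spaCy 2.0.x: it bounds the number of tokens in each phrase pattern (default 10), not the number of patterns, so it only needs to cover the longest term rather than the size of the term list. The term list below is a hypothetical placeholder for the real drug file, the sketch is untested against the setup above, and later spaCy releases changed or dropped this argument.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')

# placeholder terms; the real list would come from 9000drugs.txt
terms = ['paracetamol', 'acetylsalicylic acid', 'angiotensin converting enzyme inhibitor']

patterns = [nlp(term) for term in terms]

# max_length only needs to cover the longest pattern in tokens,
# not the total number of patterns (9,000+ terms is fine either way)
longest = max(len(pattern) for pattern in patterns)
matcher = PhraseMatcher(nlp.vocab, max_length=max(10, longest))
matcher.add('DRUG', None, *patterns)

doc = nlp('The patient was given acetylsalicylic acid.')
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)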

0 Answers:

No answers yet