Is it possible to increase the maximum length for matched phrases to a larger number? I still hit the limit even when the maximum length is set to a higher threshold, and I could not find a practical solution in previous threads. There are over 9,000 terms to tag.
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span


class EntityMatcher(object):
    name = 'entity_matcher'

    def __init__(self, nlp, terms, label):
        # build one Doc pattern per term and register them all under the given label
        patterns = [nlp(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab, max_length=20000)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        # add every phrase match to the document's entities
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
            doc.ents = list(doc.ents) + [span]
        return doc
nlp = spacy.blank('en')

text = open('/Users/Desktop/9000drugs.txt').read()
drugs = text.splitlines()
print(drugs)

entity_matcher = EntityMatcher(nlp, drugs, 'DRUG')
nlp.add_pipe(entity_matcher)

text = open('/Users/Desktop/extract_text.txt', 'r').read()  # open a document
doc = nlp(text)

ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
a = '{}'.format(doc.text)
annotation = (a, {'entities': ents})
print(annotation)
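
In case it helps to see what I have tried, here is a minimal sketch of the workaround I am experimenting with. It assumes spaCy v2.x (the matcher.add(label, None, *patterns) and nlp.add_pipe(component) signatures), that spacy.util.filter_spans is available (spaCy 2.1.4 or later), and that PhraseMatcher's max_length only caps the number of tokens per pattern phrase rather than the number of patterns; the component name drug_matcher and the nlp.max_length value are placeholders I chose.

import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans  # assumption: available in spaCy >= 2.1.4

nlp = spacy.blank('en')
nlp.max_length = 2_000_000  # assumption: raise the per-document character limit for long input files

with open('/Users/Desktop/9000drugs.txt') as f:
    drugs = f.read().splitlines()

# Tokenizer-only Docs are enough as PhraseMatcher patterns and are much faster
# to build for 9,000+ terms than running the full pipeline on every term.
patterns = [nlp.make_doc(term) for term in drugs]

matcher = PhraseMatcher(nlp.vocab)  # default max_length; it limits tokens per phrase, not the number of patterns
matcher.add('DRUG', None, *patterns)

def drug_matcher(doc):  # hypothetical component name
    spans = [Span(doc, start, end, label=match_id)
             for match_id, start, end in matcher(doc)]
    # filter_spans drops overlapping matches so doc.ents can be set without a conflict error
    doc.ents = filter_spans(list(doc.ents) + spans)
    return doc

nlp.add_pipe(drug_matcher)

with open('/Users/Desktop/extract_text.txt') as f:
    doc = nlp(f.read())

ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
print((doc.text, {'entities': ents}))

Building the patterns with nlp.make_doc() keeps the per-term cost to tokenization only, and filter_spans avoids the overlapping-entity error when assigning doc.ents, but the original question about the maximum phrase length still stands.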