我正试图训练一个自定义的NER模型。最初,我安装了最新的spacy版本,但在训练期间出现以下错误
ValueError:[E103]试图设置有冲突的文档:令牌只能是一个实体的一部分,因此请确保要设置的实体不重叠。
之后,我安装了spacy版本spacy==2.0.11
并尝试运行我的代码。当我要训练大约10行数据时,该模型可以正常工作,并且将其保存到我的输出目录中。但是,当有更多数据(5K行)作为原始训练数据时,我的jupyter内核死了,或者当我在spyder中运行时,控制台就退出了!
我知道已弃用的spacy
版本不会引发值错误,但是由于我无法训练模型而仍然没有用。
样本数据:
CarryBag 09038820815c.txt
Stopperneedle 0903882080f4.txt
Foilbags 09038820819.txt
我大约有700个这样的文件,带有要标记的数据,并且在每个文件中,多个实体都需要标记。 参考代码:
import spacy
# import en_core_web_sm
import re
import csv
from spacy.matcher import PhraseMatcher
import plac
from pathlib import Path
import random
# Function to convert a PhraseMatcher return value to string (character) indexes
def str_index_conversion(lbl, doc, matchitem):
    """Convert a PhraseMatcher match to character offsets.

    Parameters
    ----------
    lbl : str
        Entity label to attach to the span (e.g. 'PRODUCT').
    doc : spacy Doc (or any sequence of tokens exposing .idx and __len__)
        The document the match was found in.
    matchitem : tuple
        A (match_id, start_token, end_token) triple as yielded by PhraseMatcher.

    Returns
    -------
    tuple
        (start_char, end_char, lbl) suitable for spaCy v2 'entities' training data.

    Fix: the original used len(str(doc[0:start])) / len(str(span)), which
    omits the whitespace between tokens, so every offset was short by the
    number of separating spaces before it. Token.idx is the token's true
    character offset in the original text, so the offsets below are exact.
    """
    start_tok = matchitem[1]
    end_tok = matchitem[2]
    start_char = doc[start_tok].idx
    last = doc[end_tok - 1]
    end_char = last.idx + len(last)  # len(Token) == number of characters
    return (start_char, end_char, lbl)
# nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm')

# Reuse the model's NER component when it exists; otherwise create and
# register a fresh one on the pipeline.
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

# The custom label must be registered before training examples use it.
ner.add_label('PRODUCT')

DIR = 'D:/Docs/'
matcher = PhraseMatcher(nlp.vocab)
list_str_index = []
to_train_ents = []
# Build (text, {'entities': [...]}) training pairs from the product CSV.
# Each CSV row is (product name(s), filename of the document to annotate).
with open(r'D:\ner_dummy_pack.csv', newline='', encoding='utf-8') as myFile:
    reader = csv.reader(myFile)
    for row in reader:
        try:
            product = row[0].lower()
            filename = row[1]
            # 'with' guarantees the handle is closed; the original leaked
            # one open file descriptor per CSV row.
            with open(DIR + filename, "r", encoding='utf-8') as file:
                filecontents = file.read()

            # Clean the text ONCE. The original wrapped these re.sub calls
            # in "for s in filecontents:", re-running every regex once per
            # *character* of the file — quadratic work that stalls/kills the
            # kernel on a 5K-row corpus. One pass is equivalent and linear.
            filecontents = re.sub(r'\s+', ' ', filecontents)
            filecontents = re.sub(r'^https?:\/\/.*[\r\n]*', '', filecontents, flags=re.MULTILINE)
            filecontents = re.sub(r"http\S+", "", filecontents)
            filecontents = re.sub(r"[-\"#/@;:<>?{}*`• ?+=~|$.!‘?“”?,_]", " ", filecontents)
            filecontents = re.sub(r'\d+', '', filecontents)  # removing all numbers
            filecontents = re.sub(' +', ' ', filecontents)
            # filecontents = filecontents.encode().decode('unicode-escape')
            filecontents = filecontents.lower()

            # A row may carry several comma-separated product names; add a
            # matcher pattern for each of them.
            if "," in product:
                product_patterns = [i.strip() for i in product.split(',')]
                for elem in product_patterns:
                    matcher.add('PRODUCT', None, nlp(elem))
            else:
                matcher.add('PRODUCT', None, nlp(product))

            # Debug prints of whole documents removed: dumping every file's
            # full text to the console is itself enough to choke a Jupyter
            # front-end on thousands of rows.
            doc = nlp(filecontents)
            matches = matcher(doc)
            list_str_index = [str_index_conversion('PRODUCT', doc, x) for x in matches]
            to_train_ents.append((filecontents, dict(entities=list_str_index)))
            # NOTE(review): this 'break' exits after the FIRST successful row,
            # so at most one training example is collected — confirm whether
            # it was intended (the lost indentation makes its scope ambiguous).
            break
        except Exception as e:
            # Best-effort: report the row's error and move on.
            print(e)

to_train_entsfinal = to_train_ents
def main(model=None, output_dir=None, n_iter=100):
    """Train the NER pipe on to_train_entsfinal and write the model to disk."""
    # nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()
    # Freeze every component except 'ner' while updating.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            for text, annotations in to_train_entsfinal:
                nlp.update(
                    [text],
                    [annotations],
                    sgd=optimizer,
                    drop=0.50,
                    losses=losses,
                )
            print(losses)
    print("OUTTTTT")

    if output_dir is None:
        output_dir = "C:\\Users\\APRIL"
    noutput_dir = Path(output_dir)
    if not noutput_dir.exists():
        noutput_dir.mkdir()
    # nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    random.shuffle(to_train_entsfinal)


if __name__ == '__main__':
    main()
谁能帮我解决这个问题。即使我在10多个行的示例中删除了冲突的实体,例如:
Blister abc.txt
Blisterpack abc.txt
Blisters abc.txt
正在发生相同的问题,模型没有进行训练
建议的更改:
def main(model=None, output_dir=None, n_iter=100):
    """Suggested variant of main() with the answer's memory-guard helpers inlined."""
    top_memory_precentage_use = 75  # or what ever number you choose

    def handle_memory(ruler):
        # When the memory check triggers, persist the ruler's patterns and
        # restart training state.
        if psutil.virtual_memory().percent < top_memory_precentage_use:
            dump_ruler_nonascii(ruler)
            ruler = nlp.begin_training()  # or just init the nlp object again
        return ruler

    # This fitted for my use case
    def dump_ruler_nonascii(ruler):
        # NOTE(review): 'self' is undefined inside this plain function — as
        # written this raises NameError; confirm where data_path should come from.
        path = Path(os.path.join(self.data_path, 'config.jsonl'))
        pattern = ruler.patterns
        with open(path, "a", encoding="utf-8") as f:
            for line in pattern:
                f.write(json.dumps(line, ensure_ascii=False) + "\n")
        return ruler

    # nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            for item in to_train_entsfinal:
                nlp.update(
                    [item[0]],
                    [item[1]],
                    sgd=optimizer,
                    drop=0.50,
                    losses=losses,
                )
            print(losses)
    print("OUTTTTT")

    if output_dir is None:
        output_dir = "C:\\Users\\APRIL"
    noutput_dir = Path(output_dir)
    if not noutput_dir.exists():
        noutput_dir.mkdir()
    # nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    random.shuffle(to_train_entsfinal)


if __name__ == '__main__':
    main()
答案 0（得分：0）
很难告诉您为什么,但是我可以为您提供2个帮助功能来帮助您进行训练。您可以根据自己的使用情况进行调整。就我而言,它是在编写模式,并且每次迭代时都要检查内存使用情况。
#add the following imports
import psutil
import os
top_memory_precentage_use = 75  # or what ever number you choose


def handle_memory(ruler):
    """Guard against memory exhaustion during training.

    When system memory use climbs above the configured ceiling, dump the
    ruler's patterns to disk and re-initialise training state.

    Fix: the original compared with '<', which fired while memory was
    *below* the ceiling and never when it was actually running out —
    the opposite of the stated intent of the helper.
    """
    if psutil.virtual_memory().percent > top_memory_precentage_use:
        dump_ruler_nonascii(ruler)
        ruler = nlp.begin_training()  # or just init the nlp object again
    return ruler
# This fitted for my use case
def dump_ruler_nonascii(ruler):
    # Append the ruler's patterns to config.jsonl as UTF-8 JSON lines
    # (ensure_ascii=False keeps non-ASCII characters readable).
    # NOTE(review): 'self' is undefined inside this plain function — this
    # raises NameError as written; confirm where data_path should come from.
    path = Path(os.path.join(self.data_path, 'config.jsonl'))
    pattern = ruler.patterns
    with open(path, "a", encoding="utf-8") as f:
        for line in pattern:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")