I am trying to build a custom NER model with spaCy to train a new entity type. I have labeled my data in CSV format, with the entity in the first column and the file name in the second. Sample rows (a small sketch of how they are read follows):
Packets abc.txt
Sachets abc.txt
Bags xyz.txt
Pouch xyz.txt
cap def.txt
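In case it helps, this is roughly how those rows are read; it is just a stripped-down sketch of the loading step, using the same label file as the full code below:

import csv

with open(r'D:\type16.csv', newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        entity, filename = row[0], row[1]  # e.g. "Packets", "abc.txt"
        print(entity, filename)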
I have around 5K such rows to label, and 700+ files in total. The problem is that the model stops training after one iteration: going by the code, it reaches print(losses) and print("OUTTTTT") and then stops. The tuple of labeled entities is very large. It does not run the 10 training iterations that are specified. Can anyone help me with this? The code is attached below for reference.
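For reference, each item appended to to_train_ents is meant to be a (text, annotations) tuple in spaCy v2's training format; a tiny hand-written example, where the sentence and character offsets are invented purely to show the shape, would look like this:

# One illustrative training example; the real tuples are built from the matched files.
example = (
    "these packets and sachets ship today",
    {"entities": [(6, 13, "PRODUCT"), (18, 25, "PRODUCT")]},
)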
Reference code:
import spacy
import psutil
import re
import csv
from spacy.matcher import PhraseMatcher
import plac
from pathlib import Path
import random
import os
import json
import pdb
# Function to convert PhraseMatcher return value to string indexes
def str_index_conversion(lbl, doc, matchitem):
    o_one = len(str(doc[0:matchitem[1]]))
    subdoc = doc[matchitem[1]:matchitem[2]]
    o_two = o_one + len(str(subdoc))
    return (o_one, o_two, lbl)
# nlp = spacy.load('en')
nlp = spacy.load('en')

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

ner.add_label('PRODUCT')

DIR = 'D:/Data'
matcher = PhraseMatcher(nlp.vocab, attr='lemma', validate=True)

list_str_index = []
to_train_ents = []
with open(r'D:\type16.csv', newline='', encoding='utf-8') as myFile:
    reader = csv.reader(myFile)
    for row in reader:
        try:
            product = row[0].lower()
            # print('K---' + product)
            filename = row[1]
            file = open(DIR + filename, "r", encoding='utf-8')
            print(file)
            filedata = file.read()
            for s in filedata:
                filedata = re.sub(r'\s+', ' ', filedata)
                filedata = re.sub(r'^https?:\/\/.*[\r\n]*', '', filedata, flags=re.MULTILINE)
                filedata = re.sub(r"http\S+", "", filedata)
                filedata = re.sub(r"[-\"#/@;:<>?{}*`• ?+=~|$.!‘?“”?,_]", " ", filedata)
                filedata = re.sub(r'\d+', '', filedata)  # removing all numbers
                filedata = re.sub(' +', ' ', filedata)
                # filedata = filedata.encode().decode('unicode-escape')
                filedata = ''.join([line.lower() for line in filedata])
                if "," in product:
                    product_patterns = product.split(',')
                    product_patterns = [i.strip() for i in product_patterns]
                    for elem in product_patterns:
                        matcher.add('PRODUCT', None, nlp(elem))
                else:
                    matcher.add('PRODUCT', None, nlp(product))
                print(filedata)
                doc = nlp(filedata)
                matches = matcher(doc)
                # print(matches)
                list_str_index = [str_index_conversion('PRODUCT', doc, x) for x in matches]
                # build a (text, annotations) training example from the matcher hits
                to_train_ents.append((filedata, dict(entities=list_str_index)))
                break
        except Exception as e:
            print(e)
            pass
to_train_entsfinal = to_train_ents
# to_test_ents=to_train_ents[5000:5199]
def main(model=None, output_dir=None):
    # top_memory_precentage_use = 100 # or what ever number you choose
    # def handle_memory(ruler):
    #     if psutil.virtual_memory().percent < top_memory_precentage_use:
    #         dump_ruler_nonascii(ruler)
    #         ruler = nlp.begin_training() # or just init the nlp object again
    #     return ruler
    # def dump_ruler_nonascii(ruler):
    #     path = Path(os.path.join(self.data_path, 'config.jsonl'))
    #     pattern = ruler.patterns
    #     with open(path, "a", encoding="utf-8") as f:
    #         for line in pattern:
    #             f.write(json.dumps(line, ensure_ascii=False) + "\n")
    #     return ruler
    # pdb.set_trace()
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        # only train NER
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            for item in to_train_entsfinal:
                print(item)
                try:
                    print('FOUND TAGGED NER')
                    nlp.update([item[0]],
                               [item[1]],
                               sgd=optimizer,
                               drop=0.2,
                               losses=losses)
                except Exception as e:
                    print("Found exception")
                    pass
            print(losses)
            print("OUTTTTT")

    if output_dir is None:
        output_dir = "D:\\15april"
    noutput_dir = Path(output_dir)
    if not noutput_dir.exists():
        noutput_dir.mkdir()
    # nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    random.shuffle(to_train_entsfinal)


if __name__ == '__main__':
    main()