我正在尝试正确设置Spacy以使用我的Pyspark代码。我只是尝试使用Spacy的功能,例如为各种文本,引理等提取POS。问题是它失败了。
我正在Windows 10上使用python 3.7.0和pyspark V2.4.3运行。我遵循的技术是使用用户定义函数(UDF),类似于克里斯({3)}。这个想法是循环遍历文本正文并将其分成多个块,然后将每个块传递给UDF函数以运行Spacy。
def spacy_preprocessing_udf(text, lowercase, remove_stop):
print("inside spacy_preprocessing")
def preprocess(text):
lowercase = True
remove_stop = True
arry = ["testing"]
#print("inside preprocess")
#print(text)
global nlp
try:
#print("inside try")
stops = spacy.lang.en.stop_words.STOP_WORDS
if lowercase:
text = text.lower()
text = nlp(text)
#print("after doc")
#print("before loop try")
teststt = ""
listofword_pos = list()
for word in text:
lemma = word.lemma_.strip()
if lemma:
if not remove_stop or (remove_stop and lemma not in stops):
lemnlp = nlp(lemma)
lemmaPOS = "dum"
for leword in lemnlp:
lemmaPOS = leword.pos_.strip()
# tuplpair = (lemma, lemmaPOS)
testst = lemma + " : " + lemmaPOS
print(testst)
teststt = testst
listofword_pos.append(testst)
return listofword_pos
except:
#print("inside except")
nlp = spacy.load("en")
stops = spacy.lang.en.stop_words.STOP_WORDS
if lowercase:
text = text.lower()
text = nlp(text)
listofword_pos = list()
teststt = ""
#print("before loop")
for word in text:
lemma = word.lemma_.strip()
if lemma:
if not remove_stop or (remove_stop and lemma not in stops):
lemnlp = nlp(lemma)
lemmaPOS = "dum"
for leword in lemnlp:
lemmaPOS = leword.pos_.strip()
#tuplpair = (lemma, lemmaPOS)
testst = lemma + " : " + lemmaPOS
print(testst)
teststt = testst
listofword_pos.append(testst)
print(len(listofword_pos))
return listofword_pos
res_udf = F.udf(preprocess(text), ArrayType(StringType()))
return res_udf
loop:
spacy_result = spacy_preprocessing_udf(text_list[index], True, True)
spacylist = spacylist.append(spacy_result)
问题是代码成功进入Spacy方法内部,甚至启动Spacy并调用所有这些Spacy方法(POS,Lemma等)。调试时,我会看到正确的示例,例如“ run:verb” 。问题是它没有正确返回最终列表。我收到以下错误
TYpeError:无效的函数:不是函数或可调用的(未定义 call ):类'list'
起源于在UDF中返回列表的时间。任何帮助将不胜感激。
谢谢。