这是我的函数,它本应对句子列表进行词形还原(lemmatization),但输出的是包含所有单词的单个列表,而不是每个句子各自对应一个词形还原后的列表。
词形还原函数的代码:
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
# Tag every sentence of `corpus` with the module-level TreeTagger `tagger`
# and collect the lemma of each tagged word.
# NOTE(review): this snippet has lost its indentation, so it does not run as shown.
def lemmatize(corpus):
# NOTE(review): the space inside the next two names is a syntax error; later
# lines spell them lemmatize_list_of_sentences / lemmatize_list_of_sentences2.
lemmatize_list_of _sentences= []
lemmatize_list_of _sentences2 = []
for sentence in corpus:
# Raw tagger output for the sentence, then converted by make_tags.
tags = tagger.tag_text(sentence)
tags2 = treetaggerwrapper.make_tags(tags, allow_extra = True)
lemmatize_list_of_sentences.append(tags2)
print(lemmatize_list_of_sentences)
for subl in lemmatize_list_of_sentences: # loop in list of sublists
for word in subl:
# Only objects whose class is named "Tag" are handled here.
if word.__class__.__name__ == "Tag":
lemme=word[2] # I want also to check if lemme[2] is empty and add this
# Presumably a lemma can list several alternatives separated by "|" -- TODO confirm.
lemmeOption2=lemme.split("|")
# NOTE(review): `lemmeOption1` is never assigned; the line above binds `lemmeOption2`.
lemme=lemmeOption1[0]
# Every lemma is appended to the same flat list, so sentence boundaries are not kept.
lemmatize_list_of_sentences2.append(lemme)
return lemmatize_list_of_sentences2 # should return a list of lists where each list contains the lemme retrieve
lemmatize_train= lemmatize(sentences_train_remove_stop_words)
lemmatize_test= lemmatize(sentences_test_remove_stop_words)
print(lemmatize_train)
此外,我想在 lemmatize 函数中添加一段代码,检查索引 2(或 -1)处的词元是否为空;如果为空,则改用索引 0 处的原词。
我写出了下面这段代码,但不知道如何把它整合进我的 lemmatize 函数:
# Draft of the empty-lemma check: split a tab-separated line and keep field 2,
# or field 0 when field 2 is empty.
# NOTE(review): the split result is bound to `lemme`, but the appends and the
# print below read `parts`, which is not defined in this snippet.
# NOTE(review): the space inside `lemmatize_list_of _sentences2` is a syntax error.
for word in subl:
lemme= word.split('\t')
try:
if lemme[2] == '':
lemmatize_list_of _sentences2.append(parts[0])
else:
lemmatize_list_of _sentences2.append(parts[2])
# NOTE(review): a bare except catches every exception, not only a missing field.
except:
print(parts)
file_input中的句子列表
La période de rotation de la Lune est la même que sa période orbitale et elle présente donc toujours le même hémisphère.
Cette rotation synchrone résulte des frottements qu’ont entraînés les marées causées par la Terre.
在标记文本后,打印句子列表的列表,我有了这个:
第一句话:
[[Tag(word='la', pos='DET:ART', lemma='le'), Tag(word='période', pos='NOM', lemma='période'), Tag(word='rotation', pos='NOM', lemma='rotation'), Tag(word='lune', pos='NOM', lemma='lune'), Tag(word='période', pos='NOM', lemma='période'), Tag(word='orbitale', pos='ADJ', lemma='orbital'), Tag(word='présente', pos='VER:pres', lemma='présenter'), Tag(word='donc', pos='ADV', lemma='donc'), Tag(word='toujours', pos='ADV', lemma='toujours')]]
整个句子:
[[Tag(word='la', pos='DET:ART', lemma='le'), Tag(word='période', pos='NOM', lemma='période'), Tag(word='rotation', pos='NOM', lemma='rotation'), Tag(word='lune', pos='NOM', lemma='lune'), Tag(word='période', pos='NOM', lemma='période'), Tag(word='orbitale', pos='ADJ', lemma='orbital'), Tag(word='présente', pos='VER:pres', lemma='présenter'), Tag(word='donc', pos='ADV', lemma='donc'), Tag(word='toujours', pos='ADV', lemma='toujours')], [Tag(word='cette', pos='PRO:DEM', lemma='ce'), Tag(word='rotation', pos='NOM', lemma='rotation'), Tag(word='synchrone', pos='ADJ', lemma='synchrone'), Tag(word='résulte', pos='VER:pres', lemma='résulter'), Tag(word='frottements', pos='NOM', lemma='frottement'), Tag(word='entraînés', pos='VER:pper', lemma='entraîner'), Tag(word='les', pos='DET:ART', lemma='le'), Tag(word='marées', pos='NOM', lemma='marée'), Tag(word='causées', pos='VER:pper', lemma='causer')]]
提取词元(lemma)后,我得到的是一个扁平的单词列表,这不是我期望的结果。我希望每个句子各自对应一个列表。
输出:
['le', 'période', 'rotation', 'lune', 'période', 'orbital', 'présenter', 'donc', 'toujours', 'ce', 'rotation', 'synchrone', 'résulter', 'frottement', 'entraîner', 'le', 'marée', 'causer']
期望:将句子中的每个单词放在一个字符串中,单词之间用空格隔开。
['le période rotation lune période orbital présenter donc toujours','ce rotation synchrone résulter frottement entraîner le marée causer']
答案 0 :(得分:0)
因此,您希望有两个标签列表。
您当前返回的是一个简单(扁平)列表;必须确保返回的是一个由列表组成的列表。
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
def lemmatize(corpus):
    """Lemmatize each sentence of *corpus*, keeping one result list per sentence.

    Uses the module-level ``tagger`` (a ``treetaggerwrapper.TreeTagger``).

    Args:
        corpus: Iterable of sentences, each passed as-is to
            ``tagger.tag_text``.

    Returns:
        A list with one inner list per sentence, in input order. Each
        inner list holds the lemma strings of that sentence.
    """
    lemmas_per_sentence = []
    for sentence in corpus:
        tags = treetaggerwrapper.make_tags(
            tagger.tag_text(sentence), allow_extra=True
        )
        # A fresh inner list per sentence is what keeps sentences separate
        # in the result instead of flattening every lemma into one list.
        sentence_lemmas = []
        for word in tags:
            # Only plain Tag tuples are used; anything else returned by
            # make_tags is skipped.
            if not isinstance(word, treetaggerwrapper.Tag):
                continue
            # Keep the first "|"-separated alternative; when the lemma is
            # empty, fall back to the surface form of the word.
            sentence_lemmas.append(word.lemma.split("|")[0] or word.word)
        lemmas_per_sentence.append(sentence_lemmas)
    return lemmas_per_sentence
lemmatize_train= lemmatize(sentences_train_remove_stop_words)
lemmatize_test= lemmatize(sentences_test_remove_stop_words)
print(lemmatize_train)
检查词元是否为空
另外,根据文档(treetaggerwrapper 文档),`Tag` 是一个“命名元组”(namedtuple)。
您可以在this post中进一步了解“命名元组”。
但基本上,您可以像访问对象属性一样,使用“.”(点)表示法来引用 `Tag` 的各个字段。
因此,要检查词元(lemma)是否为空,可以执行以下操作:
# Prefer the lemma; when it is empty, fall back to the word itself.
if word.lemma != "":
    lemme = word.lemma
else:
    # Keep a plain string (not the list a split() would give) so the
    # collected lemmas can be combined with " ".join.
    lemme = word.word
拼接列表
此外,如果您想在最后把每个句子的词元列表重新拼接成字符串,请执行以下操作:
# Collapse each sentence's lemma list into one space-separated string.
joined_sentences = [" ".join(lemma_list) for lemma_list in lemmatize_train]
print(joined_sentences)
函数返回连接的字符串:
def lemmatize(corpus):
    """Lemmatize each sentence of *corpus* and return one string per sentence.

    Uses the module-level ``tagger`` (a ``treetaggerwrapper.TreeTagger``).

    Args:
        corpus: Iterable of sentences, each passed as-is to
            ``tagger.tag_text``.

    Returns:
        A list with one string per sentence, in input order. Each string
        is that sentence's lemmas separated by single spaces.
    """
    joined_sentences = []
    for sentence in corpus:
        tags = treetaggerwrapper.make_tags(
            tagger.tag_text(sentence), allow_extra=True
        )
        # Only plain Tag tuples are used. Keep the first "|"-separated
        # alternative of the lemma and fall back to the surface form
        # when the lemma is empty.
        sentence_lemmas = [
            word.lemma.split("|")[0] or word.word
            for word in tags
            if isinstance(word, treetaggerwrapper.Tag)
        ]
        joined_sentences.append(" ".join(sentence_lemmas))
    return joined_sentences
希望现在很清楚。