对分词后的数据使用 POS 标注(pos_tag)后,结果以 (单词, pos_tag) 元组的形式出现。将该结果传入词形还原(lemmatization)时,只有第一个值被词形还原。
具有两列的数据框-
ID Text
1 Lemmatization is an interesting part
分词(tokenize)并删除停用词后-
ID Tokenize_data
1 'Lemmatization', 'interesting', 'part'
# Lemmatization with POS tags
# Part-of-speech tagging: run nltk.pos_tag on each row of the tokenize_data
# column and store the result in a new 'tag_words' column.
df2['tag_words'] = df2.tokenize_data.apply(nltk.pos_tag)
#Treebank to Wordnet
from nltk.corpus import wordnet
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the corresponding WordNet POS.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ,
    wordnet.VERB, wordnet.NOUN and wordnet.ADV respectively; any other
    tag yields None.
    """
    # Attribute names are resolved lazily so `wordnet` is only touched when
    # a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None
from nltk.stem.wordnet import WordNetLemmatizer
# Module-level WordNetLemmatizer instance; the functions below call its
# lemmatize() method.
lemmatizer = WordNetLemmatizer()
def tagging(text):
    """Lemmatize every (word, tag) pair and return all lemmas.

    Args:
        text: iterable of (word, treebank_tag) tuples, e.g. one row of the
            'tag_words' column.

    Returns:
        A list with one lemma per input pair, in input order. An empty
        input yields an empty list.
    """
    lemmas = []
    for word, tag in text:
        wntag = get_wordnet_pos(tag)
        if wntag is None:
            # No WordNet equivalent for this tag: call lemmatize() without
            # a pos argument.
            lemmas.append(lemmatizer.lemmatize(word))
        else:
            lemmas.append(lemmatizer.lemmatize(word, pos=wntag))
    # Return after the loop has consumed every pair; returning a single
    # `lemma` is what reduced the output to one value.
    return lemmas
def tag1(x):
    """Forward one value of the 'tag_words' column to tagging()."""
    return tagging(x)


# Apply tag1 to each row of 'tag_words' and store the result in 'lemma_tag'.
df2['lemma_tag'] = df2.tag_words.apply(tag1)
输出为-
ID Lemma_words
1 'Lemmatize'
预期-
ID Lemma_words
1 'Lemmatize', 'interest', 'part'
答案 0 :(得分:1)
以下函数有效-
我的代码未在pos标记列表中保留所有元组的值,因此输出中只有一个值
def lemmatize_sentence(text):
    """Tokenize a sentence, POS-tag it and return the lemmatized tokens.

    Args:
        text: the raw sentence string passed to nltk.word_tokenize.

    Returns:
        A list with one entry per token: the token itself when its tag
        maps to None, otherwise lemmatizer.lemmatize(token, tag).
    """
    # NOTE(review): nltk_tag_to_wordnet_tag is not defined in the visible
    # part of this file; presumably it behaves like get_wordnet_pos -- confirm.
    tokens = nltk.word_tokenize(text)
    result = []
    for token, treebank_tag in nltk.pos_tag(tokens):
        wn_tag = nltk_tag_to_wordnet_tag(treebank_tag)
        # Tokens without a usable tag are kept unchanged.
        result.append(
            token if wn_tag is None else lemmatizer.lemmatize(token, wn_tag)
        )
    return result