我正在尝试从Pandas数据帧上的nltk包中实现词干提取和词条抽取。我编写了以下函数,但在某处未执行词干和词根化。请让我知道需要进行的更改。
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import LancasterStemmer
stemmer=LancasterStemmer()
lemmer=WordNetLemmatizer()
alpha_tokenizer=RegexpTokenizer('[A-Za-z]\w+')
def process_sentence(words):
words=words.lower()
tokens=alpha_tokenizer.tokenize(words)
for index,word in enumerate(tokens):
tokens[index]=stemmer.stem(word)
tokens[index]=lemmer.lemmatize(word,'v')
tokens[index]=lemmer.lemmatize(word,'n')
return tokens
print([process_sentence(item) for item in ['abaci', 'happy dogs']])
# [['abacus'], ['happy', 'dog']]