I am using NLTK on a dataset stored as a pandas dataframe. All of the raw text preprocessing works fine until I try to convert the Treebank POS tags to Wordnet POS tags. The following code works fine for me.
import pandas as pd
import string
from nltk import WordPunctTokenizer, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn, stopwords
# Example dataframe
df = pd.DataFrame([[2, "I am new at programming."],
[7, "Leaves are falling from the tree."],
[4, "Sophia has been studying since this morning."]], columns = ['ID', 'Text'])
# Tokenize text
tokenizer = WordPunctTokenizer()
df["Tokens"] = df["Text"].str.lower().apply(tokenizer.tokenize)
# Remove punctuations
pattern = string.punctuation
print(pattern)
def remove_punctuation(tokens):
    filtered = [word for word in tokens if word not in pattern]
    return filtered
df["Tokens"] = df["Tokens"].apply(remove_punctuation)
# Remove stopwords
stop_words = stopwords.words('english')
def remove_stopwords(tokens):
    filtered_words = [word for word in tokens if word not in stop_words]
    return filtered_words
df["Tokens"] = df["Tokens"].apply(remove_stopwords)
The following lines do not work, and I get this error:
ValueError: too many values to unpack (expected 2)
def wordnet_pos(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('V'):
        return wn.VERB
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    else:
        return None
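For clarity, the Wordnet constants are one-letter strings, and returning None for tags like determiners is intended, since they have no Wordnet equivalent:
print(wordnet_pos('VBG'))  # 'v' (wn.VERB)
print(wordnet_pos('NNS'))  # 'n' (wn.NOUN)
print(wordnet_pos('DT'))   # None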
def wordnet(tokens):
    pos_tokens = [pos_tag(token) for token in tokens]
    pos_tokens = [(word, wordnet_pos(tag)) for (word, tag) in pos_tokens]
    return pos_tokens
df["Wordnet"] = df["Tokens"].apply(wordnet)
This is what I am hoping to achieve: creating df["Wordnet"] with the Wordnet POS tags.
print(df["Wordnet"])
0 [(new, a), (programming, n)]
1 [(leaves, n), (falling, v), (tree, n)]
2 [(sophia, n), (studying, v), (since, n), (...
Name: Wordnet, dtype: object