Regarding this topic: How to convert token list into wordnet lemma list using nltk?
I want to display words with similar meanings in a cluster plot. After looking into a few approaches, KMeans seemed like a good starting point. I am using the tf-idf vectorizer to turn my PDF data into vectors, but I end up with this error:
ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.
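For context, min_df and max_df in TfidfVectorizer prune terms whose document frequency falls outside the given band, and the error above is raised when nothing survives that pruning. Here is a minimal sketch of how that can happen on a toy corpus (the documents below are made up for illustration, not from my PDF):

from sklearn.feature_extraction.text import TfidfVectorizer

# Every term appears in every document, so its document frequency (1.0)
# exceeds max_df=0.9 and all terms get pruned.
docs = ["apple banana", "apple banana", "apple banana"]

vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.1)
try:
    vectorizer.fit_transform(docs)
except ValueError as err:
    print(err)  # After pruning, no terms remain. Try a lower min_df or a higher max_df.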
Here is my updated source code (it is meant to work with any PDF data source):
import string
import re
import nltk
import PyPDF4
import plotly as py
import collections
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
from nltk.corpus import wordnet
stopwords = nltk.corpus.stopwords.words('english')
# additional stopwords to be removed manually.
file = open('Corpus.txt', 'r')
moreStopwords = file.read().splitlines()
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
data = PyPDF4.PdfFileReader(open('ReadyPlayerOne.pdf', 'rb'))
pageData = ''
for page in data.pages:
    pageData += page.extractText()
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokenize = re.split(r"\W+", text)
    text = [wn.lemmatize(word) for word in tokenize if word not in stopwords]
    final = [word for word in text if word not in moreStopwords]
    # Accessing wordnet synset corpora to find the meaning of the words.
    lemmas = []
    for token in final:
        lemmas += [synset.lemmas()[0].name() for synset in wordnet.synsets(token)]
    return list(set(lemmas))  # returns unique words
    # return list(lemmas)
    # return final
def cluster_texts(texts, cluster):
    # K-Means Clustering
    vectorizer = TfidfVectorizer(tokenizer=clean_text,
                                 max_df=0.9,
                                 min_df=0.1)
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=cluster)
    km_model.fit(tfidf_model)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering
filter_data = clean_text(pageData)
clusters = cluster_texts(filter_data, 3)
pprint(dict(clusters))
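If it helps to narrow this down: clean_text returns a de-duplicated flat list of lemmas, so cluster_texts ends up vectorizing each word as its own document. My guess is that this is what triggers the pruning, since each term then appears in only one of many documents. A stripped-down sketch of that situation (the words below are placeholders, not my actual lemmas):

from sklearn.feature_extraction.text import TfidfVectorizer

# Twelve one-word "documents": each term appears in exactly 1 of 12 documents,
# so its document frequency is 1/12 (about 0.083), which is below min_df=0.1,
# and every term is pruned, giving the same ValueError as above.
unique_words = ["book", "game", "player", "world", "quest", "avatar",
                "puzzle", "school", "music", "robot", "planet", "engine"]

vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.1)
vectorizer.fit_transform(unique_words)  # raises the same ValueError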