
时间:2018-11-26 22:55:19

标签: python scikit-learn k-means text-mining unsupervised-learning

关于此主题:How to convert token list into wordnet lemma list using nltk?


ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.


import string
import re
import nltk
import PyPDF4
import plotly as py
import collections
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
from nltk.corpus import wordnet

stopwords = nltk.corpus.stopwords.words('english')
# additional stopwords to be removed manually.
# Read inside a context manager so the file handle is closed deterministically.
with open('Corpus.txt', 'r') as file:
    moreStopwords = file.read().splitlines()
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

# Extract the text of every page while the PDF stream is still open; the
# reader pulls page content from the stream on demand, so all page access
# must happen inside this block.
with open('ReadyPlayerOne.pdf', 'rb') as pdf_stream:
    data = PyPDF4.PdfFileReader(pdf_stream)
    # Join once instead of growing a string with += per page.
    pageData = ''.join(page.extractText() for page in data.pages)

def clean_text(text):
    """Normalize raw text into a list of unique WordNet lemma names.

    Steps: strip ASCII punctuation and lowercase, split on non-word
    characters, drop NLTK English stopwords, lemmatize, drop the extra
    stopwords loaded from the corpus file, then replace every remaining
    token by the first lemma name of each of its WordNet synsets.

    Args:
        text: The raw input string.

    Returns:
        A sorted list of unique lemma names. Tokens that have no WordNet
        synset contribute nothing to the result.
    """
    # Remove punctuation in a single pass, then lowercase.
    stripped = text.translate(str.maketrans('', '', string.punctuation)).lower()
    # Raw string avoids the invalid "\W" escape warning; empty tokens that
    # re.split yields at the edges are dropped (they never matched a synset).
    tokens = [tok for tok in re.split(r"\W+", stripped) if tok]

    # Sets give O(1) membership tests instead of scanning the lists per token.
    base_stopwords = set(stopwords)
    extra_stopwords = set(moreStopwords)

    lemmatized = [wn.lemmatize(tok) for tok in tokens if tok not in base_stopwords]
    kept = [tok for tok in lemmatized if tok not in extra_stopwords]

    # Accessing wordnet synset corpora to find the meaning of the words.
    lemmas = set()
    for token in kept:
        lemmas.update(synset.lemmas()[0].name() for synset in wordnet.synsets(token))
    # Sorted so the output order is reproducible across runs.
    return sorted(lemmas)

def cluster_texts(texts, cluster):
    """Cluster documents with K-Means over their TF-IDF vectors.

    Args:
        texts: Iterable of document strings; each one is tokenized with
            ``clean_text`` by the vectorizer.
        cluster: Number of clusters to form.

    Returns:
        A ``collections.defaultdict(list)`` mapping each cluster label to
        the indices (positions in ``texts``) of the documents assigned to it.
    """
    # K-Means Clustering
    # NOTE(review): the original call was cut off after the tokenizer
    # argument; only the visible argument is kept here. Re-add any
    # min_df / max_df settings deliberately if they are needed.
    vectorizer = TfidfVectorizer(tokenizer=clean_text)

    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=cluster)
    # labels_ is only populated once the model has been fitted.
    km_model.fit(tfidf_model)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering

# Reduce the extracted PDF text to a list of unique lemma names, then ask for
# 3 clusters over that list.
# NOTE(review): cluster_texts passes its input to a vectorizer whose tokenizer
# is clean_text, so each element of filter_data (a single, already-cleaned
# word) is presumably treated as its own document and cleaned a second time.
# This may be related to the "After pruning, no terms remain" error quoted at
# the top of the page — confirm whether a list of documents was intended here.
filter_data = clean_text(pageData)
clusters = cluster_texts(filter_data, 3)

0 个答案:
