How to apply KMeans clustering to PDF data using Python?

Asked: 2018-11-26 22:55:19

Tags: python scikit-learn k-means text-mining unsupervised-learning

Related: How to convert token list into wordnet lemma list using nltk?

I want to display words with similar meanings in a cluster plot. After looking into a few approaches, KMeans seemed like a good place to start. I am using a tf-idf vectorizer to convert my PDF data into vectors, but I end up with this error:

ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.
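
For reference, the error itself is easy to reproduce: when every term's document frequency falls outside the [min_df, max_df] bounds, TfidfVectorizer prunes the entire vocabulary. A minimal sketch with toy one-word documents (not my real corpus):

from sklearn.feature_extraction.text import TfidfVectorizer

# 20 one-word "documents": each term appears in 1/20 = 5% of documents,
# which is below min_df=0.1, so every term is pruned and fit_transform()
# raises "ValueError: After pruning, no terms remain."
toy_docs = ['word%d' % i for i in range(20)]
TfidfVectorizer(max_df=0.9, min_df=0.1).fit_transform(toy_docs)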

Here is my updated source code (intended to work with any PDF data source):

import string
import re
import nltk
import PyPDF4
import plotly as py
import collections
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
from nltk.corpus import wordnet

stopwords = nltk.corpus.stopwords.words('english')
# additional stopwords to be removed manually.
with open('Corpus.txt', 'r') as file:
    moreStopwords = file.read().splitlines()
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

data = PyPDF4.PdfFileReader(open('ReadyPlayerOne.pdf', 'rb'))
pageData = ''
for page in data.pages:
    pageData += page.extractText()


def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokenize = re.split("\W+", text)
    text = [wn.lemmatize(word) for word in tokenize if word not in stopwords]
    final = [word for word in text if word not in moreStopwords]
    # Access the WordNet synset corpus to find related meanings of the words.
    lemmas = []
    for token in final:
        lemmas += [synset.lemmas()[0].name() for synset in wordnet.synsets(token)]
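    # Note: wordnet.synsets(token) returns one synset per word sense, and we
    # keep only the first lemma name of each sense, so a single token can
    # expand into several near-synonym lemma names.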
    return list(set(lemmas))  # returns unique words
    # return list(lemmas)
    # return final


def cluster_texts(texts, cluster):
    # K-Means Clustering
    vectorizer = TfidfVectorizer(tokenizer=clean_text,
                                 max_df=0.9,
                                 min_df=0.1)

    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=cluster)
    km_model.fit(tfidf_model)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering


filter_data = clean_text(pageData)
clusters = cluster_texts(filter_data, 3)
pprint(dict(clusters))
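
For comparison, the same vectorize-then-cluster pipeline runs cleanly when each document contains several words, so the terms have non-trivial document frequencies (toy sentences, hypothetical data):

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

toy_texts = ["the cat sat on the mat",
             "a cat and a dog played",
             "stock prices fell sharply",
             "the stock market rallied"]
vectors = TfidfVectorizer().fit_transform(toy_texts)
model = KMeans(n_clusters=2, random_state=0).fit(vectors)
print(model.labels_)  # e.g. the two animal sentences share one label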

0 Answers:

There are no answers yet.