I want to draw this example as a scatter plot:
http://scikit-learn.org/dev/auto_examples/document_clustering.html#example-document-clustering-py
I'm an sklearn and numpy newbie, and I'd like to get the coordinate data for the vectors so that I can plot them.
Edit:
Here is what I have so far:
'''
Created on Apr 4, 2013
@author: v3ss
'''
import os
from time import time

import numpy as np
import pylab as pl

from classify import recursive_load_files
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_extraction.text import TfidfVectorizer
def clustering_from_files(trainer_path="./dataset/dataset/training_data/"):
    load_files = recursive_load_files
    trainer_path = os.path.realpath(trainer_path)
    data_train = load_files(trainer_path, load_content=True, shuffle=False)

    print "Extracting features from the training dataset using a sparse vectorizer"
    t0 = time()
    # TF-IDF gives one dimension per distinct term in the corpus
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.7,
                                 stop_words='english', charset_error="ignore")
    X_train = vectorizer.fit_transform(data_train.data)
    print "done in %fs" % (time() - t0)
    print "Targets:", data_train.target

    # kmeans = KMeans(init='k-means++', n_clusters=5, n_init=1)
    km = MiniBatchKMeans(n_clusters=15, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=1)
    print "Clustering sparse data with %s" % km
    return (km, X_train)
def reduce_dems(X_train):
    # Project the sparse TF-IDF matrix down to two dimensions for plotting
    rpca = RandomizedPCA(n_components=2)
    return rpca.fit_transform(X_train)
def plot(kmeans, reduced_data):
    kmeans.fit(reduced_data)

    # Predict a cluster for every point on a mesh over the plot area,
    # so the decision regions can be drawn as a background image
    h = 0.1
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure(1)
    pl.clf()
    pl.imshow(Z, interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=pl.cm.Paired, aspect='auto', origin='lower')
    pl.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

    # Plot the centroids as red Xs
    centroids = kmeans.cluster_centers_
    pl.scatter(centroids[:, 0], centroids[:, 1],
               marker='x', s=20, linewidths=3,
               color='r', zorder=10)
    pl.title('K-means clustering on selected 20_newsgroup (religion and technology groups)')
    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)
    pl.xticks(())
    pl.yticks(())
    pl.show()
def main():
    k_means, X_train = clustering_from_files()
    reduced = reduce_dems(X_train)
    plot(k_means, reduced)

if __name__ == "__main__":
    main()
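If all you want are the raw 2-D coordinates for your own scatter plot, you can skip the decision-boundary background entirely. A minimal sketch, assuming the clustering_from_files and reduce_dems functions above:

km, X_train = clustering_from_files()
coords = reduce_dems(X_train)   # shape (n_documents, 2): one x, y pair per document
km.fit(coords)                  # cluster labels for colouring the points
for k in set(km.labels_):
    members = coords[km.labels_ == k]
    pl.plot(members[:, 0], members[:, 1], '.', markersize=4)
pl.show()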
Edit:
This works much better now, and I can increase the cluster size.
Answer 0 (score: 4)
The problem is that your clusters are themselves very high-dimensional. For example, if you are not using feature hashing, every distinct word in your corpus gets its own coordinate; for a reasonably large corpus that usually means more coordinates than there are words in a standard dictionary. You can use an embedding technique such as multi-dimensional scaling to get a two-dimensional embedding of the k-means vectors you learned, which you can then plot; a sketch follows.
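A minimal sketch of that route, assuming the X_train TF-IDF matrix and km estimator from the question (sklearn.manifold.MDS needs a dense array and scales quadratically with the number of samples, so densify and subsample a large corpus first):

import pylab as pl
from sklearn.manifold import MDS

km.fit(X_train)                   # cluster in the original high-dimensional space
mds = MDS(n_components=2, random_state=42)
coords = mds.fit_transform(X_train.toarray())  # 2-D embedding of the documents

# Scatter the embedded documents, coloured by their cluster label
pl.scatter(coords[:, 0], coords[:, 1], c=km.labels_, s=10)
pl.xticks(())
pl.yticks(())
pl.show()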