Question

我正在对大量数据进行聚类，这些数据包含两种不同类型的簇。

第一种类型是6维簇，而第二种类型是12维簇。现在我决定使用kmeans（因为它似乎是最直观的聚类算法）。

问题是如何在2d图上映射这些聚类，以便我可以推断kmeans是否正常工作。我想使用matplotlib，但任何其他python包都可以。

集群1是由这些数据类型组成的集群（int，float，float，int，float，int）

群集2是由12种浮点类型组成的群集。

我希望得到与下图类似的输出（原帖此处为一张示例图片）。任何提示都会有帮助。

Answer 1

在网上搜索并看到许多奇怪的评论和解决方案之后，我终于弄清楚了该怎么做。如果您也想做类似的事情，请参阅以下代码。它包含来自各种来源的代码，其中很多是由我编写或修改的。我希望它比其他方案更容易理解。

该函数基于来自scipy的kmeans2，它返回centroid_list和label_list。 kmeansdata是传递给kmeans2进行聚类的numpy数组，num_clusters表示传递给kmeans2的聚类数。

代码会把结果写入一个新的png文件，并确保不会覆盖已有的文件。另外，它只绘制50个簇（如果你有1000个簇，就不要尝试输出所有的簇）。

（它是为python2.7编写的，我猜也适用于其他版本。）

import numpy
import colorsys
import random
import os
from matplotlib.mlab import PCA as mlabPCA
from matplotlib import pyplot as plt


def get_colors(num_colors):
    """
    Return a list of ``num_colors`` randomly chosen RGB colors.

    A palette of 256 colors is built first: hues are evenly spaced over
    the color wheel while lightness (0.50-0.60) and saturation
    (0.90-1.00) get a small random jitter. The requested number of
    colors is then drawn from that palette with replacement, so the
    same color may appear more than once in the result.

    num_colors        -> Number of colors to return
    returns           -> List of (r, g, b) tuples as produced by
                         colorsys.hls_to_rgb
    """
    palette_size = 256
    step = 360. / palette_size

    # Arguments are evaluated left to right, so lightness is drawn
    # before saturation for every hue.
    palette = [
        colorsys.hls_to_rgb(
            angle / 360.,
            (50 + numpy.random.rand() * 10) / 100.,
            (90 + numpy.random.rand() * 10) / 100.,
        )
        for angle in numpy.arange(0., 360., step)
    ]

    # Draw num_colors palette entries at random (duplicates allowed).
    last_index = len(palette) - 1
    return [palette[random.randint(0, last_index)] for _ in range(num_colors)]


def random_centroid_selector(total_clusters , clusters_plotted):
    """
    Pick the cluster indices whose centroids will be drawn.

    Indices are drawn independently and uniformly from
    0 .. total_clusters - 1, so the result may contain duplicates.

    total_clusters        -> Total number of clusters
    clusters_plotted      -> Number of indices to draw
    returns               -> List of clusters_plotted cluster indices
    """
    highest_index = total_clusters - 1
    return [random.randint(0, highest_index) for _ in range(clusters_plotted)]

def plot_cluster(kmeansdata, centroid_list, label_list , num_cluster):
    """
    Project n-dimensional k-means results onto PCA axes and save a
    scatter plot of up to 50 randomly selected clusters as a PNG.

    kmeansdata    -> numpy array that was clustered (one row per sample)
    centroid_list -> cluster centroids, indexed by cluster label
    label_list    -> cluster label of every row in kmeansdata
    num_cluster   -> total number of clusters
    name%d.png    -> file where the output is stored, indexed by the
                     first index that does not exist yet in the
                     current directory, e.g. name0.png , name1.png ...

    Returns None; the only result is the PNG written to disk.

    NOTE(review): matplotlib.mlab.PCA was deprecated in matplotlib 2.2
    and removed in 3.1, so this function needs an older matplotlib or
    a replacement PCA implementation.
    """
    mlab_pca = mlabPCA(kmeansdata)
    # Using the second variance fraction as the cutoff is intended to
    # keep the two leading principal components for the 2-D plot.
    cutoff = mlab_pca.fracs[1]
    users_2d = mlab_pca.project(kmeansdata, minfrac=cutoff)
    centroids_2d = mlab_pca.project(centroid_list, minfrac=cutoff)

    colors = get_colors(num_cluster)

    # Plotting 50 clusters only for now. The selector may return the
    # same index several times, so collapse it into a set for cheap
    # membership and iterate in a stable order.
    selected = sorted(set(random_centroid_selector(num_cluster , 50)))
    labels = numpy.asarray(label_list)

    fig = plt.figure()
    try:
        plt.xlim([users_2d[:, 0].min() - 3, users_2d[:, 0].max() + 3])
        plt.ylim([users_2d[:, 1].min() - 3, users_2d[:, 1].max() + 3])

        # Centroids of the selected clusters: large 'o' markers.
        # `color=` is used instead of `c=` because a bare RGB tuple
        # passed through `c` is ambiguous for scatter().
        for cluster in selected:
            if cluster < len(centroids_2d):
                plt.scatter(centroids_2d[cluster, 0], centroids_2d[cluster, 1],
                            marker='o', color=colors[cluster], s=100)

        # Members of the selected clusters: small '+' markers.
        # One scatter call per cluster instead of one per data point.
        for cluster in selected:
            members = labels == cluster
            if members.any():
                plt.scatter(users_2d[members, 0], users_2d[members, 1],
                            marker='+', color=colors[cluster])

        # Find the first unused index so existing files are not overwritten.
        filename = "name"
        index = 0
        while os.path.isfile(filename + str(index) + ".png"):
            index += 1
        fig.savefig(filename + str(index) + ".png")
    finally:
        # Release the figure; pyplot keeps every figure alive until closed.
        plt.close(fig)
    return

Answer 2

   plot_cluster(X[:], kmean.cluster_centers_, kmean.labels_, clusters)

将多维集群绘制到2D绘图python

2 个答案: