Sklearn BIC准则:用于聚类的不同k的最佳值

时间:2018-01-19 23:35:03

标签: python-3.x cluster-analysis

我想确定KMeans算法和数据集的k(群集数)的最佳值。 我在Sklearn的文档中找到了一个资源:使用BIC标准的高斯混合模型选择。 我在网站上找到了一个我适应我的数据集的代码示例。 但是,此代码的每次运行都会给出k 的最佳值的不同值。为什么? 代码如下:

import numpy as np
import pandas as pd
import itertools

from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn import mixture

print(__doc__)

# Number of samples per component
n_samples = 440


path = 'C:/Users/Lionel/Downloads'
file = 'Wholesale customers data.csv'
data = pd.read_csv(path + '/'+file)
X = np.array(data.iloc[:,2 :])

lowest_bic = np.infty
bic = []
n_components_range = range(1, 12)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components,
                                  covariance_type=cv_type)
    gmm.fit(X)
    bic.append(gmm.bic(X))
    if bic[-1] < lowest_bic:
        lowest_bic = bic[-1]
        best_gmm = gmm

bic = np.array(bic)
color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue',
                          'darkorange'])
clf = best_gmm
print(clf)
bars = []

# Plot the BIC scores
spl = plt.subplot(2, 1, 1)
#spl = plt.plot()
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
  xpos = np.array(n_components_range) + .2 * (i - 2)
  bars.append(plt.bar(xpos, bic[i * len(n_components_range):
                              (i + 1) * len(n_components_range)],
                    width=.2, color=color))
plt.xticks(n_components_range)
plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
plt.title('BIC score per model')
xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\
.2 * np.floor(bic.argmin() / len(n_components_range))
plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
spl.set_xlabel('Number of components')
spl.legend([b[0] for b in bars], cv_types)

# Plot the winner
splot = plt.subplot(2, 1, 2)

Y_ = clf.predict(X)
for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_,
                                       color_iter)):
    v, w = linalg.eigh(cov)
    if not np.any(Y_ == i):
        continue
    plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)

# Plot an ellipse to show the Gaussian component
angle = np.arctan2(w[0][1], w[0][0])
angle = 180. * angle / np.pi  # convert to degrees
v = 2. * np.sqrt(2.) * np.sqrt(v)
ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
ell.set_clip_box(splot.bbox)
ell.set_alpha(.5)
splot.add_artist(ell)

plt.xticks(())
plt.yticks(())
plt.title('Selected GMM: full model, 2 components')
plt.subplots_adjust(hspace=.35, bottom=.02)
plt.show()

这里是我的数据集的链接: https://drive.google.com/open?id=1yMw1rMh12ml6Lh3yrL6WDLbEnLM-SmiN

您对这种行为有解释吗?

0 个答案:

没有答案