我试图在多个数据集上绘制多个聚类算法的结果
代码如下
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from time import time
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
np.random.seed(0)
cs_arry=[[0,1,0,1],[1,0,0,1],[0,0,1,1],[1,1,0,1],[1,0,1,1],[0,1,1,1]]
cs_arry=np.array(cs_arry * 20)
colors = np.array([x for x in cs_arry])
#################
n_pts = 1500#####
#################
noisy_circles = datasets.make_circles(n_samples=n_pts, factor=.5, noise=.05)
no_structure = np.random.rand(n_pts, 2), None
my_circles=make_circ(n_samples=n_pts, factor=.35, noise=.025, in_fact=2)
##############################################
blob_centers = [(-10, -10), (-3, 10), (10, 10)] # adjust blob centers for blob data making
##############################################
blobs=datasets.make_blobs(n_samples=n_pts, n_features=3, centers=blob_centers, cluster_std=3,
center_box=(-20,20), shuffle=True, random_state=0)
clustering_names=['Input','K-means','K-medoids','AP','Spectral','Gaussian','Hierarchical']
nofclusters=3
plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=0,
hspace=0)
data_sets = [noisy_circles, my_circles, blobs, no_structure]
plot_num = 1
for i_dataset, dataset in enumerate(data_sets):
x, y = dataset
x = StandardScaler().fit_transform(x)
bandwidth = cluster.estimate_bandwidth(x, quantile=0.3)
connectivity = kneighbors_graph(x, n_neighbors=10, include_self=False)
connectivity = 0.5 * (connectivity + connectivity.T)
#clustering algorithms
orig_data=cluster.KMeans(n_clusters=1)
kmeans=cluster.KMeans(n_clusters=nofclusters,init='k-means++')
Kmedoids=cluster.KMeans(n_clusters=nofclusters,init='random')
heir = cluster.AgglomerativeClustering(n_clusters=nofclusters, linkage='ward',
connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=2,
eigen_solver='arpack',
affinity="nearest_neighbors")
affinity_propagation = cluster.AffinityPropagation(damping=.9,
preference=-200)
gmm=mixture.gmm(n_components=nofclusters, covariance_type='diag', random_state=None,
thresh=None, tol=0.001, min_covar=0.001, n_iter=100, n_init=1, params='wmc',
init_params='wmc', verbose=0)
#####################################
#####################################
clustering_algo=[orig_data, kmeans, kmedoids, ap, spectral, gmm, heir]
#####################################
#####################################
for name, algorithm in zip(clustering_names, clustering_algo):
t0 = time.time()
algorithm.fit(x)
t1 = time.time()
if hasattr(algorithm, 'labels_'):
y_pred = algorithm.labels_.astype(np.int)
else:
y_pred = algorithm.predict(x)
# plot
plt.subplot(len(datasets), len(clustering_algo), plot_num)
if i_dataset == 0:
plt.title(name, size=18)
if name=='Input':
plt.scatter(x[:, 0], x[:, 1], color='k', s=10, marker='o', facecolors='none')
else: plt.scatter(x[:, 0], x[:, 1], color=colors[y_pred], s=10, marker='o', facecolors='none')
plt.xlim(-2, 2)
plt.ylim(-2, 2)
plt.xticks(())
plt.yticks(())
if name!='Input':
plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')
plot_num += 1
plt.show()
我想知道是否有办法为不同的数据集设置不同数量的聚类。具体来说,我希望圆形数据有2个簇,而其余的组有3个。
我尝试插入
if data_sets==noisy_circles or my_circles:
nofclusters=2
或
while data_sets==noisy_circles or my_circles:
nofclusters=2
在for循环语句之前和之后没有结果。
感谢任何帮助。
谢谢。
答案 0 :(得分:0)
使用字典,这样您就可以轻松配置每个数据集的簇数。您还需要选择许多其他参数,例如DBSCAN中的epsilon和minpts。
显然你需要在里面 for循环,并确保在每次迭代时设置值。否则,下一次迭代可能会得到之前的值,结果会得到错误。