我正在寻找一种有效的方法,以将多个sklearn聚类算法应用于多个数据帧而无需过多重复。
import pandas as pd
import numpy as np
from sklearn.datasets import make_moons,make_blobs
from sklearn.cluster import KMeans, DBSCAN
from matplotlib import pyplot
# Toy dataset 1: two interleaving half-circles with a little Gaussian noise.
X1, y1 = make_moons(noise=0.1, n_samples=100)
# Toy dataset 2: three blobs in two dimensions.
X2, y2 = make_blobs(n_features=2, centers=3, n_samples=100)
我想对这些数据集同时应用kmeans和dbscan,但是每个数据集都需要不同的参数,如何使用循环将多个模型应用于多个数据并最终将它们绘制在网格中?谢谢。
答案 0 :(得分:1)
您只需创建一个dict,用来为每个"数据集|聚类算法"(dataset|clustering_algo)组合定义超参数即可。
以下方法可能对您有用!(改编自 sklearn 聚类文档中的示例)
import pandas as pd
import numpy as np
from sklearn.datasets import make_moons,make_blobs
from sklearn.cluster import KMeans, DBSCAN
from matplotlib import pyplot as plt
# Build two toy 2-D datasets. make_moons/make_blobs return an (X, y) tuple;
# only X (index 0) is clustered below.
noisy_moons = make_moons(n_samples=100, noise=0.1)
blobs = make_blobs(n_samples=100, centers=3, center_box=(-1, 1),
                   cluster_std=0.1)

# Palette indexed by predicted cluster label.
colors = np.array(['#377eb8', '#ff7f00', '#4daf4a',
                   '#f781bf', '#a65628', '#984ea3',
                   '#999999', '#e41a1c', '#dede00'])
# DBSCAN marks noise points with the label -1; draw them in black instead of
# letting -1 silently pick the last palette entry via negative indexing.
noise_color = '#000000'

# defining the clustering algo which we want to try
clustering_models = [KMeans, DBSCAN]

from collections import namedtuple

Model = namedtuple('Model', ['name', 'model'])
# Key each estimator by its class name. Deriving the name from
# ``__module__`` depends on sklearn's private module layout and yields
# truncated names such as '_kmean' on releases that use '_kmeans'/'_dbscan'.
models = [Model(model.__name__, model) for model in clustering_models]

# defn of params for each dataset|clustering_algo:
# one (X, {model_name: constructor_kwargs}) pair per dataset.
# NOTE(review): blobs is generated with centers=3 but KMeans is given
# n_clusters=2 here; value kept from the original -- confirm it is intended.
datasets_w_hyperparams = [
    (noisy_moons[0],
     {models[0].name: {'n_clusters': 2}, models[1].name: {'eps': .3}}),
    (blobs[0],
     {models[0].name: {'n_clusters': 2}, models[1].name: {'eps': .1}}),
]

# One row per dataset, one column per model. squeeze=False keeps ``axes``
# two-dimensional even when there is a single dataset or a single model.
f, axes = plt.subplots(len(datasets_w_hyperparams), len(models),
                       figsize=(15, 10), squeeze=False)
for data_id, (dataset, params) in enumerate(datasets_w_hyperparams):
    for model_id, (name, clus_model) in enumerate(models):
        ax = axes[data_id][model_id]
        pred = clus_model(**params[name]).fit_predict(dataset)
        # Wrap around the palette so more clusters than colours cannot raise
        # IndexError, and override the colour of noise points (label -1).
        point_colors = np.where(pred == -1, noise_color,
                                colors[pred % len(colors)])
        ax.scatter(dataset[:, 0], dataset[:, 1], s=20, color=point_colors)
        ax.set_title(name)
plt.show()