i有一个点列表,这些点是kmeans算法的惯性值。
为了确定群集的最佳数量,我需要找到该曲线开始变平的点。
数据示例
这是我的值列表的创建和填充方式:
sum_squared_dist = []
K = range(1,50)
for k in K:
km = KMeans(n_clusters=k, random_state=0)
km = km.fit(normalized_modeling_data)
sum_squared_dist.append(km.inertia_)
print(sum_squared_dist)
我如何找到该曲线的螺距增加(曲线在下降,因此一阶导数为负)的点?
我的方法
derivates = []
for i in range(len(sum_squared_dist)):
derivates.append(sum_squared_dist[i] - sum_squared_dist[i-1])
我想使用弯头方法找到任何给定数据的最佳簇数。有人可以帮我找到惯性值列表开始变平的点吗?
修改
数据点:
[7342.1301373073857, 6881.7109460930769, 6531.1657905495022,
6356.2255554679778, 6209.8382535595829, 6094.9052166741121,
5980.0191582610196, 5880.1869867848218, 5779.8957906367368,
5691.1879324562778, 5617.5153566271356, 5532.2613232619951,
5467.352265375117, 5395.4493783888756, 5345.3459908298091,
5290.6769823693812, 5243.5271656371888, 5207.2501206569532,
5164.9617535255456]
答案 0 :(得分:2)
我研究了以a Python package为蓝本的Kneedle algorithm。它找到x=5
作为曲线开始变平的点。文档和本文详细讨论了选择拐点的算法。
y = [7342.1301373073857, 6881.7109460930769, 6531.1657905495022,
6356.2255554679778, 6209.8382535595829, 6094.9052166741121,
5980.0191582610196, 5880.1869867848218, 5779.8957906367368,
5691.1879324562778, 5617.5153566271356, 5532.2613232619951,
5467.352265375117, 5395.4493783888756, 5345.3459908298091,
5290.6769823693812, 5243.5271656371888, 5207.2501206569532,
5164.9617535255456]
x = range(1, len(y)+1)
from kneed import KneeLocator
kn = KneeLocator(x, y, curve='convex', direction='decreasing')
print(kn.knee)
5
import matplotlib.pyplot as plt
plt.xlabel('number of clusters k')
plt.ylabel('Sum of squared distances')
plt.plot(x, y, 'bx-')
plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
答案 1 :(得分:1)
对于所有想要自己执行此操作的人,这里有一个简单的基本实现。
它非常适合我的用例(用于计算的200个簇作为边界),距离的计算非常基础,并且基于2D空间中的点到点,但是可以适应任何其他数量的图形。
我认为Kevin的库在技术上是最新的,并且实施得更好。
import KMeansClusterer
from math import sqrt, fabs
from matplotlib import pyplot as plp
import multiprocessing as mp
import numpy as np
class ClusterCalculator:
m = 0
b = 0
sum_squared_dist = []
derivates = []
distances = []
line_coordinates = []
def __init__(self, calc_border, data):
self.calc_border = calc_border
self.data = data
def calculate_optimum_clusters(self, option_parser):
if(option_parser.multiProcessing):
self.calc_mp()
else:
self.calculate_squared_dist()
self.init_opt_line()
self.calc_distances()
self.calc_line_coordinates()
opt_clusters = self.get_optimum_clusters()
print("Evaluated", opt_clusters, "as optimum number of clusters")
self.plot_results()
return opt_clusters
def calculate_squared_dist(self):
for k in range(1, self.calc_border):
print("Calculating",k, "of", self.calc_border, "\n", (self.calc_border - k), "to go!")
kmeans = KMeansClusterer.KMeansClusterer(k, self.data)
ine = kmeans.calc_custom_params(self.data, k).inertia_
print("inertia in round", k, ": ", ine)
self.sum_squared_dist.append(ine)
def init_opt_line(self):
self. m = (self.sum_squared_dist[0] - self.sum_squared_dist[-1]) / (1 - self.calc_border)
self.b = (1 * self.sum_squared_dist[0] - self.calc_border*self.sum_squared_dist[0]) / (1 - self.calc_border)
def calc_y_value(self, x_calc):
return self.m * x_calc + self.b
def calc_line_coordinates(self):
for i in range(0, len(self.sum_squared_dist)):
self.line_coordinates.append(self.calc_y_value(i))
def calc_distances(self):
for i in range(0, self.calc_border):
y_value = self.calc_y_value(i)
d = sqrt(fabs(self.sum_squared_dist[i] - self.calc_y_value(i)))
length_list = len(self.sum_squared_dist)
self.distances.append(sqrt(fabs(self.sum_squared_dist[i] - self.calc_y_value(i))))
print("For border", self.calc_border, ", calculated the following distances: \n", self.distances)
def get_optimum_clusters(self):
return self.distances.index((max(self.distances)))
def plot_results(self):
plp.plot(range(0, self.calc_border), self.sum_squared_dist, "bx-")
plp.plot(range(0, self.calc_border), self.line_coordinates, "bx-")
plp.xlabel("Number of clusters")
plp.ylabel("Sum of squared distances")
plp.show()
def calculate_squared_dist_sliced_data(self,output, proc_numb, start, end):
temp = []
for k in range(start, end + 1):
kmeans = KMeansClusterer.KMeansClusterer(k, self.data)
ine = kmeans.calc_custom_params(self.data, k).inertia_
print("Process", proc_numb,"had the CPU,", "calculated", ine, "in round", k)
temp.append(ine)
output.put((proc_numb, temp))
def sort_result_queue(self, result):
result.sort()
result = [r[1] for r in result]
flat_list= [item for sl in result for item in sl]
return flat_list
def calc_mp(self):
output = mp.Queue()
processes = []
processes.append(mp.Process(target=self.calculate_squared_dist_sliced_data, args=(output, 1, 1, 50)))
processes.append(mp.Process(target=self.calculate_squared_dist_sliced_data, args=(output, 2, 51, 100)))
processes.append(mp.Process(target=self.calculate_squared_dist_sliced_data, args=(output, 3, 101, 150)))
processes.append(mp.Process(target=self.calculate_squared_dist_sliced_data, args=(output, 4, 151, 200)))
for p in processes:
p.start()
#lock code and wait for all processes to finsish
for p in processes:
p.join()
results = [output.get() for p in processes]
self.sum_squared_dist = self.sort_result_queue(results)