可能重复:
Python k-means algorithm
我想基于其特征向量聚类 10000个索引点,并且在聚类之后得到它们的ID 即cluster1:[p1,p3,p100,...], cluster2:[...] ......
有没有办法在Python中执行此操作? THX〜
P.S. 索引点存储在 10000×10 的矩阵中,其中每行代表一个特征向量。
答案 0:(得分:2)
使用一些聚类算法 - 我在其第二条评论中包含了@Cameron链接到的K-means算法的实现,但您可能想要引用the link in his first comment。我不确定你的ID是什么意思,你能详细说明吗?
from math import sqrt
from random import shuffle
def k_means(data_pts, k=None):
    """Cluster 2-D points with Lloyd's k-means algorithm.

    Parameters:
        data_pts: list of (x, y) tuples, e.g. [(0, 0), (0, 5), (1, 3)].
        k: number of clusters; if None, it is estimated with the
           sqrt(n/2) rule of thumb (see
           http://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set#Rule_of_thumb).

    Returns:
        A list of (x, y) tuples, one centroid per cluster.

    Note: initial centroids are chosen at random from the data, so for
    k > 1 the result can vary between runs (inherent to k-means).
    """
    def distance(a, b):
        # Euclidean distance between (x, y) points a and b.
        return sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2)

    def average(values):
        # Mean of a non-empty one-dimensional list, e.g. [1, 2, 3] -> 2.0.
        return sum(values) / float(len(values))

    if k is None:
        # Rule-of-thumb estimate of the cluster count; clamp to >= 1.
        k = max(1, int(sqrt(len(data_pts) / 2)))

    # Pick up to k random data points as the initial centroids.
    # (The original called shuffle() without importing it — fixed via
    # the `from random import shuffle` at the top of the file.)
    candidates = data_pts[:]
    shuffle(candidates)
    old_clusters = {centroid: [] for centroid in candidates[:k]}

    while True:
        # Fresh, empty member lists for the current centroids.
        assignments = {centroid: [] for centroid in old_clusters}

        # Assignment step: attach every point to its nearest centroid.
        for point in data_pts:
            nearest = min(assignments, key=lambda c: distance(point, c))
            assignments[nearest].append(point)

        # Update step: move each centroid to the mean of its members.
        new_clusters = {}
        for centroid, members in assignments.items():
            if members:
                new_centroid = (average([p[0] for p in members]),
                                average([p[1] for p in members]))
            else:
                # An empty cluster would make average() divide by zero;
                # keep the old centroid instead of crashing.
                new_centroid = centroid
            new_clusters[new_centroid] = members

        # Converged once the set of centroids stops changing.
        # (Set equality replaces the original subset-only list check.)
        if set(old_clusters) == set(new_clusters):
            return list(old_clusters)
        old_clusters = new_clusters