我的产品销售时间序列超过5万个。初步的探索性数据分析表明，各种时间序列(产品)具有相似的(销售)模式。因此，我们正在尝试对这些时间序列进行聚类。我们如何使用诸如轮廓分数之类的统计数据来获得最佳聚类数？
We are using DTW/LB_Keogh to calculate the cluster centers.
Data: Array[[0., 0., 1., ..., 2., 0., 2.],
.........,
[ 0., 0., 0., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 0., 2., 0.]])
#Function1: DTW distance code to calculate the dtw distance between two series
def DTWDistance(s1, s2, w):
    """Return the windowed dynamic-time-warping distance between two series.

    The accumulated cost is the sum of squared point differences along the
    cheapest warping path; the square root of that sum is returned.

    Args:
        s1: First series (any indexable sequence of numbers).
        s2: Second series (any indexable sequence of numbers).
        w: Half-width of the warping band. Point ``i`` of ``s1`` may be
            matched with points ``i - w`` .. ``i + w`` (inclusive) of ``s2``.
            It is widened to ``abs(len(s1) - len(s2))`` when smaller.

    Returns:
        The distance as a float; ``inf`` when no warping path exists
        (for example when exactly one of the series is empty).
    """
    n, m = len(s1), len(s2)
    # The band has to be at least as wide as the length difference,
    # otherwise the final cell (n - 1, m - 1) lies outside of it.
    w = max(w, abs(n - m))

    inf = float('inf')
    # Sparse cost table: any cell that was never written counts as infinity.
    cost = {(-1, -1): 0}
    for i in range(n):
        # "+ 1" makes the upper end of the band inclusive, so the window is
        # symmetric around i and w == 0 still visits the diagonal.
        for j in range(max(0, i - w), min(m, i + w + 1)):
            dist = (s1[i] - s2[j]) ** 2
            cost[(i, j)] = dist + min(
                cost.get((i - 1, j), inf),
                cost.get((i, j - 1), inf),
                cost.get((i - 1, j - 1), inf),
            )
    return sqrt(cost.get((n - 1, m - 1), inf))
##Function2: LB Keogh code
def LB_Keogh(s1, s2, r):
    """Return the LB_Keogh bound of ``s1`` against the envelope of ``s2``.

    For every point of ``s1`` the minimum and maximum of ``s2`` inside the
    window ``ind - r`` .. ``ind + r`` (inclusive, clipped to the series) are
    taken. Points of ``s1`` outside that [min, max] range contribute their
    squared distance to the nearer edge; points inside contribute nothing.

    Args:
        s1: Query series (iterable of numbers).
        s2: Series the envelope is built from (sliceable sequence).
        r: Reach of the envelope window on each side of the current index.

    Returns:
        Square root of the accumulated squared excess, as a float.

    Raises:
        ValueError: If the window over ``s2`` is empty for some index of
            ``s1`` (``s1`` extends more than ``r`` points past ``s2``).
    """
    LB_sum = 0
    for ind, value in enumerate(s1):
        # "+ 1" because slice ends are exclusive: without it the window
        # stops at ind + r - 1 and is empty when r == 0.
        window = s2[max(0, ind - r):ind + r + 1]
        lower_bound = min(window)
        upper_bound = max(window)

        if value > upper_bound:
            LB_sum += (value - upper_bound) ** 2
        elif value < lower_bound:
            LB_sum += (value - lower_bound) ** 2
    return sqrt(LB_sum)
#Function3: Kmeans clustering code
import random
from math import sqrt
def _nearest_centroid(series, centroids, w):
    """Return the index of the centroid with the smallest DTW distance to *series*."""
    best_dist = float('inf')
    best_idx = None
    for c_ind, centroid in enumerate(centroids):
        # LB_Keogh is used as a cheap pre-filter: the full DTW distance is
        # only computed when the bound is below the best distance so far.
        # The same reach w is used for both so the two stay comparable.
        if LB_Keogh(series, centroid, w) < best_dist:
            cur_dist = DTWDistance(series, centroid, w)
            if cur_dist < best_dist:
                best_dist = cur_dist
                best_idx = c_ind
    return best_idx


def k_means_clust(data, num_clust, num_iter, w=5, *, return_labels=False):
    """Cluster time series with k-means using DTW as the distance.

    Initial centroids are ``num_clust`` randomly chosen rows of ``data``.
    Each iteration assigns every series to its nearest centroid and then
    replaces each centroid by the point-wise mean of its members. A centroid
    that receives no members keeps its previous value. The 1-based iteration
    number is printed at the start of every iteration.

    Args:
        data: Sequence of equal-length numeric series (a list of lists or a
            2-D array; it only needs ``len()`` and integer indexing).
        num_clust: Number of clusters / centroids.
        num_iter: Number of assign-and-update iterations to run.
        w: Window size passed to both ``DTWDistance`` and ``LB_Keogh``.
        return_labels: When true, also return one cluster index per series,
            computed against the final centroids.

    Returns:
        The list of centroids, or ``(centroids, labels)`` when
        ``return_labels`` is true. ``labels[i]`` is the index into
        ``centroids`` for ``data[i]`` (``None`` if no centroid had a finite
        distance to that series).
    """
    # Sample row indices instead of rows: random.sample only accepts
    # sequences as its population, which excludes 2-D NumPy arrays.
    centroids = [data[idx] for idx in random.sample(range(len(data)), num_clust)]

    for iteration in range(1, num_iter + 1):
        print(iteration)

        # assign data points to clusters
        assignments = {}
        for ind, series in enumerate(data):
            closest = _nearest_centroid(series, centroids, w)
            if closest is not None:
                # setdefault keeps the first member too; starting a new
                # cluster with an empty list would silently drop it.
                assignments.setdefault(closest, []).append(ind)

        # recalculate centroids of clusters as the point-wise member mean
        for key, members in assignments.items():
            centroids[key] = [
                sum(values) / len(members)
                for values in zip(*(data[k] for k in members))
            ]

    if return_labels:
        # Re-assign once more so the labels match the returned centroids
        # rather than the centroids from before the last update.
        labels = [_nearest_centroid(series, centroids, w) for series in data]
        return centroids, labels
    return centroids
# Running Kmeans -cluster algorithm
import matplotlib.pyplot as plt
# Cluster the series into 4 groups (5 iterations, window 5), then draw every
# resulting centroid on the same axes and display the figure.
centroids = k_means_clust(data, num_clust=4, num_iter=5, w=5)
for centre in centroids:
    plt.plot(centre)
plt.show()
I need to use the cluster centers above to get a cluster label for each series in the input data.
I want to understand whether the centroids can be used to assign cluster labels to the input file, so that each time series gets its own cluster label.
How can we do this (Python 3 code)?