在下面的代码中,作者说 -
"在开始kmeans聚类之前,我想使用层次聚类来计算我应该拥有多少个聚类。我截断了树形图,因为如果我没有,那么树形图很难读。我减少了20,因为它有第二大的距离跳跃(第一次大跳跃是60)。切割后有7个簇。"
我无法在树形图中看到他是如何得出他提到的数字--20,60或7 我正在附上我从他的github示例中获取的样本数据中得到的树状图,我想知道是否有人可以说明他是如何得出20,60或7的数字
他还说"让我们在矩阵上使用一系列簇1-19来拟合k-means。"他从哪里获得1到19的范围?它是否导致20下降(或截止时为20)github - https://github.com/moyphilip/SKU-Clustering
还有人会说这附加的第二张图片中的簇数是多少? 6个集群? (它是一个不同的数据集)
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pandas as pd
import re
import numpy as np
df = pd.read_csv('sample-data.csv')
def split_description(string):
string_split = string.split(' - ',1)
name = string_split[0]
return name
df_new = pd.DataFrame()
df_new['name'] = df.loc[:,'description'].apply(lambda x: split_description(x))
df_new['id'] = df['id']
def remove(name):
new_name = re.sub("[0-9]", '', name)
new_name = ' '.join(new_name.split())
return new_name
df_new['name'] = df_new.loc[:,'name'].apply(lambda x: remove(x))
df_new.head()
tfidf_vectorizer = TfidfVectorizer(
use_idf=True,
stop_words = 'english',
ngram_range=(1,4), min_df = 0.01, max_df = 0.8)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_new['name'])
print (tfidf_matrix.shape)
print (tfidf_vectorizer.get_feature_names())
from sklearn.metrics.pairwise import cosine_similarity
dist = 1.0 - cosine_similarity(tfidf_matrix)
print (dist)
from scipy.cluster.hierarchy import ward, dendrogram
#run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances
fig, ax = plt.subplots(figsize=(15, 20)) # set size
ax = dendrogram(linkage_matrix,
truncate_mode='lastp', # show only the last p merged clusters
p=20, # show only the last p merged clusters
leaf_rotation=90.,
leaf_font_size=12.,
labels=list(df_new['name']))
plt.axhline(y=20, linewidth = 2, color = 'black')
fig.suptitle("Hierarchial Clustering Dendrogram Truncated", fontsize = 35, fontweight = 'bold')
#fig.show()
from sklearn.cluster import KMeans
num_clusters = range(1,20)
KM = [KMeans(n_clusters=k, random_state = 1).fit(tfidf_matrix) for k in num_clusters]
# Let's plot the within cluster sum of squares for each k to see which k I should choose.
#
# The plot shows a steady decline from from 0 to 19. Since the elbow rule does not apply for this I will choose k = 7 because of the previous dendrogram.
# In[17]:
import matplotlib.pyplot as plt
#get_ipython().run_line_magic('matplotlib', 'inline')
with_in_cluster = [KM[k].inertia_ for k in range(0,len(num_clusters))]
plt.plot(num_clusters, with_in_cluster)
plt.ylim(min(with_in_cluster)-1000, max(with_in_cluster)+1000)
plt.ylabel('with-in cluster sum of squares')
plt.xlabel('# of clusters')
plt.title('kmeans within ss for k value')
plt.show()
# I add the cluster label to each record in df_new
# In[18]:
model = KM[6]
clusters = model.labels_.tolist()
df_new['cluster'] = clusters
# Here is the distribution of clusters. Cluster 0 has a records, then cluster 1. Cluster 2 - 4 seem pretty even.
# In[19]:
df_new['cluster'].value_counts()
# I print the top terms per cluster and the names in the respective cluster.
# In[20]:
print("Top terms per cluster:")
print
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names()
for i in range(model.n_clusters):
print ("Cluster %d : " %i )
for ind in order_centroids[i, :10]:
print ( '%s' % terms[ind])
print
print ("Cluster %d names:" %i)
for idx in df_new[df_new['cluster'] == i]['name'].sample(n = 10):
print ( ' %s' %idx)
print
print
# I reduce the dist to 2 dimensions with MDS. The dissimilarity is precomputed because we provide 1 - cosine similarity. Then I assign the x and y variables.
# In[21]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)
xs, ys = pos[:, 0], pos[:, 1]
# In[22]:
cluster_colors = {0: '#85C1E9', 1: '#FF0000', 2: '#800000', 3: '#04B320',
4: '#6033FF', 5: '#33FF49', 6: '#F9E79F', 7: '#935116',
8: '#9B59B6', 9: '#95A5A6'}
cluster_labels = {0: 'vest dress print', 1: 'shirt merino island',
2: 'pants guide pants guide', 3: 'shorts board board shorts',
4: 'simply live live simply', 5: 'cap cap bottoms bottoms',
6: 'jkt zip jkt guide'}
#some ipython magic to show the matplotlib plots inline
#get_ipython().run_line_magic('matplotlib', 'inline')
#create data frame that has the result of the MDS plus the cluster numbers and titles
df_plot = pd.DataFrame(dict(x=xs, y=ys, label=clusters, name=df_new['name']))
#group by cluster
groups = df_plot.groupby('label')
# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
for name, group in groups:
ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
label = cluster_labels[name],
color = cluster_colors[name])
ax.set_aspect('auto')
ax.legend(numpoints = 1)
fig.suptitle("SKU Clustering", fontsize = 35, fontweight = 'bold')
#plt.show()