我想在层次聚类中自动选择阈值:不再手动输入阈值,而是由代码检查得到的簇数是否落在 30 到 50 的范围内;如果不在该范围内,就在 Python 中由代码自动把阈值调整 0.1 或 0.2 后重试。请问该如何实现?
import pickle
import re
import string
import sys
# import gensim
# from gensim import corpora
from time import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_word_complaints import complaint_stop_words
# Pipeline: TF-IDF vectorization -> pairwise cosine distances -> average-linkage
# hierarchical clustering -> dendrogram plot -> automatic threshold search.
# NOTE(review): n_features, stop, corpus, t0, str_path and get_clusters are not
# defined in this part of the script; presumably they are set up elsewhere -- confirm.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=1, token_pattern=r'\b\w+\b',
                                   max_features=n_features, stop_words=list(stop), analyzer='word')
X = tfidf_vectorizer.fit_transform(corpus).toarray()
# Indices of the rows of X whose entries do not sum to zero; only those rows
# are passed to pdist (cosine distance is undefined for an all-zero vector).
non_zero_features = np.where(np.sum(X, axis=1) != 0)[0]
print("done in %0.3fs." % (time() - t0))
print("pdist ...")
t0 = time()
cos_dist = pdist(X[non_zero_features, :], 'cosine')
print("done in %0.3fs." % (time() - t0))
# Replace any remaining NaN distance with 1 directly in the condensed vector,
# so the square matrix saved below and the linkage input stay in agreement.
cos_dist[np.isnan(cos_dist)] = 1
dists = squareform(cos_dist)
print("linkage ...")
np.savetxt(str_path + "_dist_1.csv", dists, delimiter=',')
t0 = time()
# linkage() must receive the *condensed* distance vector. A square matrix is
# interpreted as raw observation vectors, and Euclidean distances between its
# rows would be clustered instead of the cosine distances computed above.
linkage_matrix = linkage(cos_dist, "average")
print("done in %0.3fs." % (time() - t0))
np.savetxt(str_path + "linkage_matrix.csv", linkage_matrix, delimiter=',')
# Draw the dendrogram once, on an explicitly created figure/axes pair.
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('sample index')
ax.set_ylabel('distance')
dendrogram(linkage_matrix, ax=ax)
# Save before show(): some backends release the figure once its window closes.
fig.savefig(str_path + 'Agglo_Heirachy_dendo.png')
plt.show()
# Column 2 of the linkage matrix holds the merge distances; their extremes
# bound the threshold search performed by get_clusters.
merge_distances = linkage_matrix[:, 2]
min_th = merge_distances.min()
max_th = merge_distances.max()
clusters = get_clusters(linkage_matrix, min_th, max_th)
答案 0 :(得分:0)
我最终找到了解决方案:定义一个新函数,对阈值进行二分搜索,直到得到的簇数落在所需范围(30 到 50)内,并返回对应的聚类结果。
def get_clusters(linkage_matrix, min_th, max_th, min_clusters=30, max_clusters=50, max_iter=100):
    """Bisect the distance threshold until the cluster count is in range.

    The threshold is repeatedly set to the midpoint of ``[min_th, max_th]``
    and the flat clustering is recomputed with ``fcluster(..., 'distance')``.
    Too many clusters raises the lower bound; too few lowers the upper bound.

    Args:
        linkage_matrix: Linkage matrix as produced by
            ``scipy.cluster.hierarchy.linkage``.
        min_th: Lower bound of the threshold search interval.
        max_th: Upper bound of the threshold search interval.
        min_clusters: Smallest acceptable number of clusters (inclusive).
        max_clusters: Largest acceptable number of clusters (inclusive).
        max_iter: Maximum number of bisection steps before giving up.

    Returns:
        The flat cluster labels returned by ``fcluster`` for the first
        threshold whose cluster count lies in
        ``[min_clusters, max_clusters]``.

    Raises:
        ValueError: If no such threshold is found, either because
            ``max_iter`` steps were used up or because the interval can no
            longer be split at floating-point precision.
    """
    lo, hi = min_th, max_th
    for _ in range(max_iter):
        print("----------------\n")
        th = lo + (hi - lo) / 2
        clusters = sch.fcluster(linkage_matrix, th, 'distance')
        # Labels start at 1, so the largest label is the number of clusters.
        n_found = int(clusters.max())
        print("Clusters found: %d" % n_found)
        if min_clusters <= n_found <= max_clusters:
            return clusters
        if n_found > max_clusters:
            # Too many clusters: a larger threshold merges more of them.
            lo = th
        else:
            # Too few clusters: a smaller threshold keeps more of them apart.
            hi = th
        if lo + (hi - lo) / 2 == th:
            # The midpoint no longer moves, so further steps cannot help.
            break
    raise ValueError(
        "No threshold between %r and %r yields %d to %d clusters"
        % (min_th, max_th, min_clusters, max_clusters)
    )