我使用 KMeans 进行聚类,如下所示,但我不知道如何在散点图中绘制我的聚类结果,或者绘制成类似这样的图。我的代码是:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
# Small example corpus: each string is one document to be clustered.
documents = [
    "This little kitty came to play when I was eating at a restaurant.",
    "Merley has the best squooshy kitten belly.",
    "Google Translate app is incredible.",
    "If you open 100 tab in google you get a smileyface.",
    "Best cat photo I've ever taken.",
    "Climbing ninja cat.",
    "Impressed with google map feedback.",
    "Key promoter extension for Google Chrome.",
]

# Turn the documents into a TF-IDF matrix, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Number of clusters to look for.
true_k = 2

# Configure k-means and fit it on the TF-IDF matrix; fit() returns the
# estimator itself, so `model` is the fitted KMeans instance.
model = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=100,
    n_init=1,
).fit(X)
答案 0 :(得分:0)
如果我正确理解了您的问题,我想您要找的可能是类似下面这样的做法。我先把数据转换到聚类距离空间,然后按聚类标签为各个数据点着色并绘制出来。
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
# Small example corpus: each string is one document to be clustered.
documents = [
    "This little kitty came to play when I was eating at a restaurant.",
    "Merley has the best squooshy kitten belly.",
    "Google Translate app is incredible.",
    "If you open 100 tab in google you get a smileyface.",
    "Best cat photo I've ever taken.",
    "Climbing ninja cat.",
    "Impressed with google map feedback.",
    "Key promoter extension for Google Chrome.",
]

# Wrap the list in a one-column DataFrame (column 0 holds the sentences).
# For real data, load the frame with pd.read_csv instead.
df = pd.DataFrame(documents)

# Cast the sentence column to unicode strings before vectorizing.
sentences = df[0].values.astype('U')

# Turn the sentences into a TF-IDF matrix, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)

# Number of clusters to look for.
true_k = 2

model = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=2000,
    n_init=20,
)

# Fit the model and keep the transformed data; it is transposed and
# plotted further down.
Xt = model.fit_transform(X)
# --- Find the highest-scoring TF-IDF term(s) of every cluster -------------
# Densify the sparse TF-IDF matrix so rows can be selected with a boolean
# mask and summed column-wise below.
X = X.toarray()

# Vocabulary terms ordered by column index of X.
# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on older releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    fns = np.asarray(vectorizer.get_feature_names_out())
else:
    fns = np.asarray(vectorizer.get_feature_names())

# Cluster label assigned to each document by the fitted model.
labels = model.labels_

# d collects one legend string per cluster, in ascending label order:
# the term(s) whose TF-IDF score, summed over the cluster's documents,
# is the largest. Ties are joined with commas.
d = []
for n in sorted(np.unique(labels)):
    t = X[labels == n].sum(axis=0)  # cumulative TF-IDF score per term
    words = fns[t == t.max()]
    d.append(",".join(words))
# --- Plot the documents in cluster-distance space -------------------------
# Xt comes from model.fit_transform, i.e. the data expressed in
# cluster-distance space. Transposing gives one row per cluster, so t[0]
# and t[1] serve as the x and y coordinates of every document.
t = Xt.T

### plot the clusters
fig, ax = plt.subplots(1, 1)
cluster_color_dict = {0: 'purple', 1: 'blue'}  # change these to desired colors

# Draw all documents with a single scatter call, coloured by cluster label.
ax.scatter(
    t[0],
    t[1],
    c=[cluster_color_dict[label] for label in labels],
    edgecolors='grey',
    lw=0.5,
    s=200,
)

# Build one empty scatter per cluster purely to obtain a legend handle whose
# label is that cluster's top term(s). Iterating over d keeps the legend in
# step with the clusters that were actually summarised instead of a
# hard-coded count.
p1 = []  # legend patches
for i, top_terms in enumerate(d):
    print(i)  # echo the cluster index being added to the legend
    h = ax.scatter(
        [],
        [],
        c=cluster_color_dict[i],
        edgecolors='grey',
        lw=0.5,
        s=80,
        label=top_terms,
    )
    p1.append(h)

# Anchor the legend just outside the upper-right corner of the axes.
l1 = ax.legend(handles=p1, title='cluster', bbox_to_anchor=(1, 1), loc='upper left')

# Display the figure; without this nothing appears when run as a script.
plt.show()