from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.08, max_features=200,
min_df=0.02, stop_words='english',
use_idf=True, ngram_range=(1,3),tokenizer = tokenize_only_subject, analyzer='word')
tfidf_vectorizer.fit(enron_data["headers.Subject"])
tfidf_matrix_subject = tfidf_vectorizer.fit_transform(enron_data["headers.Subject"])
print "\n\nshape of tfidf :\t",(tfidf_matrix_subject.shape)
terms_subject = tfidf_vectorizer.get_feature_names()
print "\n Feature's selected by machine from tdifd for Subject :\t",terms_subject
x =tfidf_matrix_subject.toarray()
#
#######################################################################################
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance (1 - cosine similarity) between the rows of the
# subject TF-IDF matrix; only the first five rows of the result are printed.
distance = 1 - cosine_similarity(tfidf_matrix_subject)
print "+++distance\t:",distance[:5]
from sklearn.cluster import KMeans
num_clusters = 4
km = KMeans(n_clusters=num_clusters)
print ":",km.fit_transform(tfidf_matrix_subject).shape
centroids = km.cluster_centers_
labels = km.labels_
print "Centroid is:\t",centroids
print "Labels is :\t",labels
n_clusters_ = km.labels_
print "++++++++++++++++++++++++++++++++++++++++++++++\n",n_clusters_
enron_cls = { 'enron_data_body': enron_data["body"],'enron_data_Subject': enron_data["headers.Subject"],'_id_':enron_data["_id"],"Date":enron_data["Date"],'cluster_': n_clusters_}
frame = pd.DataFrame(enron_cls, index = [n_clusters_] , columns = ['_id_','enron_data_body','enron_data_Subject','Date','cluster_'])
print frame.head()
frame.to_csv("errror.csv")
我需要一些关于聚类的指导或帮助。输出结果中出现了重复的值：例如，原始数据集的第四行被重复了很多次，重复的次数与该聚类中的数据条数一样多。我希望对原始数据集的每一行进行聚类，而不是重复输出原始数据集中的同一行。