# Example corpus: four sentences stored in a single 'text' column.
_sentences = [
    "Anyone who reads Old and Middle English literary texts will be familiar with the mid-brown volumes of the EETS, with the symbol of Alfreds jewel embossed on the front cover",
    "Most of the works attributed to King Alfred or to Aelfric, along with some of those by bishop Wulfstan and much anonymous prose and verse from the pre-Conquest period, are to be found within the Society's three series",
    "all of the surviving medieval drama, most of the Middle English romances, much religious and secular prose and verse including the English works of John Gower, Thomas Hoccleve and most of Caxton's prints all find their place in the publications",
    "Without EETS editions, study of medieval English texts would hardly be possible.",
]
df = pd.DataFrame(_sentences, columns=['text'])

# Four lists of keyword tokens (three tokens each).
tokens = [
    ['middl engl', 'mid-brown', 'symbol'],
    ["king", 'anonym', 'series'],
    ['mediev', 'romance', 'relig'],
    ['hocclev', 'edit', 'publ'],
]
如前所述,这段文本只是我的问题的一个示例。我正在解决一个聚类问题,为此使用了 LDA 和 K-means 算法。为了给我的词元(token)列表找到最匹配的句子,我使用了 K-means 的距离参数。
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import lda
from sklearn.feature_extraction.text import CountVectorizer
import logging
from sklearn.cluster import MiniBatchKMeans
from sklearn import preprocessing
# Example corpus: four sentences plus, for each row, a list of keyword tokens.
_sentences = [
    "Anyone who reads Old and Middle English literary texts will be familiar with the mid-brown volumes of the EETS, with the symbol of Alfreds jewel embossed on the front cover",
    "Most of the works attributed to King Alfred or to Aelfric, along with some of those by bishop Wulfstan and much anonymous prose and verse from the pre-Conquest period, are to be found within the Society's three series",
    "all of the surviving medieval drama, most of the Middle English romances, much religious and secular prose and verse including the English works of John Gower, Thomas Hoccleve and most of Caxton's prints all find their place in the publications",
    "Without EETS editions, study of medieval English texts would hardly be possible.",
]
_token_lists = [
    ['middl engl', 'mid-brown', 'symbol'],
    ["king", 'anonym', 'series'],
    ['mediev', 'romance', 'relig'],
    ['hocclev', 'edit', 'publ'],
]

# The 'tokens' column holds each row's tokens collapsed into one
# comma-separated string, which is the form the vectorizers consume.
df = pd.DataFrame({
    'text': _sentences,
    'tokens': [','.join(row_tokens) for row_tokens in _token_lists],
})
# --- Feature extraction -----------------------------------------------------
# TF-IDF weights (unigrams + bigrams) feed the first k-means run; raw counts
# over the same n-gram range feed LDA.
vectorizer = TfidfVectorizer(min_df=1, max_features=10000, ngram_range=(1, 2))
vz = vectorizer.fit_transform(df['tokens'])
cvectorizer = CountVectorizer(min_df=1, max_features=10000, ngram_range=(1, 2))
cvz = cvectorizer.fit_transform(df['tokens'])

# --- Topic model --------------------------------------------------------------
n_topics = 4
n_iter = 2000
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
# One row per document, one column per topic.
X_topics = lda_model.fit_transform(cvz)

# --- Clustering ---------------------------------------------------------------
num_clusters = 4
# Shared hyper-parameters so both k-means runs are configured identically.
kmeans_params = dict(n_clusters=num_clusters, init='k-means++', n_init=1,
                     init_size=1000, batch_size=1000, verbose=False,
                     max_iter=1000)

# k-means in TF-IDF space.
kmeans_model = MiniBatchKMeans(**kmeans_params)
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)

# k-means in LDA topic space.
# fit() returns the estimator itself, so re-fitting `kmeans_model` here would
# make `kmeans1` an alias of `kmeans` and silently replace the TF-IDF model.
# A separate estimator keeps the two fitted models independent.
X_all = X_topics
kmeans_model1 = MiniBatchKMeans(**kmeans_params)
kmeans1 = kmeans_model1.fit(X_all)
kmeans_clusters1 = kmeans1.predict(X_all)
# transform() yields, per row, the distance to every cluster centre.
kmeans_distances1 = kmeans1.transform(X_all)
# Report the sentence lying closest to the centre of one topic-space cluster.
#
# `d` maps 'Cluster<id>' to a "distance: <value> <sentence>" summary string and
# `l` tracks the smallest distance seen so far for that cluster.
num = 3  # cluster id under inspection; defined before the loop so the
         # summary print below works even when the loop body never runs
d = dict()
l = float('inf')  # start unbounded so distances >= 1 are not ignored
for i, desc in enumerate(df.text):
    # NOTE(review): only the first three rows are examined, as in the original
    # condition; confirm the remaining rows are meant to be skipped.
    if i < 3 and kmeans_clusters1[i] == num:
        dist = kmeans_distances1[i][num]
        if dist < l:
            # Record the sentence only when it is strictly closer, so the
            # stored text always matches the stored distance.
            l = dist
            d['Cluster' + str(num)] = "distance: " + str(l) + " " + desc
        # NOTE(review): nesting of this diagnostic print was reconstructed;
        # confirm it belongs inside the cluster filter.
        print("Cluster " + str(num) + ": " + desc +
              "(distance: " + str(dist) + ")")
# Prints "None" after the cluster id when no examined row fell in the cluster.
print("Cluster " + str(num) + " " + str(d.get('Cluster' + str(num))))