Y = data
cv = CountVectorizer(max_features=10, stop_words = my_stopwords, ngram_range=(1, 2), max_df=0.98)
cv_X = cv.fit_transform(Y)
word_col = cv.get_feature_names()
word_col[:10]
lda_params = {'n_topics':[1]}
lda = LatentDirichletAllocation()
lda_grid = GridSearchCV(lda, lda_params)
lda_grid.fit(cv_X)
lda_grid.best_estimator_
lda_model = LatentDirichletAllocation(n_topics=5, n_jobs=-1).fit(cv_X)
doctopic = lda_model.fit_transform(cv_X)
def topic_TopWords(model, feature_names, n_top_words):
for topic_idx, topic in enumerate(model.components_):
print ("\n Topic {}: \n".format(topic_idx+1))
top_words = [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
print (' '.join(sorted(top_words,key=len,reverse=True)))
topic_TopWords(lda_model,word_col,15)
mixture = [dict(lda_model[x]) for x in word_col]
pd.DataFrame(mixture).to_csv("output.csv")
我收到以下错误
TypeError:' LatentDirichletAllocation'对象不可订阅