我正在使用LDA在文本中查找主题。
import pandas
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Number of topics the LDA model is asked to find.
n_components = 5
# Number of highest-weighted words shown for each topic.
n_top_words = 10
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    One line is printed per topic, in the form ``Topic <idx>: w1 w2 ...``,
    followed by a single blank line after the last topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()`` (e.g. a fitted
            ``LatentDirichletAllocation``).
        feature_names: Sequence mapping a feature index to its word.
        n_top_words: Number of words to show per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so walk it backwards and keep the first
        # n_top_words indices, i.e. the largest weights first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = " ".join(feature_names[i] for i in top_indices)
        print("Topic %d: %s" % (topic_idx, top_words))
    print()
# Load the documents; column 'a' of text.csv is used as the text to model.
df = pandas.read_csv('text.csv', encoding='utf-8')
text = df['a']
# CountVectorizer rejects missing values and non-string documents, so drop
# empty cells and coerce the rest to str before vectorizing.
data_samples = text.dropna().astype(str).tolist()

# Use tf (raw term count) features for LDA.
tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(data_samples)

lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,  # fixed seed so repeated runs give the same topics
)
lda.fit(tf)

print("\nTopics in LDA model:")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)
我的输出很好:
LDA模型中的主题:
主题0:订单产生不佳,收到的提前退货总是希望
主题1:然后木材之间的颜色间断交接变化到货差
主题2:送货产品可能的包裹存储提前破损的日期很好
主题3:误导性的产品法国模式打破了开年研究协会
主题4:地址传送更改发票发送遗失,请提前付款
但是我希望使用 pandas 将此输出写入一个 csv 文件中,格式如下:
Topic 0 Topic 1 Topic 2 ...
order advance ... ...
not return ... ...
produced always ... ...
well wishes ... ...
received hello ... ...
有可能吗?
答案 0 :(得分:1)
LDA模型中的主题:
主题0:订单产生不佳,收到的提前退货总是希望
主题1:然后木材之间的颜色间断交接变化到货差
主题2:送货产品可能的包裹存储提前破损的日期很好
主题3:误导性的产品法国模式打破了开年研究协会
主题4:地址传送更改发票发送遗失,请提前付款
# Write the DataFrame to filename.csv.
# NOTE(review): in the question's script `df` is the input read from text.csv,
# not the topic table — confirm it holds the topics before writing.
df.to_csv("filename.csv")
答案 1 :(得分:1)
def print_top_words(model, feature_names, n_top_words):
    """Print each topic's top words and return them as rows.

    One line is printed per topic, in the form ``Topic<idx>: w1 w2 ...``,
    followed by a single blank line after the last topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()`` (e.g. a fitted
            ``LatentDirichletAllocation``).
        feature_names: Sequence mapping a feature index to its word.
        n_top_words: Number of words to keep per topic.

    Returns:
        A list with one entry per topic. Each entry is a list whose first
        element is the label ``"Topic<idx>:"`` and whose remaining elements
        are that topic's top words, highest weight first.
    """
    out_list = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so walk it backwards and keep the first
        # n_top_words indices, i.e. the largest weights first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_indices]
        label = "Topic%d:" % topic_idx
        # Build the row directly instead of splitting the printed message,
        # so a feature name containing whitespace stays in a single cell.
        out_list.append([label] + top_words)
        print(label + " " + " ".join(top_words))
    print()
    return out_list
...
# Each row returned by print_top_words is one topic: its label followed by
# its top words.
topic_rows = print_top_words(lda, tf_feature_names, n_top_words)
# This file imports the library as `pandas` (there is no `pd` alias).
# Transpose so every topic becomes a column with its label in the first row.
df_ = pandas.DataFrame(topic_rows).T
# The label row already acts as the header, so skip pandas' numeric header
# and index column.
df_.to_csv('filename.csv', header=False, index=False)