以下是对 lda 示例文本数据进行主题建模的代码。有人能帮我对 CSV 文件做同样的处理吗？
from __future__ import division, print_function
import numpy as np
import lda
import csv
import os
import lda.datasets
# Location of the CSV corpus. A raw string keeps the Windows backslashes
# literal; as a normal string, Python 3 rejects "\u" as a malformed unicode
# escape and the file does not even compile.
CSV_PATH = r'c:\users\kaila\jobdescription.csv'


def _tokenize(text):
    """Return the lowercase alphabetic words contained in *text*."""
    # Every non-letter becomes a separator, so punctuation and digits never
    # leak into the vocabulary.
    return "".join(ch if ch.isalpha() else " " for ch in text.lower()).split()


def load_csv_corpus(path):
    """Build the inputs for ``lda.LDA`` from a CSV file.

    Each CSV row is treated as one document; all cells of the row are joined
    and tokenized together. Rows that contain no alphabetic word are skipped.
    NOTE(review): if the file has a header row it is counted as a document
    too -- confirm against the real file and skip it if necessary.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A tuple ``(X, vocab, titles)``:
        X is a dense integer array of shape (n_documents, n_words) where
        ``X[i, j]`` counts occurrences of ``vocab[j]`` in document ``i``;
        vocab is a tuple of the distinct words in sorted order;
        titles is a tuple holding the raw text of every document.

    Raises:
        ValueError: If the file contains no usable text.
    """
    titles = []
    token_lists = []
    with open(path, "r") as csvfile:
        # The default (comma, double-quote) dialect is used: the cells are
        # re-joined below, so the only job of the reader is to keep quoted
        # multi-line fields together as a single document.
        for row in csv.reader(csvfile):
            text = " ".join(row).strip()
            tokens = _tokenize(text)
            if tokens:
                titles.append(text)
                token_lists.append(tokens)

    if not token_lists:
        raise ValueError("no text found in {}".format(path))

    vocab = tuple(sorted(set(tok for tokens in token_lists for tok in tokens)))
    column_of = {word: j for j, word in enumerate(vocab)}

    # Dense count matrix; fine for moderately sized corpora.
    X = np.zeros((len(token_lists), len(vocab)), dtype=np.int32)
    for i, tokens in enumerate(token_lists):
        for tok in tokens:
            X[i, column_of[tok]] += 1
    return X, vocab, tuple(titles)


# Build the document-term matrix from the CSV itself instead of the bundled
# sample data set, so the model below is trained on the CSV contents.
X, vocab, titles = load_csv_corpus(CSV_PATH)
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}\n".format(len(vocab)))
print("type(titles): {}".format(type(titles)))
print("len(titles): {}\n".format(len(titles)))

# Spot-check a single cell of the matrix. The word index is clamped because
# the vocabulary size now depends on the CSV contents.
doc_id = 0
word_id = min(3117, len(vocab) - 1)
print("doc id: {} word id: {}".format(doc_id, word_id))
print("-- count: {}".format(X[doc_id, word_id]))
print("-- word: {}".format(vocab[word_id]))
print("-- doc: {}".format(titles[doc_id]))
# Fit the topic model on the document-term matrix X.
model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(X)

# topic_word has one row per topic and one column per vocabulary word.
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))

# Print the row sums of the first five topics as a sanity check.
for n in range(5):
    sum_pr = topic_word[n, :].sum()
    print("topic: {} sum: {}".format(n, sum_pr))

# Show the n highest-weighted words of every topic.
n = 5
vocab_array = np.array(vocab)  # built once instead of once per topic
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so walk the last n entries backwards.
    topic_words = vocab_array[np.argsort(topic_dist)][:-(n + 1):-1]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))
# doc_topic has one row per document and one column per topic.
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

# Never index past the last document: small corpora may have fewer rows
# than the fixed sample sizes used below.
n_docs = min(len(titles), doc_topic.shape[0])

# Print the row sums of the first few documents as a sanity check.
for n in range(min(5, n_docs)):
    sum_pr = doc_topic[n, :].sum()
    print("document: {} sum: {}".format(n, sum_pr))

# Report the highest-weighted topic for the first few documents, together
# with the first 50 characters of each document's title.
for n in range(min(10, n_docs)):
    topic_most_pr = doc_topic[n].argmax()
    print("doc: {} topic: {}\n{}...".format(n, topic_most_pr, titles[n][:50]))
import matplotlib.pyplot as plt

# The ggplot style is purely cosmetic: fall back to the default look when
# this matplotlib has no style support or cannot find the style, but do not
# swallow unrelated errors (a bare except would also hide KeyboardInterrupt).
try:
    plt.style.use('ggplot')
except (AttributeError, IOError, OSError):
    pass

# Stem plots of the word weights for five selected topics, one per subplot.
n_words = topic_word.shape[1]
f, ax = plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([0, 5, 9, 14, 19]):
    ax[i].stem(topic_word[k, :], linefmt='b-', markerfmt='bo', basefmt='w-')
    # Derive the x-range from the actual vocabulary size, with a small
    # margin on both sides, instead of a constant tied to one data set.
    ax[i].set_xlim(-50, n_words + 50)
    ax[i].set_ylim(0, 0.08)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("topic {}".format(k))
ax[4].set_xlabel("word")
plt.tight_layout()
plt.show()
# Stem plots of the topic weights for five selected documents, one per
# subplot.
n_plot_topics = doc_topic.shape[1]
f, ax = plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([1, 3, 4, 8, 9]):
    # The keyword is "markerfmt"; the previous spelling "marketfmt" is not a
    # stem() parameter.
    ax[i].stem(doc_topic[k, :], linefmt='r-', markerfmt='ro', basefmt='w-')
    # One unit of margin on each side of the topic axis.
    ax[i].set_xlim(-1, n_plot_topics + 1)
    ax[i].set_ylim(0, 1)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("Document {}".format(k))
ax[4].set_xlabel("Topic")
plt.tight_layout()
plt.show()
如果有人能解决这个问题就太好了，因为我一直在努力把这种技术应用到 CSV 文件上。