使用csv文件作为输入进行LDA主题建模

时间:2016-09-27 08:21:40

标签: csv lda topic-modeling

以下是对lda自带的示例文本数据进行主题建模的代码。有人能帮我对csv文件做同样的事情吗?

from __future__ import division, print_function
import numpy as np
import lda
import csv
import os
import lda.datasets
# Read all rows of the CSV while the file is still open. csv.reader is lazy,
# so a reader kept past the end of the `with` block can no longer be iterated
# (the underlying file is closed); materialising the rows avoids that.
# The path is a raw string so backslash sequences such as "\u" are taken
# literally instead of being parsed as escape sequences.
# NOTE: X is reassigned by the Reuters loader further down, so these rows are
# not yet what the model is fitted on.
with open(os.path.join(r'c:\users\kaila\jobdescription.csv'), "r") as csvfile:
    X = list(csv.reader(csvfile, delimiter=' ', quotechar='|'))

# Load the Reuters example data shipped with the `lda` package and print a
# short summary of each object. Each loader is called exactly once; the
# printed lines are kept in their original order.
X = lda.datasets.load_reuters()
print("type(X): {}".format(type(X)))

vocab = lda.datasets.load_reuters_vocab()
print("type(vocab): {}".format(type(vocab)))

print("type(X): {}".format(type(X)))

print("shape: {}\n".format(X.shape))

print("type(vocab): {}".format(type(vocab)))

print("len(vocab): {}\n".format(len(vocab)))

# titles is only ever needed as the result of load_reuters_titles(); the
# earlier throw-away assignments to it have been dropped.
titles = lda.datasets.load_reuters_titles()
print("type(titles): {}".format(type(titles)))

print("len(titles): {}\n".format(len(titles)))

# Spot-check one (document, word) pair: print the entry of X at that
# position together with the matching vocab and titles entries.
doc_id, word_id = 0, 3117
print("doc id: {} word id: {}".format(doc_id, word_id))

entry_count = X[doc_id, word_id]
print("-- count: {}".format(entry_count))

entry_word = vocab[word_id]
print("-- word: {}".format(entry_word))

entry_title = titles[doc_id]
print("-- doc: {}".format(entry_title))

# Build the LDA model with n_topics=20, n_iter=500, random_state=1 and fit
# it on X.
model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(X)

# Report the type and shape of the fitted topic_word_ array.
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))

print("shape: {}".format(topic_word.shape))

# Print the total of each of the first five rows of topic_word.
for topic_idx in range(5):
    row_total = sum(topic_word[topic_idx, :])
    print("topic: {} sum: {}".format(topic_idx, row_total))

# For every row of topic_word, print the n_top_words vocabulary entries with
# the largest values, highest first.
n_top_words = 5
# Convert vocab to an array once; it does not change between iterations.
vocab_array = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so the reversed tail slice yields the indices of
    # the n_top_words largest values in descending order.
    top_indices = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    topic_words = vocab_array[top_indices]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))

# Report the type and shape of the fitted doc_topic_ array.
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))

print("shape: {}".format(doc_topic.shape))

# Print the total of each of the first five rows of doc_topic.
for doc_idx in range(5):
    row_total = sum(doc_topic[doc_idx, :])
    print("document: {} sum: {}".format(doc_idx, row_total))

# For the first ten rows, print the column index holding the largest value
# and the first 50 characters of the matching title.
for doc_idx in range(10):
    best_topic = doc_topic[doc_idx].argmax()
    title_head = titles[doc_idx][:50]
    print("doc: {} topic: {}\n{}...".format(doc_idx, best_topic, title_head))

import matplotlib.pyplot as plt
# Switch to the 'ggplot' style when it is available; the plots below do not
# depend on it, so a failed lookup is deliberately ignored.
try:
    plt.style.use('ggplot')
except (AttributeError, IOError, OSError):
    # AttributeError: this matplotlib has no `style` module.
    # IOError/OSError: the 'ggplot' style is not known to this installation.
    # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
    pass

# One stem plot per selected row of topic_word, stacked vertically and
# sharing the x axis.
f, ax = plt.subplots(5, 1, figsize=(8, 6), sharex=True)

for i, k in enumerate([0, 5, 9, 14, 19]):
    ax[i].stem(topic_word[k, :], linefmt='b-', markerfmt='bo', basefmt='w-')
    ax[i].set_xlim(-50, 4350)
    ax[i].set_ylim(0, 0.08)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("topic {}".format(k))

# Only the bottom subplot carries the x-axis label.
ax[4].set_xlabel("word")

plt.tight_layout()
plt.show()
# One stem plot per selected row of doc_topic, stacked vertically and sharing
# the x axis.
f, ax = plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([1, 3, 4, 8, 9]):
    # The keyword is `markerfmt`; the previous spelling `marketfmt` is not a
    # stem() argument, so the red marker format was never applied.
    ax[i].stem(doc_topic[k, :], linefmt='r-', markerfmt='ro', basefmt='w-')
    ax[i].set_xlim(-1, 21)
    ax[i].set_ylim(0, 1)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("Document {}".format(k))

# Only the bottom subplot carries the x-axis label.
ax[4].set_xlabel("Topic")

plt.tight_layout()
plt.show()

如果有人能解决这个问题就太好了,因为我一直在努力尝试把这种技术应用到csv文件上。

0 个答案:

没有答案