# I am importing all the packages necessary for topic modelling in this step
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
import gensim
from gensim import corpora, models
import matplotlib.pyplot as plt
from wordcloud import WordCloud
tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(get_stop_words('en'))
# Use a raw string so the backslashes in the Windows path are not read as escape
# sequences, and let open() handle decoding rather than converting afterwards
with open(r'D:\Users\kaila\jobdescription.txt', encoding='utf-8', errors='replace') as f:
    Reader = f.read()
# I am replacing all the unnecessary words for the wordcloud analysis
for word in ("will", "experience", "work", "years", "please", "apply"):
    Reader = Reader.replace(word, " ")
texts = Reader  # already a str in Python 3, so no unicode() conversion is needed
# tdm holds the tokenized documents (a single document in this case)
tdm = []
raw = texts.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [i for i in tokens if i not in en_stop]
tdm.append(stopped_tokens)
dictionary = corpora.Dictionary(tdm)
corpus = [dictionary.doc2bow(i) for i in tdm]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary)
# Print the topic summaries once, then draw one word cloud per topic
print(ldamodel.print_topics(num_topics=5, num_words=8))
for t in range(ldamodel.num_topics):
    plt.figure()
    # fit_words() expects a mapping of word -> weight, so convert the
    # (word, probability) tuples returned by show_topic() into a dict
    plt.imshow(WordCloud().fit_words(dict(ldamodel.show_topic(t, 200))))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()
By doing this I get topic models and word clouds, but the results are all very similar to one another. Is there anything I can do to get distinct topic model and word cloud results?