如何使用LDA主题模型创建主题名称

时间:2017-05-15 02:22:17

标签: python nlp nltk lda topic-modeling

我正在研究python中的LDA主题模型,它提供了以下主题的输出:

(0, u'0.559*"delivery" + 0.124*"area" + 0.018*"mile" + 0.016*"option" + 0.012*"partner" + 0.011*"traffic" + 0.011*"hub" + 0.011*"thanks" + 0.010*"city" + 0.009*"way"')
(1, u'0.397*"package" + 0.073*"address" + 0.055*"time" + 0.047*"customer" + 0.045*"apartment" + 0.037*"delivery" + 0.031*"number" + 0.026*"item" + 0.021*"support" + 0.018*"door"')
(2, u'0.190*"time" + 0.127*"order" + 0.113*"minute" + 0.075*"pickup" + 0.074*"restaurant" + 0.031*"food" + 0.027*"support" + 0.027*"delivery" + 0.026*"pick" + 0.018*"min"')
(3, u'0.072*"code" + 0.067*"gps" + 0.053*"map" + 0.050*"street" + 0.047*"building" + 0.043*"address" + 0.042*"navigation" + 0.039*"access" + 0.035*"point" + 0.028*"gate"')
(4, u'0.434*"hour" + 0.068*"time" + 0.034*"min" + 0.032*"amount" + 0.024*"pay" + 0.019*"gas" + 0.018*"road" + 0.017*"today" + 0.016*"traffic" + 0.014*"load"')
(5, u'0.245*"route" + 0.154*"warehouse" + 0.043*"minute" + 0.039*"need" + 0.039*"today" + 0.026*"box" + 0.025*"facility" + 0.025*"bag" + 0.022*"end" + 0.020*"manager"')
(6, u'0.371*"location" + 0.110*"pick" + 0.097*"system" + 0.040*"im" + 0.038*"employee" + 0.022*"evening" + 0.018*"issue" + 0.015*"request" + 0.014*"while" + 0.013*"delivers"')
(7, u'0.182*"schedule" + 0.181*"please" + 0.059*"morning" + 0.050*"application" + 0.040*"payment" + 0.026*"change" + 0.025*"advance" + 0.025*"slot" + 0.020*"date" + 0.020*"tomorrow"')
(8, u'0.138*"stop" + 0.110*"work" + 0.062*"name" + 0.055*"account" + 0.046*"home" + 0.043*"guy" + 0.030*"address" + 0.026*"city" + 0.025*"everything" + 0.025*"feature"') 

模型是否有一种方法可以根据每个主题中的特征/单词自动为上述主题列表创建人类可读的主题名称(而不是主题编号)?我不想手动为每个0-9主题创建主题名称。

我按如下方式创建LDA模型:

import gensim
import json
import pandas as pd
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import re
from nltk.tokenize import RegexpTokenizer
import pyLDAvis.gensim as gensimvis
import pyLDAvis
from gensim import corpora
from gensim import corpora, models, similarities 
import xlrd
from collections import OrderedDict
import simplejson as json

# Load the spreadsheet; row 0 is the header, so data starts at row 1.
workbook = xlrd.open_workbook("excel.xlsx")
sheet = workbook.sheet_by_index(0)

# Column order in the sheet, mapped to the JSON field names.
FIELD_NAMES = (
    'timestamp', 'text', 'header', 'transporter', 'device-type',
    'app-version', 'locale', 'company-type', 'detected-language',
)

feedback_list = []

for row_index in range(1, sheet.nrows):
    values = sheet.row_values(row_index)
    # Build each record in declared column order (IndexError on short rows,
    # exactly like explicit values[0]..values[8] indexing would raise).
    record = OrderedDict(
        (name, values[col]) for col, name in enumerate(FIELD_NAMES)
    )
    feedback_list.append(record)

# Persist the collected rows as a single {"feedback": [...]} JSON document.
serialized = json.dumps({'feedback': feedback_list})

with open('data.json', 'w') as f:
    f.write(serialized)

data_file = "data.json"

# Read the JSON dump produced above.  The file holds one JSON object per
# line; wrapping the lines in "[...]" turns them into a JSON array so
# pandas parses them as rows of a DataFrame.
# NOTE(review): a dead `pd.read_json(data_file, typ="series")` call whose
# result was discarded has been removed; the file is now opened in text
# mode so ','.join() works on str (binary mode broke this on Python 3).
with open(data_file) as f:
    data = f.readlines()

data_json_str = "[" + ','.join(data) + "]"
data_df = pd.read_json(data_json_str)

# data_df["feedback"][0] is the full list of feedback records
# (one dict per review).
num_reviews_tpadv = len(data_df["feedback"][0])

# Collect the free-text portion of every review.
all_reviews = [data_df["feedback"][0][i]["text"]
               for i in range(num_reviews_tpadv)]

# Stop-word list: one word per line.  A set gives the same O(1)
# membership test as the original dict-of-1s but states the intent.
# ('rU' open mode is deprecated and removed in modern Python.)
stopwords = set()
with open('stopwords.txt') as f:
    for line in f:
        stopwords.add(line.strip())

def clean_review(text):
    """Tokenize *text*, drop stop words, keep common nouns, lemmatize.

    Parameters
    ----------
    text :
        Raw review text.  Numeric values are skipped entirely —
        presumably empty/numeric Excel cells; TODO confirm against the
        spreadsheet contents (xlrd often yields float, not int).

    Returns
    -------
    list of str
        Lemmatized noun tokens (POS tags "NN"/"NNS"); empty when the
        input was numeric.
    """
    nouns = []
    if isinstance(text, int):  # guard clause replaces `type(text) != int`
        return nouns

    lem = WordNetLemmatizer()
    tagged = []
    for sentence in nltk.sent_tokenize(text.lower()):
        tokens = nltk.word_tokenize(sentence)
        # Remove stop words before POS tagging.
        filtered = [w for w in tokens if w not in stopwords]
        tagged.extend(nltk.pos_tag(filtered))

    # Keep singular/plural common nouns only, then lemmatize each one.
    for word, tag in tagged:
        if tag in ("NN", "NNS"):
            nouns.append(lem.lemmatize(word))

    return nouns

# Run every raw review text through the cleaning pipeline above.
clean_reviews = [clean_review(data_df["feedback"][0][idx]["text"])
                 for idx in range(num_reviews_tpadv)]

#----------------------------------------------------
# Creating Dictionary and Corpus to train LDA model
#----------------------------------------------------

# Map each noun to an integer id; keep only the 11,000 most frequent
# tokens, then compact the id space after filtering.
dictionary = corpora.Dictionary(clean_reviews)
dictionary.filter_extremes(keep_n=11000)  # change filters
dictionary.compactify()
dictionary_path = "dictionary.dict"
# Idiomatic instance call (was `corpora.Dictionary.save(dictionary, path)`).
dictionary.save(dictionary_path)

# Bag-of-words representation of every cleaned review.
corpus = [dictionary.doc2bow(doc) for doc in clean_reviews]

# Train LDA with 20 topics (the original comment said 10, which did not
# match the code; adjust num_topics as needed).
lda = gensim.models.LdaModel(corpus, id2word=dictionary,
                             num_topics=20,
                             passes=20,
                             random_state=1,
                             alpha="auto")
lda_model_path = "lda_model.lda"
lda.save(lda_model_path)

# Print every learned topic as (topic_id, 'p1*"w1" + p2*"w2" + ...').
# The unused `i` counter from the original loop has been removed;
# print(topic) behaves identically on Python 2 and 3 for one argument.
for topic in lda.show_topics(20):
    print(topic)

0 个答案:

没有答案