LDA with gensim: how do I update a Postgres database with the most probable topic number for each document?

Time: 2016-06-29 11:56:19

Tags: python postgresql lda gensim

I fetch documents from a database and use LDA (gensim) to see which latent topics they contain. That works well. What I would like to do now is store the most probable topic for each document back in the database, and I am not sure what the best approach is. For example, I could select each document's unique ID along with text_column at the start and carry it through the processing somehow, so that at the end I know which ID belongs to which topic number. Or maybe the right place is the final part, where I print each document with its topic, but I don't know how to connect that output back to the database. Should I compare text_column against the document text and assign the matching topic number? Any comments would be appreciated.
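One way to picture the mapping: if the IDs are fetched together with the text and kept in the same order, the per-document output of the model can simply be zipped back to the IDs. A minimal sketch with made-up IDs and topic distributions (not real gensim output):

```python
# Hypothetical ids, fetched alongside the text and kept in order
doc_ids = [101, 102, 103]

# Pretend per-document (topic, probability) lists, as LDA would return
doc_topics = [
    [(0, 0.7), (1, 0.3)],
    [(0, 0.2), (1, 0.8)],
    [(0, 0.4), (1, 0.6)],
]

# Most probable topic per document, keyed by its database id
best = {doc_id: max(dist, key=lambda item: item[1])[0]
        for doc_id, dist in zip(doc_ids, doc_topics)}
print(best)  # {101: 0, 102: 1, 103: 1}
```

Because `zip` pairs items positionally, this only works if nothing reorders or drops documents between the SELECT and the model output.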

import string

from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import ldamodel

# conn / cur: an already-open psycopg2 connection and cursor

sql = """SELECT text_column FROM table where NULLIF(text_column, '') IS NOT NULL;"""
cur.execute(sql)
dbrows = cur.fetchall()
conn.commit()

documents = []
for i in dbrows:
    documents = documents + list(i)

# tokenize and drop stopwords, punctuation and stray quote tokens
stoplist = stopwords.words('english')

additional_list = set("``;''".split(";"))

texts = [[word.lower() for word in document.split()
          if word.lower() not in stoplist
          and word not in string.punctuation
          and word.lower() not in additional_list]
         for document in documents]

# remove words that appear 2 times or fewer in the whole corpus
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) <= 2)
texts = [[word for word in text if word not in tokens_once]
     for text in texts]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
my_num_topics = 10

# lda itself
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=my_num_topics)
corpus_lda = lda[corpus]

# print the most contributing words for selected topics
for top in lda.show_topics(my_num_topics):
    print(top)

# print the most probable topic and the document
for l, t in zip(corpus_lda, documents):
    selected_topic = max(l, key=lambda item: item[1])
    if selected_topic[1] != 1.0 / my_num_topics:
        selected_topic_number = selected_topic[0]
        print(selected_topic)
        print(t)

1 answer:

Answer 0 (score: 0)

As wildplasser commented, I just needed to select the id together with the text_column. I had tried that before, but the way I appended the data to a list made it unsuitable for further processing. The code below works and creates a table containing each id and its most probable topic.

import string

from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import ldamodel

# conn / cur: an already-open psycopg2 connection and cursor

sql = """SELECT id, text_column FROM table where NULLIF(text_column, '') IS NOT NULL;"""
cur.execute(sql)
dbrows = cur.fetchall()
conn.commit()

documents = []
for i in dbrows:
    documents.append(i)

# tokenize and drop stopwords, punctuation and stray quote tokens
stoplist = stopwords.words('english')

additional_list = set("``;''".split(";"))

texts = [[word.lower() for word in document[1].split()
          if word.lower() not in stoplist
          and word not in string.punctuation
          and word.lower() not in additional_list]
         for document in documents]

# remove words that appear 2 times or fewer in the whole corpus
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) <= 2)
texts = [[word for word in text if word not in tokens_once]
 for text in texts]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
my_num_topics = 10

# lda itself
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=my_num_topics)
corpus_lda = lda[corpus]

# print the most contributing words for selected topics
for top in lda.show_topics(my_num_topics):
    print(top)

# collect (topic, id) pairs for the most probable topic of each document
lda_topics = []
for l, t in zip(corpus_lda, documents):
    selected_topic = max(l, key=lambda item: item[1])
    if selected_topic[1] != 1.0 / my_num_topics:
        lda_topics.append((selected_topic[0], int(t[0])))

cur.execute("""CREATE TABLE table_topic (id bigint PRIMARY KEY, topic int);""")
for topic, my_id in lda_topics:
    cur.execute("INSERT INTO table_topic (id, topic) VALUES (%s, %s)",
                (my_id, topic))
conn.commit()
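For larger result sets, a single `executemany` call avoids issuing one round trip per row. The sketch below uses sqlite3 as a runnable stand-in for the Postgres connection (with psycopg2 the calls are the same, except the placeholders are `%s` rather than `?`); `lda_topics` is assumed to hold `(topic, id)` tuples as above:

```python
import sqlite3

# in-memory stand-in for the Postgres connection, so the sketch runs anywhere
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE table_topic (id INTEGER PRIMARY KEY, topic INTEGER)")

lda_topics = [(3, 101), (7, 102)]  # (topic, id) pairs, as built above

# one executemany call instead of one execute + commit per row
cur.executemany("INSERT INTO table_topic (id, topic) VALUES (?, ?)",
                [(doc_id, topic) for topic, doc_id in lda_topics])
conn.commit()

rows = cur.execute("SELECT id, topic FROM table_topic ORDER BY id").fetchall()
print(rows)  # [(101, 3), (102, 7)]
```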