The LDA topic modeling in the text2vec package is great. It is indeed much faster than topicmodels.
However, I don't know how to get the probability of each document belonging to each topic, as in the example below:
        V1          V2          V3          V4
1  0.001025237 7.89E-05    7.89E-05    7.89E-05
2  0.002906977 0.002906977 0.014534884 0.002906977
3  0.003164557 0.003164557 0.003164557 0.003164557
4  7.21E-05    7.21E-05    0.000360334 7.21E-05
5  0.000804433 8.94E-05    8.94E-05    8.94E-05
6  5.63E-05    5.63E-05    5.63E-05    5.63E-05
7  0.001984127 0.001984127 0.001984127 0.001984127
8  0.003515625 0.000390625 0.000390625 0.000390625
9  0.000748503 0.000748503 0.003742515 0.003742515
10 0.000141723 0.00297619  0.000141723 0.000708617
Here is the code for the text2vec LDA:
ss2 <- as.character(stressor5$weibo)
seg2 <- mmseg4j(ss2)
# Create vocabulary. Terms will be unigrams (simple words).
it_test = itoken(seg2, progressbar = FALSE)
vocab2 <- create_vocabulary(it_test)
pruned_vocab2 = prune_vocabulary(vocab2,
                                 term_count_min = 10,
                                 doc_proportion_max = 0.5,
                                 doc_proportion_min = 0.001)
vectorizer2 <- vocab_vectorizer(pruned_vocab2)
dtm_test = create_dtm(it_test, vectorizer2)
lda_model = LDA$new(n_topics = 1000, vocabulary = vocab2,
                    doc_topic_prior = 0.1, topic_word_prior = 0.01)
doc_topic_distr = lda_model$fit_transform(dtm_test, n_iter = 1000,
                                          convergence_tol = 0.01,
                                          check_convergence_every_n = 10)
Answer (score: 6)
doc_topic_distr is a matrix that contains the number of times the words of each document were assigned to each topic. So you just need to normalize each row by its word count (you can also add doc_topic_prior before normalizing).
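In other words, the probability estimate is each row of counts divided by its row sum, optionally smoothed by the prior first. A minimal base-R sketch of that normalization (equivalent to the normalize() call at the end of the full example below, applied to the matrix returned by fit_transform()):

# base-R equivalent of the l1 row normalization shown below
doc_topic_prior = 0.1
counts = doc_topic_distr + doc_topic_prior  # optional: drop the prior for the unsmoothed estimate
doc_topic_prob = counts / rowSums(counts)   # each row now sums to 1

Here is the full reproducible example: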
library(text2vec)
data("movie_review")
tokens = movie_review$review %>%
  tolower %>%
  word_tokenizer
# turn off progressbar because it won't look nice in rmd
it = itoken(tokens, ids = movie_review$id, progressbar = FALSE)
v = create_vocabulary(it) %>%
  prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.2)
vectorizer = vocab_vectorizer(v)
dtm = create_dtm(it, vectorizer, type = "lda_c")
doc_topic_prior = 0.1
lda_model = LDA$new(n_topics = 10, vocabulary = v,
                    doc_topic_prior = doc_topic_prior,
                    topic_word_prior = 0.01)
doc_topic_distr = lda_model$fit_transform(dtm, n_iter = 1000,
                                          convergence_tol = 0.01,
                                          check_convergence_every_n = 10)
head(doc_topic_distr)
#        [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# 5814_8   16   18    0   34    0   16   49    0   20    23
# 2381_9    4    0    6   20    0    0    6    6    0    28
# 7759_3   21   39    7    0    3   47    0   25   21    17
# 3630_4   18    7   22   14   19    0   18    0    2    35
# 9495_8    4    0   13   17   13   78    3    2   28    25
# 8196_8    0    0    0   11    0    8    0    8    8     0
doc_topic_prob = normalize(doc_topic_distr, norm = "l1")
# or add the prior first and then normalize:
# doc_topic_prob = normalize(doc_topic_distr + doc_topic_prior, norm = "l1")
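As a quick sanity check (assuming no document has an all-zero row of counts), every row of doc_topic_prob should now sum to 1, so each row can be read as P(topic | document):

# each row of doc_topic_prob is now a probability distribution over topics
summary(rowSums(doc_topic_prob))  # all values should be 1
head(doc_topic_prob)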