R:对 LDA(KoNLP)结果调用 createJSON 时 theta 与 doc.length 不匹配

时间:2018-07-26 08:54:35

标签: r json lda

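1. 我想知道为什么 theta(的行数)大于 doc_length(的长度)。 2. 这会导致如下错误:`Error in createJSON(phi = phi, theta = theta, vocab = vocab, doc.length = doc_length, : Length of doc.length not equal to the number of rows in theta; both should be equal to the number of documents in the data.`(doc.length 的长度不等于 theta 的行数;两者都应等于数据中的文档数。) 3. 这与数据清理有关吗? 4. 请帮帮我。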

###### make dictionary
# Each external dictionary is passed as its own vector element; the previous
# single string 'sejong, woorimalsam' named one (non-existent) dictionary.
dics <- c("sejong", "woorimalsam")
# NOTE(review): `category` is not used anywhere in the visible script.
category <- c('health general')

# NOTE(review): `user_d` is not defined in the visible script -- confirm it is
# created elsewhere before this call.
buildDictionary(ext_dic = dics, user_dic = user_d, replace_usr_dic = TRUE)

# Read the raw text line by line as UTF-8. The connection is kept in its own
# variable so it can be released once the lines are in memory.
txt_con <- file("path/arona_one.txt", encoding = "UTF-8")
Cplus_txt <- readLines(txt_con)
close(txt_con)

# Clean every line: drop punctuation, digits, leftover "http..." fragments and
# isolated Hangul jamo, and collapse runs of whitespace to a single space.
# (The former `row_sums(Cplus_txt)` call was removed: it was applied to a
# character vector, and the trailing `%>%}` made the script unparsable.)
Cplus_txt <- Cplus_txt %>%
  str_replace_all("[[:punct:]]", "") %>%
  str_replace_all("[0-9]+", "") %>%
  str_replace_all("\\s+", " ") %>%
  str_replace_all("http[a-zA-Z0-9]+", "") %>%
  str_replace_all("[ㄱ-ㅎㅏ-ㅣ]+", "")

# Extract Hangul keywords from one document.
#
# doc: a single document; it is coerced to character before tagging.
#
# Returns a character vector holding the Hangul part of every tagged token
# whose tag starts with N or P followed by an upper-case letter (presumably
# the noun / predicate classes of the SimplePos22 tag set -- confirm).
# Tokens that do not match are dropped.
ko_words <- function(doc) {
  tagged <- unlist(SimplePos22(as.character(doc)))

  # Column 2 of the match matrix is the captured Hangul run.
  hits <- str_match(tagged, '([가-힣]+)/[NP][A-Z]')
  stems <- hits[, 2]

  stems[!is.na(stems)]
}


# Run the keyword extractor over every cleaned line; the result is a list
# with one keyword vector per input line.
pos <- mapply(ko_words, Cplus_txt, SIMPLIFY = FALSE)

# Wrap the keyword vectors in a tm corpus.
corpus <- pos %>%
  VectorSource() %>%
  Corpus()
#######

# Load the stop-word list as UTF-8 (presumably one word per line -- confirm).
# An absolute "/path/stopwords-ko.txt" location was previously tried as well.
stop_con <- file("path/stopwords-ko.txt", encoding = "UTF-8")
new_stop <- readLines(stop_con)
# readLines() leaves the connection object allocated; release it explicitly.
close(stop_con)

# All stop words joined into one regex alternation.
# NOTE(review): `rm_words2` is not used anywhere in the visible script.
rm_words2 <- paste(new_stop, collapse = "|")

#### Build the document-term matrix
# Term-frequency weighting; punctuation, stop words and numbers are removed
# and only terms between 2 and 20 characters long are kept. The order of the
# control options is left exactly as it was.
dtm <- corpus %>%
  DocumentTermMatrix(control = list(
    removePunctuation = TRUE,
    stopwords = new_stop,
    removeNumbers = TRUE,
    wordLengths = c(2, 20),
    weighting = weightTf
  ))

# Keep only documents that still contain at least one term.
dtm <- dtm[row_sums(dtm) > 0, ]

# Exploratory model: 15 topics, Gibbs sampling, 2000 iterations.
q_model <- LDA(
  dtm,
  k = 15,
  method = "Gibbs",
  control = list(iter = 2000)
)

# Single most likely topic for each document.
q_topics <- topics(q_model, 1)

# Top 20 terms of every topic as a data frame.
q_terms <- q_model %>%
  terms(20) %>%
  as.data.frame(stringsAsFactors = FALSE)

# Show the first five columns.
q_terms[1:5]
################################
# Settings for the model whose posterior is passed to createJSON() below:
# K topics, G Gibbs iterations, and the `alpha` control value.
K <- 5
G <- 5000
alpha <- 0.02

fit <- LDA(
  dtm,
  k = K,
  method = "Gibbs",
  control = list(iter = G, alpha = alpha)
)

# Posterior term and topic components of the fit, coerced to plain matrices.
phi <- as.matrix(posterior(fit)$terms)
theta <- as.matrix(posterior(fit)$topics)

###########################################
vocab <- colnames(phi)

# createJSON() requires length(doc.length) == nrow(theta).
# `theta` comes from a model fitted on `dtm`, and `dtm` has already lost its
# empty rows (and the terms removed by the DTM control options). Counting
# tokens over every document in `corpus` therefore gives a vector of a
# different length, and filtering it with `>= 0` removes nothing.
# Take the per-document token totals from `dtm` itself so both always agree.
doc_length <- unname(row_sums(dtm))

# Fail early, with a clear message, if the two ever diverge again.
stopifnot(length(doc_length) == nrow(theta))




# Total count of every term, computed from the dense form of the DTM.
temp_frequency <- as.matrix(dtm)
freq_matrix <- data.frame(
  ST = colnames(temp_frequency),
  Freq = colSums(temp_frequency)
)

# Bundle the model output and the corpus statistics for LDAvis.
json_lda <- createJSON(
  phi = phi,
  theta = theta,
  vocab = vocab,
  doc.length = doc_length,
  term.frequency = freq_matrix[["Freq"]]
)

# Write the visualisation to disk without opening a browser.
serVis(
  json_lda,
  out.dir = '2017-08-15-complaint-vis',
  open.browser = FALSE
)

0 个答案:

没有答案