我想知道为什么 theta 的行数与 doc_length 的长度不一致(theta 更大)。这会导致如下错误:createJSON(phi = phi, theta = theta, vocab = vocab, doc.length = doc_length, ...) 出错:doc.length 的长度不等于 theta 的行数;两者都应等于数据中的文档数。这与数据清理有关吗?请帮帮我。
# ---- Build the KoNLP dictionary, then read and clean the raw text ----
# One dictionary name per vector element; a single comma-joined string
# such as 'sejong, woorimalsam' is not the name of any dictionary.
dics <- c("sejong", "woorimalsam")
category <- c('health general')
# NOTE(review): `user_d` is not defined in this part of the script; it is
# assumed to be created earlier -- confirm.
buildDictionary(ext_dic = dics, user_dic = user_d, replace_usr_dic = TRUE)

# readLines() returns one element per input line. The connection is closed
# even when reading fails, so no connection is left dangling.
input_con <- file("path/arona_one.txt", encoding = "UTF-8")
Cplus_txt <- tryCatch(readLines(input_con), finally = close(input_con))

# Number of lines read. row_sums() only works on matrix-like objects, so it
# cannot be applied to this character vector.
length(Cplus_txt)

# Strip punctuation and digits first: a URL then collapses into one
# "http..." alphanumeric run that the third pattern removes as a whole.
# Whitespace is normalised last so that gaps left by the earlier removals
# are collapsed as well.
Cplus_txt <- Cplus_txt %>%
  str_replace_all("[[:punct:]]", "") %>%
  str_replace_all("[0-9]+", "") %>%
  str_replace_all("http[a-zA-Z0-9]+", "") %>%
  str_replace_all("[ㄱ-ㅎㅏ-ㅣ]+", "") %>%
  str_replace_all("\\s+", " ")
# Extract Hangul keywords from one document.
#
# `doc` is coerced to character and tagged with SimplePos22(); the flattened
# tagger output is matched against "<Hangul run>/<N or P><uppercase letter>".
# Returns the captured Hangul runs, skipping elements with no match.
ko_words <- function(doc) {
  tagged <- unlist(SimplePos22(as.character(doc)))
  # Column 2 of the str_match() result is the capture group (the Hangul run).
  hangul <- str_match(tagged, '([가-힣]+)/[NP][A-Z]')[, 2]
  hangul[!is.na(hangul)]
}
# Apply ko_words() to every element of Cplus_txt (result is a list), then
# wrap that list in a tm corpus.
pos <- mapply(ko_words, Cplus_txt, SIMPLIFY = FALSE)
corpus <- pos %>%
  VectorSource() %>%
  Corpus()
# ---- Load the stop-word list ----
# The file is read through an explicit UTF-8 connection that is closed even
# when reading fails, so no connection is left dangling.
stop_con <- file("path/stopwords-ko.txt", encoding = "UTF-8")
# stop_con <- file("/path/stopwords-ko.txt", encoding = "UTF-8")
new_stop <- tryCatch(readLines(stop_con), finally = close(stop_con))
# All stop words joined with "|" into a single string.
rm_words2 <- paste(new_stop, collapse = "|")
# ---- Document-term matrix and a first LDA fit (k = 15) ----
dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = new_stop,
    removeNumbers = TRUE,
    wordLengths = c(2, 20),
    weighting = weightTf
  )
)
# Keep only the documents whose row still has a positive total.
dtm <- dtm[row_sums(dtm) > 0, ]

q_model <- LDA(dtm, k = 15, method = "Gibbs", control = list(iter = 2000))
q_topics <- topics(q_model, 1)
q_terms <- q_model %>%
  terms(20) %>%
  as.data.frame(stringsAsFactors = FALSE)
# Show the first five columns of the term table.
q_terms[1:5]
# ---- Second LDA fit; its posterior is used for the visualisation ----
K <- 5        # passed to LDA() as k
G <- 5000     # passed to the Gibbs control as iter
alpha <- 0.02 # passed to the Gibbs control as alpha
fit <- LDA(
  dtm,
  k = K,
  method = 'Gibbs',
  control = list(iter = G, alpha = alpha)
)
# phi comes from the `terms` component of the posterior, theta from `topics`.
phi <- as.matrix(posterior(fit)$terms)
theta <- as.matrix(posterior(fit)$topics)
# The vocabulary is taken from the column names of phi.
vocab <- colnames(phi)
# ---- Document lengths for createJSON() ----
# createJSON() requires doc.length to have exactly one entry per row of
# theta. theta is estimated from `dtm`, whose all-zero rows were dropped
# before fitting, so the lengths must come from that same filtered matrix.
# Counting whitespace-separated tokens in `corpus` instead yields one entry
# for every original document (including the dropped ones) and ignores the
# stop-word / word-length filtering applied when the matrix was built.
doc_length <- as.integer(row_sums(dtm))

# Fail early with a clear location if the two ever get out of sync again.
stopifnot(length(doc_length) == nrow(theta))
# ---- Term frequencies and LDAvis export ----
# Dense copy of the document-term matrix; its column sums are the per-term
# totals, stored next to the term labels.
temp_frequency <- as.matrix(dtm)
freq_matrix <- data.frame(
  ST = colnames(temp_frequency),
  Freq = colSums(temp_frequency)
)

json_lda <- createJSON(
  phi = phi,
  theta = theta,
  vocab = vocab,
  doc.length = doc_length,
  term.frequency = freq_matrix[["Freq"]]
)
# Write the visualisation files to the output directory without opening a
# browser.
serVis(json_lda, out.dir = '2017-08-15-complaint-vis', open.browser = FALSE)