I am new to topic modeling and am testing the lda.collapsed.gibbs.sampler() method from the lda package by trying to characterize about 98 CVs. I first tried building a tm Corpus (since it makes filtering etc. easier), but this gives unexpected results, possibly because lexicalize() first converts the corpus into an object containing only 3 documents/objects:
# method 1: build a tm Corpus, then lexicalize it
library(tm)
library(lda)

a <- Corpus(DirSource(doc.folder, pattern = ".txt$"),
            readerControl = list(language = "eng"))
a <- tm_map(a, content_transformer(removeNumbers))
a <- tm_map(a, content_transformer(removePunctuation))
a <- tm_map(a, content_transformer(stripWhitespace))
a <- tm_map(a, content_transformer(tolower))

lex <- lexicalize(a)
result <- lda.collapsed.gibbs.sampler(lex$documents, K = 8, lex$vocab,
                                      num.iterations = 30, alpha = 0.1, eta = 0.1,
                                      initial = NULL, burnin = NULL,
                                      compute.log.likelihood = TRUE)

length(a)                 # output: [1] 98
length(lex$documents)     # output: [1] 3, even though I expect 98
dim(result$document_sums) # output: [1] 8 3, even though I expect 8 98
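If lexicalize() really expects a plain character vector rather than a Corpus object (its documentation describes the input as a character vector of documents), a possible workaround would be to flatten the corpus first. This is an untested sketch, assuming a tm version where content() is available as the accessor for a document's text:

# possible workaround (untested): hand lexicalize() a character vector,
# not the Corpus itself; content() is tm's accessor for a document's text
texts <- sapply(a, function(d) paste(content(d), collapse = " "))
lex <- lexicalize(texts)
length(lex$documents)  # should be 98 if passing the Corpus was the problem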
However, when I pass the CV texts directly as a character vector, it gives the expected results:
# method 2: read the files into a character vector myself
library(tools)  # for file_path_sans_ext()

filenames <- list.files(path = doc.folder, pattern = ".txt$", full.names = TRUE)
df <- data.frame(stringsAsFactors = FALSE)
for (filename in filenames) {
  myfile <- file(filename)
  df <- rbind(df, cbind(name = file_path_sans_ext(basename(filename)),
                        text = paste(readLines(myfile), collapse = " ")))
  close(myfile)
}

# the following avoids an encoding error caused by French words etc.
df[, "text"] <- sapply(df[, "text"], iconv, "WINDOWS-1252", "UTF-8")

lex <- lexicalize(df[, "text"])
result <- lda.collapsed.gibbs.sampler(lex$documents, K = 8, lex$vocab,
                                      num.iterations = 30, alpha = 0.1, eta = 0.1,
                                      initial = NULL, burnin = NULL,
                                      compute.log.likelihood = TRUE)

NROW(df)                  # output: [1] 98
length(lex$documents)     # output: [1] 98, as expected
dim(result$document_sums) # output: [1] 8 98, as expected
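For context (not part of the question itself), this is how I sanity-check the method 2 fit, using top.topic.words() from the lda package; showing 5 words per topic is an arbitrary choice:

# quick sanity check of the fitted model: top 5 words per topic
top.topic.words(result$topics, num.words = 5, by.score = TRUE)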