I am new to topic modeling and am testing the lda.collapsed.gibbs.sampler() method from the lda package by trying to characterize about 98 CVs. I first tried building a tm Corpus (since it makes filtering etc. easier), but this gives unexpected results, possibly because lexicalize() first converts the corpus into an object containing only 3 documents/objects:
# method 1: build a tm Corpus, then lexicalize it
library(tm)
library(lda)

a <- Corpus(DirSource(doc.folder, pattern = ".txt$"),
            readerControl = list(language = "eng"))
a <- tm_map(a, content_transformer(removeNumbers))
a <- tm_map(a, content_transformer(removePunctuation))
a <- tm_map(a, content_transformer(stripWhitespace))
a <- tm_map(a, content_transformer(tolower))

lex <- lexicalize(a)
result <- lda.collapsed.gibbs.sampler(lex$documents, K = 8, lex$vocab,
                                      num.iterations = 30, alpha = 0.1, eta = 0.1,
                                      initial = NULL, burnin = NULL,
                                      compute.log.likelihood = TRUE)

length(a)                 # output: [1] 98
length(lex$documents)     # output: [1] 3, even though I expect 98
dim(result$document_sums) # output: [1] 8 3, even though I expect 8 98
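If lexicalize() really expects a plain character vector rather than a Corpus object (its documentation describes the input as a character vector of documents), a possible workaround would be to flatten the corpus first. This is an untested sketch, assuming a tm version where content() is available as the accessor for a document's text:

# possible workaround (untested): hand lexicalize() a character vector,
# not the Corpus itself; content() is tm's accessor for a document's text
texts <- sapply(a, function(d) paste(content(d), collapse = " "))
lex <- lexicalize(texts)
length(lex$documents)  # should be 98 if passing the Corpus was the problem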
However, when I pass the CV texts directly as a character vector, it gives the expected results:
# method 2: read the files into a character vector myself
library(tools)  # for file_path_sans_ext()

filenames <- list.files(path = doc.folder, pattern = ".txt$", full.names = TRUE)
df <- data.frame(stringsAsFactors = FALSE)
for (filename in filenames) {
  myfile <- file(filename)
  df <- rbind(df, cbind(name = file_path_sans_ext(basename(filename)),
                        text = paste(readLines(myfile), collapse = " ")))
  close(myfile)
}

# the following avoids an encoding error caused by French words etc.
df[, "text"] <- sapply(df[, "text"], iconv, "WINDOWS-1252", "UTF-8")

lex <- lexicalize(df[, "text"])
result <- lda.collapsed.gibbs.sampler(lex$documents, K = 8, lex$vocab,
                                      num.iterations = 30, alpha = 0.1, eta = 0.1,
                                      initial = NULL, burnin = NULL,
                                      compute.log.likelihood = TRUE)

NROW(df)                  # output: [1] 98
length(lex$documents)     # output: [1] 98, as expected
dim(result$document_sums) # output: [1] 8 98, as expected
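For context (not part of the question itself), this is how I sanity-check the method 2 fit, using top.topic.words() from the lda package; showing 5 words per topic is an arbitrary choice:

# quick sanity check of the fitted model: top 5 words per topic
top.topic.words(result$topics, num.words = 5, by.score = TRUE)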