我想在Shiny App
的系统中上传文本文件夹,以便从Corpus获取Document Term Matrix
以申请K-means
。
我尝试了各种方法来做到这一点,但我无法在所有上传的文件之间建立连接来创建语料库
我可以通过在全球环境中创建语料库来应用K-means,但我希望通过上传文件夹或选择多个文件来通过ShinyApp执行此操作。
以下是我到目前为止所做的代码:
library(shiny)
library(shinydashboard)
library(shinythemes)
library(shinyFiles)
library(tm)
ui <- dashboardPage(
dashboardHeader(title = "Document_Clustering"),
dashboardSidebar(
sidebarMenu(
menuItem("Data Processing", tabName = "DP", icon = icon("info-circle")),
menuItem("K-Means", tabName = "KMeans", icon = icon("th"))
)),
dashboardBody(
tabItems(
tabItem(tabName = "DP",
fluidRow(
box(fileInput('file1', 'Choose Files',
accept=c('text/csv',
'text/comma-separated-values,text/plain',
'.csv'), multiple = TRUE)
, solidHeader = TRUE))
,fluidRow(
box(title = "Pre-processing", width = 15 ,tableOutput('proc'))
)
),
tabItem(tabName = "KMeans",
fluidRow(
box(
title = "Enter Number of Clusters:",
selectInput("C", choices =c(seq(1 , 15, 1)),label = NULL ,selected = 1), solidHeader = TRUE
)),
fluidRow(box(title = "Cluster", width = 9, textOutput("cluster1"))),
fluidRow(box(title = "Cluster Size", width = 9, textOutput("size1"))),
fluidRow(box(title= "Between Cluster Hetrogeneity" , width=9, textOutput("hetro1")))
)
)))
server <- shinyServer(function(input, output, session){
myData <- reactive({
inFile <- input$file1
if (is.null(inFile)) return(NULL)
con<- file(inFile$datapath, open="rt", encoding = "UTF-8")
text<-readLines(con)
msg<- paste(text, collapse = "\n")
close(con)
msg<- msg
myCorpus <- Corpus(VectorSource(msg))
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, PlainTextDocument)
myCorpus<- tm_map(myCorpus,removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, removeWords,stopwords("english"))
myCorpus <- tm_map(myCorpus, stripWhitespace)
dtm <- DocumentTermMatrix(myCorpus,control = list(minWordLength = 1))
dtm_tfxidf <- weightTfIdf(dtm)
m11 <- as.matrix(dtm_tfxidf)
ri <- m11
set.seed(1234)
### Only kmeans
n2 <- input$C
clusk <- kmeans(as.data.frame(ri), n2) #, nstart = 9)
T3<- list(Name= m11, Cluster_K=clusk$cluster, Size_K= clusk$size, Hetro_K=clusk$betweenss/clusk$totss*100)
})
output$proc <- renderTable({
myData()$Name
})
output$cluster1 <- renderText({
myData()$Cluster_K
})
output$size1 <- renderText({
myData()$Size_K
})
output$hetro1 <- renderText({
myData()$Hetro_K
})
})
shinyApp(ui= ui, server = server)
使用上面的代码我可以上传多个文件,但是我的进一步处理会出错。 错误:我无法解决的“描述”参数无效
此外,当我只上传单个文件时,一切似乎都有效,但我没有理解单个文件kmeans
中群集大小为2的原因。
非常感谢任何形式的帮助。
提前致谢!
答案 0 :(得分:0)
我们无法在不使用某些功能的情况下连接所有文件,并且我的代码中缺少该功能。
要使ShinyApp
正常工作,请在服务器部分中进行以下更改:
替换
con<- file(inFile$datapath, open="rt", encoding = "UTF-8")
text<-readLines(con)
msg<- paste(text, collapse = "\n")
close(con)
msg<- msg
myCorpus <- Corpus(VectorSource(msg))
有了这个
get.msg <- function(path)
{
con <- file(path, open = "rt", encoding = "latin1")
text <- readLines(con)
msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
close(con)
return(paste(msg, collapse = "\n"))
}
data.docs <- inFile$datapath
data.docs <- data.docs[which(data.docs != "cmds")]
all.data <- sapply(data.docs,
function(p) get.msg(file.path(p)))
myCorpus <- Corpus(VectorSource(all.data))