摘要统计数据的节省时间

时间:2017-03-10 04:36:56

标签: r loops knitr summary xtable

目标不是交叉制表。我的目标是为数据集中的所有变量提供具有描述性统计信息(在本例中为计数和百分比)的表,这样我就不需要输入table(变量名称)约。在处理新数据集时20次。我打算用xtable + knitr导出它。不幸的是,循环有问题。任何帮助,将不胜感激。这个问题与反复需要提供摘要统计数据的人非常相关,因为这会节省大量时间。很抱歉第3次问这个问题:) 我听说过table()函数 - 事实上,我已经多次使用它了简而言之,这个函数提供了变量的名称,他们的水平,数量&百分比没有让某人在table()中为数据集中的每个变量键入 -

ESRD <- rep(c("Y", "N"), each=10)
DIABETES <- rep(c("Y", "N", "Y", "N"), c(5, 5, 5, 5))
BLAH <- rep(c("Y", "N"), each=10)
categoricalvariables <- data.frame(ESRD, DIABETES, BLAH)

descriptives <- function(VARIABLEMATRIX){
desc <- matrix(0, ncol=4, nrow=2*ncol(VARIABLEMATRIX) + ncol(VARIABLEMATRIX))
  for (i in 1:ncol(VARIABLEMATRIX)){
    matper <- matrix(0, nrow=dim(table(VARIABLEMATRIX[ ,i])), ncol=1)
    for (i in 1:dim(table(VARIABLEMATRIX[ ,i]))){
      matper[i, ] <- paste(round(prop.table(table(VARIABLEMATRIX[ ,i]))[i]*100, 2), "%")
    }

    matcount <- matrix(0, nrow=dim(table(VARIABLEMATRIX[ ,i])), ncol=1)
    for (i in 1:dim(table(VARIABLEMATRIX[ ,i]))){
      matcount[i, ] <- table(VARIABLEMATRIX[ ,i])[i]
    }

    desc[((3*i)-2), ] <- c(colnames(VARIABLEMATRIX)[i], "", "", "") 
    desc[((3*i)-1):(3*i), ] <- cbind("", names(table(VARIABLEMATRIX[ ,i])), matcount[ ,1], matper[ ,1])
return(desc)
}
}
descriptives(categoricalvariables)

我得到的输出是(显然有一个错误,但我不确定是什么问题):

     [,1]       [,2] [,3] [,4]  
 [1,] "0"        "0"  "0"  "0"   
 [2,] "0"        "0"  "0"  "0"   
 [3,] "0"        "0"  "0"  "0"   
 [4,] "DIABETES" ""   ""   ""    
 [5,] ""         "N"  "10" "50 %"
 [6,] ""         "Y"  "10" "50 %"
 [7,] "0"        "0"  "0"  "0"   
 [8,] "0"        "0"  "0"  "0"   
 [9,] "0"        "0"  "0"  "0"  

预期输出应为:

     [,1]       [,2] [,3] [,4]  
 [1,] "ESRD"     ""   ""   ""     
 [2,] ""         "N"  "10" "50 %" 
 [3,] ""         "Y"  "10" "50 %"   
 [4,] "DIABETES" ""   ""   ""    
 [5,] ""         "N"  "10" "50 %"
 [6,] ""         "Y"  "10" "50 %"
 [7,] "BLAH"     ""   ""   ""     
 [8,] ""         "N"  "10" "50 %"  
 [9,] ""         "Y"  "10" "50 %"

2 个答案:

答案 0 :(得分:0)

以下是使用tidyverse函数的一些选项:

library(tidyverse)

categoricalvariables %>% 
  gather(Measure, Value) %>%
  group_by(Measure, Value) %>%
  tally %>%
  mutate(Percent=n/sum(n))
   Measure Value     n Percent
1     BLAH     N    10     0.5
2     BLAH     Y    10     0.5
3 DIABETES     N    10     0.5
4 DIABETES     Y    10     0.5
5     ESRD     N    10     0.5
6     ESRD     Y    10     0.5
categoricalvariables %>% 
  gather(Measure, Value) %>%
  group_by(Measure, Value) %>%
  tally %>%
  mutate(Percent=n/sum(n)) %>%
  gather(Stats, Value2, -Measure, -Value) %>%
  unite(Value_Stats, Stats, Value) %>%
  spread(Value_Stats, Value2)
   Measure   n_N   n_Y Percent_N Percent_Y
1     BLAH    10    10       0.5       0.5
2 DIABETES    10    10       0.5       0.5
3     ESRD    10    10       0.5       0.5

我已将数据保留为数字格式,以备您进行进一步处理时使用。

要为导出的表进行设置,可能是这样的:

tab = categoricalvariables %>% 
  gather(Measure, Value) %>%
  group_by(Measure, Value) %>%
  summarise(Count=n()) %>%
  mutate(Percent=paste0(sprintf("%1.1f", Count/sum(Count)*100),"%")) %>%
  ungroup %>%
  mutate(Measure = ifelse(duplicated(Measure),"", Measure))
   Measure Value Count Percent
1     BLAH     N    10   50.0%
2              Y    10   50.0%
3 DIABETES     N    10   50.0%
4              Y    10   50.0%
5     ESRD     N    10   50.0%
6              Y    10   50.0%

现在,您可以在xtable上运行tab

library(xtable)

print(xtable(tab, align="llcrr"), include.rownames=FALSE)

rmarkdown文档输出到PDF时

这样看起来像这样

enter image description here

如果您有要汇总的数字列,则可以这样做(例如(使用内置的iris数据框):

iris %>% group_by(Species) %>%
  summarise_all(funs(mean, min, max)) %>%
  gather(key, value, -Species) %>%
  separate(key, c("Measure","Stat"),"_") %>%
  spread(Stat, value)

你可能想要进一步重塑这个或重新格式化输出到表格,但它可以让你知道什么是可能的。

      Species      Measure   max  mean   min
1      setosa Petal.Length   1.9 1.462   1.0
2      setosa  Petal.Width   0.6 0.246   0.1
3      setosa Sepal.Length   5.8 5.006   4.3
4      setosa  Sepal.Width   4.4 3.428   2.3
5  versicolor Petal.Length   5.1 4.260   3.0
6  versicolor  Petal.Width   1.8 1.326   1.0
7  versicolor Sepal.Length   7.0 5.936   4.9
8  versicolor  Sepal.Width   3.4 2.770   2.0
9   virginica Petal.Length   6.9 5.552   4.5
10  virginica  Petal.Width   2.5 2.026   1.4
11  virginica Sepal.Length   7.9 6.588   4.9
12  virginica  Sepal.Width   3.8 2.974   2.2

答案 1 :(得分:0)

oway_tables包中的descriptr函数创建多个单向表。以下是您的示例的输出:

> ESRD <- rep(c("Y", "N"), each=10)
> DIABETES <- rep(c("Y", "N", "Y", "N"), c(5, 5, 5, 5))
> BLAH <- rep(c("Y", "N"), each=10)
> categoricalvariables <- data.frame(ESRD, DIABETES, BLAH)
> descriptr::oway_tables(categoricalvariables)

                           Variable: ESRD                                
|--------------------------------------------------------------------------|
|                                Cumulative                    Cumulative  |
|    Levels    |  Frequency   |   Frequency  |   Percent    |    Percent   |
|--------------------------------------------------------------------------|
|       N      |      10      |      10      |      50      |      50      |
|--------------------------------------------------------------------------|
|       Y      |      10      |      20      |      50      |      100     |
|--------------------------------------------------------------------------|


                         Variable: DIABETES                              
|--------------------------------------------------------------------------|
|                                Cumulative                    Cumulative  |
|    Levels    |  Frequency   |   Frequency  |   Percent    |    Percent   |
|--------------------------------------------------------------------------|
|       N      |      10      |      10      |      50      |      50      |
|--------------------------------------------------------------------------|
|       Y      |      10      |      20      |      50      |      100     |
|--------------------------------------------------------------------------|


                           Variable: BLAH                                
|--------------------------------------------------------------------------|
|                                Cumulative                    Cumulative  |
|    Levels    |  Frequency   |   Frequency  |   Percent    |    Percent   |
|--------------------------------------------------------------------------|
|       N      |      10      |      10      |      50      |      50      |
|--------------------------------------------------------------------------|
|       Y      |      10      |      20      |      50      |      100     |
|--------------------------------------------------------------------------|

链接到该功能的文档:oway_tables