使用不同大小的多个表合并和重塑数据/表

时间:2014-03-13 12:27:05

标签: r merge reshape tabular

我的目标是获得一个表,对于分类变量列表,返回(从最左侧列到最右侧列):分类变量名称,分类变量级别,二进制文件第一级别的频率分组变量,二进制分组变量的第二级频率,卡方检验属性,p.value和测试方法。我想要的输出示例显示在页面的最底部。当前输出和代码用于单个分类变量。我试图不把马放在马车前面。现在为单个变量获得正确的格式将是好的。我将努力让它为一个字符串做它,然后在那之后将它们组合在一起。

代码展示了迄今为止我能想到的东西。我相当肯定有一种更简单的方法可以做到这一点。我已经被告知了tables :: tabular,但是可以让我做到我想要的。我目前无法弄清楚重塑(然后如果有效,最后如何摆脱最后三列中的重复项,但我还没有)。

非常感谢使用当前代码或其他方法的任何帮助。

#make data (I couldn't get return() to work, so I used <<)
get.data<-function(){
set.seed(1)
cat1 <-sample(c(1,2), 100, replace=T)
cont1<-rnorm(100, 25, 8)
cont2<-rnorm(100, 0, 1)
cont3<-rnorm(100, 6, 14.23)
cont4<-rnorm(100, 25, 8)*runif(5, 0.1, 1)
cat2<-sample(c(1,2,3,4),100,replace=TRUE)
cat3<-sample(c(1,2,3,4,5),100,replace=TRUE)
cat4<-sample(c("Caucasian","African American", "Latino", "Multi-Racial", "No   
Response"),100,replace=TRUE)
group<-sample(c(0,1), 100, replace=T)
sex<-sample(c("male", "female"), 100, replace=T)
one  <<-data.frame(group, sex,cat1, cont1, cont2, cont3, cont4,cat2,cat3,cat4)
}

get.data()

#getting the two bits of data I would like
attach(one)
long <- (with(one, table(cat2,group)))
test<-with(one, chisq.test(cat2,group))
kk<-c(test$statistic,test$p.value,test$method)
detach(one)

#merging them together
res<-merge(as.data.frame(as.matrix(long)), as.data.frame(as.matrix(kk)),
     all=TRUE, sort=FALSE)
#unsuccessfully reshaping the data
wider <- reshape(as.data.frame(res), idvar = cat2,
     timevar = "V1", direction = "wide")

以下是来自&#39; res&#39;的输出。看起来像:

#   cat2    group   Freq    V1
#1  1   0   17  1.16345446805217
#2  2   0   11  1.16345446805217
#3  3   0   13  1.16345446805217
#4  4   0   13  1.16345446805217
#5  1   1   12  1.16345446805217
#6  2   1   13  1.16345446805217
#7  3   1   9   1.16345446805217
#8  4   1   12  1.16345446805217
#9  1   0   17  0.761782111152171
#10 2   0   11  0.761782111152171
#11 3   0   13  0.761782111152171
#12 4   0   13  0.761782111152171
#13 1   1   12  0.761782111152171
#14 2   1   13  0.761782111152171
#15 3   1   9   0.761782111152171
#16 4   1   12  0.761782111152171
#17 1   0   17  Pearson's Chi-squared test
#18 2   0   11  Pearson's Chi-squared test
#19 3   0   13  Pearson's Chi-squared test
#20 4   0   13  Pearson's Chi-squared test
#21 1   1   12  Pearson's Chi-squared test
#22 2   1   13  Pearson's Chi-squared test
#23 3   1   9   Pearson's Chi-squared test
#24 4   1   12  Pearson's Chi-squared test

这就是我想要输出的东西:

Variable     Response    Group1.Freq    Group2.Freq    Test.Stat    p.value     method
Cat2         1           17             12             1.16         0.761       Pearson's Chi...
             2           11             13
             3           13             9
             4           13             12

新问题: 我使用Ram的建议来创建一个函数,这样我就可以为多个分类变量创建一个data.frame。我想出了这段代码。但是在rbind和lapply步骤中输出混乱了。我想知道如何解决这个问题。同样,输出位于底部。

get.data<-function(){
  set.seed(1)
  cat1 <-sample(c(1,2), 100, replace=T)
  cont1<-rnorm(100, 25, 8)
  cont2<-rnorm(100, 0, 1)
  cont3<-rnorm(100, 6, 14.23)
  cont4<-rnorm(100, 25, 8)*runif(5, 0.1, 1)
  cat2<-sample(c(1,2,3,4),100,replace=TRUE)
  cat3<-sample(c(1,2,3,4,5),100,replace=TRUE)
  cat4<-sample(c("Caucasian","African American", "Latino", "Multi-Racial", "No   
     Response"),100,replace=TRUE)
  group<-sample(c(0,1), 100, replace=T)
  sex<-sample(c("male", "female"), 100, replace=T)
  one  <<-data.frame(group, sex,cat1, cont1, cont2, cont3, cont4,cat2,cat3,cat4)
}

get.data()

make.table<-function(catvars,group,data){
  attach(data)
get.chi.stuff<-function(cat, group){
  long <- table(cat,group)
  test<-chisq.test(cat,group)
  kk<-c(test$statistic,test$p.value,test$method)
  res <- data.frame(matrix(NA,nrow(long),7))
  names(res) <- c("Variable", "Response", "Group1.Freq", "Group2.Freq",
              "Test.Stat", "p.value", "method")
  res[1,1] <- deparse(substitute(cat))
  res[,2] <- row.names(long)
  res[,3:4] <- long[,1:2]
  res[1,5:7] <- kk

  return(res)
}
tables<<-do.call(rbind,lapply(data[,catvars],get.chi.stuff,group=group))

detach(data)
}
make.table(catvars=catvars,group=group, data=one)

OUTPUT(它目前没有格式化,但问题是row.names和Variable。其余的看起来很好)

row.names   Variable    Response    Group1.Freq Group2.Freq Test.Stat   p.value method
    cat2.1  X[[1L]] 1   17  12  1.16345446805217    0.761782111152171   Pearson's Chi-squared test
    cat2.2  NA  2   11  13  NA  NA  NA
    cat2.3  NA  3   13  9   NA  NA  NA
    cat2.4  NA  4   13  12  NA  NA  NA
    cat3.1  X[[2L]] 1   8   15  5.68288366946583    0.224115426983988   Pearson's Chi-squared test
 6  cat3.2  NA  2   10  7   NA  NA  NA
 7  cat3.3  NA  3   14  11  NA  NA  NA
 8  cat3.4  NA  4   8   7   NA  NA  NA
 9  cat3.5  NA  5   14  6   NA  NA  NA
 10 cat4.1  X[[3L]] African American    9   18  8.73180996607079    0.0681639164530817  Pearson's Chi-squared test
 11 cat4.2  NA  Caucasian   14  5   NA  NA  NA
 12 cat4.3  NA  Latino  6   7   NA  NA  NA
 13 cat4.4  NA  Multi-Racial    14  9   NA  NA  NA
 14 cat4.5  NA  No   
Response    11  7   NA  NA  NA
 15 sex.1   X[[4L]] female  30  17  2.74327353028067    0.0976645121155453  Pearson's Chi-squared test with Yates' continuity correction
 16 sex.2   NA  male    24  29  NA  NA  NA

1 个答案:

答案 0 :(得分:1)

由于您使用的是merge,因此会创建一个带有回收的数据框,这不是您想要的res

您已在变量reslongkk中的test中创建了所需的所有组件。因此,现在需要以您想要的特定格式将它们拼接在一起。

这不是很优雅,因为我们正在手工逐列构建所需的结果。你可以把所有这些都扔进一个函数。

res <- data.frame(matrix(NA,nrow(long),7))
names(res) <- c("Variable", "Response", "Group1.Freq", "Group2.Freq",
                  "Test.Stat", "p.value", "method")
res[1,1] <- names(attr(test$observed, "dimnames")[1])
res[,2] <- row.names(long)
res[,3:4] <- long[,1:2]
res[1,5:7] <- kk
res
#  Variable Response Group1.Freq Group2.Freq        Test.Stat
# 1     cat2        1          17          12 1.16345446805217
# 2     <NA>        2          11          13             <NA>
# 3     <NA>        3          13           9             <NA>
# 4     <NA>        4          13          12             <NA>
#            p.value                     method
# 1 0.761782111152171 Pearson's Chi-squared test
# 2              <NA>                       <NA>
# 3              <NA>                       <NA>
# 4              <NA>                       <NA>