对data.frame中每行的特定列执行t检验

时间:2012-08-07 15:32:57

标签: r dataframe

我有一个特征数据集,每行对应一个特征。我想用t.test比较数据集中两个特定列的均值,即T_Mean和A_Mean(第2列和第8列)。我尝试用sapply来实现,但没有成功;示例数据集和我的代码如下。非常感谢任何能帮我修复代码的建议!

# Example data set: 7 rows, one per trait, holding summary statistics only
# (there are no raw observations in this object).
# Columns come in <group>_Mean / <group>_SD / <group>_N triples for the groups
# T, HZ and A -- presumably the mean, standard deviation and sample size of
# each group (inferred from the column names; confirm against the raw data).
# The structure() call looks like dput() output and is kept verbatim so the
# example stays exactly reproducible.
WW_Summary <- structure(list(Trait =
  c("Morph PC1", "Morph PC2", "Morph PC3", "Morph PC4", "Colour", "Delta15N", "Delta13C"),
  T_Mean = c(-0.000369133942377988, -0.478614374395391, -0.0429785751248986, 0.141563333908087, 5.09447415329768, 7.79253141831239, -20.3678994614004),
  T_SD = c(1.25617540601557, 0.994922039584068, 0.72787412346719, 0.5683273217636, 1.85452769780342, 1.56401940841295, 2.33461396773921),
  T_N = c(615, 615, 615, 615, 561, 557, 557),
  HZ_Mean = c(0.379669406453242, 0.307293731157124, -0.0499328976749929, -0.0563021988086238, 4.74712643678161, 8.4568926056338, -20.8209771126761),
  HZ_SD = c(1.27837645625113, 1.11890593333031, 0.71490087377916, 0.699316698091669, 1.90101932622734, 1.86547215761457, 1.9590774632374),
  HZ_N = c(1137, 1137, 1137, 1137, 1131, 1136, 1136),
  A_Mean = c(-0.818704170327851, -0.104449965981942, 0.157885253051751, -0.0437302662392194, 4.31320754716981, 9.79891783567134, -19.955250501002),
  A_SD = c(1.29535566832773, 0.97478498249366, 0.678515276691309, 0.563663991917263, 1.63029422418466, 2.06376134152221, 1.47077203157055),
  A_N = c(527, 527, 527, 527, 530, 499, 499)),
  .Names = c("Trait", "T_Mean", "T_SD", "T_N", "HZ_Mean", "HZ_SD", "HZ_N", "A_Mean", "A_SD", "A_N"),
  class = "data.frame", row.names = c(NA, -7L))

## Perform t-test separately for each trait (row) comparing means for T_Mean & A_Mean (columns 2 and 8)
## NOTE(review): this is the asker's non-working attempt, kept as posted.
## sapply() over a data.frame visits each selected *column*, so t.test() below
## runs a one-sample test on the 7 values of T_Mean and then on the 7 values of
## A_Mean -- it does not compare the two columns row by row.
WW_Summary_T <- data.frame(t(sapply(WW_Summary[,c(2,8)], function(temp)
                unlist(t.test(temp, alternative = c("two.sided"))[c("statistic",
                "parameter", "p.value", "conf.int")]))))

2 个答案:

答案 0 :(得分:3)

我认为你只是用错了apply函数族中的成员。

你可以试试下面的代码,看看它给出什么结果?

# Run t.test() once per row (MARGIN = 1) on that row's (T_Mean, A_Mean) pair
# and keep the statistic, degrees of freedom, p-value and confidence interval.
# apply() returns a matrix with one column per row of the input.
# The argument was previously misspelled "aslternative"; a misspelled name is
# swallowed by t.test()'s `...` and silently ignored, so it is spelled out here.
# NOTE(review): each row supplies only two numbers, so this is a one-sample
# test on those two values; the SD and N columns are not used.
apply(WW_Summary[, c(2, 8)], 1,
      function(temp) unlist(t.test(temp, alternative = "two.sided")
      [c("statistic", "parameter", "p.value", "conf.int")]))

更新

@dickoa是正确的:你可能在这里做了错误的计算。不过,同样的概念适用:

# Two-sample t-test from summary statistics, one test per trait (row).
# Needs tsum.test() from the BSDA package to be available.
# Columns 2:4 are T_Mean/T_SD/T_N and columns 8:10 are A_Mean/A_SD/A_N, so
# inside the function temp[[1]]..temp[[3]] describe group T and
# temp[[4]]..temp[[6]] describe group A.
#
# The numeric components are selected *before* unlist(). Unlisting the whole
# test result also pulls in its text fields, which coerces every value to a
# character string. Selecting first keeps the result columns numeric; the
# names after unlist() are unchanged: statistic.t, parameters.df, p.value,
# conf.int1, conf.int2. Numeric columns print with R's default number of
# significant digits, so fewer digits are shown than in the transcript below.
data.frame(cbind(
  WW_Summary[1],
  t(apply(WW_Summary[, c(2:4, 8:10)], 1, function(temp) {
    res <- tsum.test(mean.x = temp[[1]], s.x = temp[[2]], n.x = temp[[3]],
                     mean.y = temp[[4]], s.y = temp[[5]], n.y = temp[[6]])
    unlist(res[c("statistic", "parameters", "p.value", "conf.int")])
  }))
))
#       Trait       statistic.t    parameters.df              p.value
# 1 Morph PC1  10.7920944667109 1102.17477516966 6.99739270551733e-26
# 2 Morph PC2 -6.40501752763609  1119.8038108643 2.20872274986877e-10
# 3 Morph PC3  -4.8221965806503 1131.93025335657 1.61345381252079e-06
# 4 Morph PC4  5.51685228304417 1116.04949237415 4.28798959831121e-08
# 5    Colour  7.40032254940697 1083.43427031755 2.71950155801888e-13
# 6  Delta15N -17.6468194524627 923.361537684413 2.79180235004071e-60
# 7  Delta13C -3.47262865160519 949.662208494884 0.000538633884372937
#            conf.int1          conf.int2
# 1  0.669552939095012  0.967117133675934
# 2  -0.48878427646537 -0.259544540361528
# 3   -0.2825915783163 -0.119136078036999
# 4  0.119393147514491  0.251194052780122
# 5  0.574117940682135  0.988415271573606
# 6  -2.22952047960588  -1.78325235511202
# 7 -0.645846712936065 -0.179451207860735

答案 1 :(得分:3)

由于您的数据已经按行汇总(只有7个观测值),因此您无法直接使用基础R中的t.test函数。

在这种特殊情况下,您可以使用BSDA包中的tsum.test函数:

# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less clear message.
library(BSDA)  # tsum.test(); run install.packages("BSDA") if it is missing
library(plyr)  # ddply() and .()

# For every trait, run a two-sample t-test of group T against group A from the
# summary statistics (mean, SD, N), using tsum.test()'s default settings.
# The test is computed once per trait and its pieces are then picked out,
# instead of repeating the identical tsum.test() call for every output column.
# Result columns: Trait, stat, pval, parameter, confint_lower, confint_upper.
ddply(WW_Summary, .(Trait), function(trait_row) {
  res <- tsum.test(
    mean.x = trait_row$T_Mean, s.x = trait_row$T_SD, n.x = trait_row$T_N,
    mean.y = trait_row$A_Mean, s.y = trait_row$A_SD, n.y = trait_row$A_N
  )
  # unname() drops the "t"/"df" element names so they cannot leak into the
  # row names of the one-row data frame.
  data.frame(
    stat = unname(res$statistic),
    pval = unname(res$p.value),
    parameter = unname(res$parameters),
    confint_lower = res$conf.int[1],
    confint_upper = res$conf.int[2]
  )
})


##       Trait     stat       pval parameter confint_lower
## 1    Colour   7.4003 2.7195e-13   1083.43       0.57412
## 2  Delta13C  -3.4726 5.3863e-04    949.66      -0.64585
## 3  Delta15N -17.6468 2.7918e-60    923.36      -2.22952
## 4 Morph PC1  10.7921 6.9974e-26   1102.17       0.66955
## 5 Morph PC2  -6.4050 2.2087e-10   1119.80      -0.48878
## 6 Morph PC3  -4.8222 1.6135e-06   1131.93      -0.28259
## 7 Morph PC4   5.5169 4.2880e-08   1116.05       0.11939
##   confint_upper
## 1       0.98842
## 2      -0.17945
## 3      -1.78325
## 4       0.96712
## 5      -0.25954
## 6      -0.11914
## 7       0.25119

我们还可以绘制误差条来检查平均值之间可能的重叠。

# Append the standard error of the mean (SD / sqrt(N)) for groups T, A and HZ
# as new columns T_STDERR, A_STDERR and HZ_STDERR, processing one trait at a
# time through ddply().
WW_Summary <- ddply(
  WW_Summary,
  .(Trait),
  function(trait_rows) {
    transform(
      trait_rows,
      T_STDERR = T_SD / sqrt(T_N),
      A_STDERR = A_SD / sqrt(A_N),
      HZ_STDERR = HZ_SD / sqrt(HZ_N)
    )
  }
)


library(ggplot2)  # ggplot(), geom_errorbar(), labs(), scale_colour_manual()

# Error bars of mean +/- one standard error per trait, one layer for group T
# and one for group A (uses the *_STDERR columns computed just above).
# colour is set inside aes() to the constant strings "black" and "red" so both
# layers share a single legend; scale_colour_manual() then assigns the actual
# colours and relabels the two legend entries as "T" and "A".
# `labels` is spelled out in full rather than relying on partial matching of
# the abbreviated `label`.
ggplot(WW_Summary, aes(Trait)) +
  geom_errorbar(aes(y = T_Mean, ymax = T_Mean + T_STDERR,
                    ymin = T_Mean - T_STDERR, colour = "black")) +
  geom_errorbar(aes(y = A_Mean, ymax = A_Mean + A_STDERR,
                    ymin = A_Mean - A_STDERR, colour = "red")) +
  labs(y = "Mean") +
  scale_colour_manual(values = c("black", "red"), labels = c("T", "A"),
                      name = "")

Error bar