大家好,
我不太擅长在R中编写循环或函数,至今仍没有真正弄清楚该怎么写。目前,我需要编写一个循环/函数(不确定哪一个更合适),对几个不同的数据框执行t检验。
我的数据类似于:
# Reproducible example data: two groups (df_1_*, df_2_*), one frame per year
# (2008-2010), each with 100 rows and four numeric columns a-d.
set.seed(694)

# Group 1, 2008: the only frame drawn from a uniform distribution on [0, 100].
df_1_08 <- data.frame(
  year = 2008,
  a = runif(100, max = 100), b = runif(100, max = 100),
  c = runif(100, max = 100), d = runif(100, max = 100)
)

# Every remaining frame holds standard-normal draws (mean 0, sd 1).
df_1_09 <- data.frame(
  year = 2009,
  a = rnorm(100), b = rnorm(100), c = rnorm(100), d = rnorm(100)
)
df_1_10 <- data.frame(
  year = 2010,
  a = rnorm(100), b = rnorm(100), c = rnorm(100), d = rnorm(100)
)
df_2_08 <- data.frame(
  year = 2008,
  a = rnorm(100), b = rnorm(100), c = rnorm(100), d = rnorm(100)
)
df_2_09 <- data.frame(
  year = 2009,
  a = rnorm(100), b = rnorm(100), c = rnorm(100), d = rnorm(100)
)
df_2_10 <- data.frame(
  year = 2010,
  a = rnorm(100), b = rnorm(100), c = rnorm(100), d = rnorm(100)
)
# Write Loop to do t-test between dfs 08, 09, 10 comparing columns a, b, c, d and storing the full results in a df
基本上,我需要对这些数据做的是:按年份(2008、2009、2010)对特定列运行t检验,即对df_1_08与df_2_08的所有列(a、b、c、d)做t检验,然后把结果(t统计量、p值等)存储在一个数据框中。这听起来很适合用循环来完成。但是我还需要对每一年(2008年、2009年和2010年)都这样做,并将结果存储在不同的数据框中,所以这听起来又很适合写成一个函数。
我不确定该如何编写,所以想请大家在编写这些循环/函数方面给予一些帮助。提前感谢您提供的任何帮助。
我还可以将数据帧组合成一个大的df,其中一列标识原始数据帧号(即df1或df2),一列标识数据帧年(即2008,2009,2010)。它看起来像这样:
# Stack the three yearly frames of each group, tag every row with its group
# ID (1 = df_1_*, 2 = df_2_*), then combine both groups into one long frame.
df1 <- rbind(df_1_08, df_1_09, df_1_10)
df1$ID <- 1
# Group 2 has to be assembled from the df_2_* frames; stacking the df_1_*
# frames here would make the two ID groups carry identical data.
df2 <- rbind(df_2_08, df_2_09, df_2_10)
df2$ID <- 2
master.df <- rbind(df1, df2)
我不确定编写一个循环/函数、用master.df来运行t.test是否会更容易。在那个df中,我基本上需要在循环或函数中执行以下操作:
1. 将master.df划分为df1和df2两个子集;
2. 再将df1和df2按年份划分子集;
3. 对a、b、c和d列运行t.test;
4. 将t.test的输出(即t统计量、p值等)存储在一个可以打印的data.frame中。

答案 0 :(得分:2)
怎么样:
# Example inputs: two groups (df_1_*, df_2_*), one frame per year, each with
# 100 uniform draws on [0, 100] in columns a-d. No seed is set, so the
# numbers differ from run to run.
df_1_08 <- data.frame(
  year = 2008,
  a = runif(100, 0, 100), b = runif(100, 0, 100),
  c = runif(100, 0, 100), d = runif(100, 0, 100)
)
df_1_09 <- data.frame(
  year = 2009,
  a = runif(100, 0, 100), b = runif(100, 0, 100),
  c = runif(100, 0, 100), d = runif(100, 0, 100)
)
df_1_10 <- data.frame(
  year = 2010,
  a = runif(100, 0, 100), b = runif(100, 0, 100),
  c = runif(100, 0, 100), d = runif(100, 0, 100)
)
df_2_08 <- data.frame(
  year = 2008,
  a = runif(100, 0, 100), b = runif(100, 0, 100),
  c = runif(100, 0, 100), d = runif(100, 0, 100)
)
df_2_09 <- data.frame(
  year = 2009,
  a = runif(100, 0, 100), b = runif(100, 0, 100),
  c = runif(100, 0, 100), d = runif(100, 0, 100)
)
df_2_10 <- data.frame(
  year = 2010,
  a = runif(100, 0, 100), b = runif(100, 0, 100),
  c = runif(100, 0, 100), d = runif(100, 0, 100)
)

# Collect the yearly frames of each group by name. The patterns are anchored
# so objects whose names merely contain "df_1" / "df_2" are not picked up,
# and mget() looks the names up in the same environment that ls() listed.
dfs_1.names <- ls(pattern = "^df_1_")
dfs_2.names <- ls(pattern = "^df_2_")
dfs_1.list <- unname(mget(dfs_1.names))
dfs_2.list <- unname(mget(dfs_2.names))
stopifnot(length(dfs_1.list) > 0, length(dfs_2.list) > 0)

# Stack each group into one long data frame (a data.frame, despite the
# ".mtrx" suffix) so rows can be selected by year rather than by list position.
dfs_1.mtrx <- do.call("rbind", dfs_1.list)
dfs_2.mtrx <- do.call("rbind", dfs_2.list)

# Only years and measurement columns present in BOTH groups are compared.
years <- intersect(unique(dfs_1.mtrx[["year"]]), unique(dfs_2.mtrx[["year"]]))
# [1] 2008 2009 2010
columns <- intersect(
  setdiff(colnames(dfs_1.mtrx), "year"),
  setdiff(colnames(dfs_2.mtrx), "year")
)
# [1] "a" "b" "c" "d"

# Preallocate one result row per (year, column) pair.
n.tests <- length(years) * length(columns)
df.ttest <- data.frame(
  year = rep(NA_real_, n.tests),
  column = rep(NA_character_, n.tests),
  tstat = rep(NA_real_, n.tests),
  p.value = rep(NA_real_, n.tests),
  degreesf = rep(NA_real_, n.tests),
  low.conf = rep(NA_real_, n.tests),
  up.conf = rep(NA_real_, n.tests),
  data.name = rep(NA_character_, n.tests),
  stringsAsFactors = FALSE
)

count <- 0
for (yr in years) {
  # Select rows by the year value itself, so the pairing stays correct even
  # when the two groups do not list the same years in the same order.
  rows_1 <- dfs_1.mtrx[["year"]] == yr
  rows_2 <- dfs_2.mtrx[["year"]] == yr
  for (j in columns) {
    # Two-sample t-test with t.test() defaults, run on plain numeric vectors.
    ttest <- t.test(dfs_1.mtrx[rows_1, j], dfs_2.mtrx[rows_2, j])
    # Human-readable label; the year suffix is not zero-padded, so 2008 is
    # rendered as "df_1_8".
    ttest$data.name <- paste(
      paste0("df_1_", yr - 2000, "$", j), "and",
      paste0("df_2_", yr - 2000, "$", j)
    )
    count <- count + 1
    df.ttest[count, "year"] <- yr
    df.ttest[count, "column"] <- j
    df.ttest[count, "tstat"] <- ttest$statistic
    df.ttest[count, "p.value"] <- ttest$p.value
    df.ttest[count, "degreesf"] <- ttest$parameter
    df.ttest[count, "low.conf"] <- ttest$conf.int[1]
    df.ttest[count, "up.conf"] <- ttest$conf.int[2]
    df.ttest[count, "data.name"] <- ttest$data.name
  }
}
df.ttest
看起来像:
year column tstat p.value degreesf low.conf up.conf data.name
1 2008 a 1.0607688 0.29008725 197.9914 -3.7038792 12.327117 df_1_8$a and df_2_8$a
2 2008 b 0.3311722 0.74086573 197.3689 -6.6956039 9.398291 df_1_8$b and df_2_8$b
3 2008 c 1.0410813 0.29910773 197.9405 -3.7582835 12.164152 df_1_8$c and df_2_8$c
4 2008 d 1.2623350 0.20834791 193.4532 -2.9384999 13.387911 df_1_8$d and df_2_8$d
5 2009 a -0.5764091 0.56500626 194.1686 -10.1442158 5.555762 df_1_9$a and df_2_9$a
6 2009 b -1.5222524 0.12954190 197.9248 -14.4317793 1.857603 df_1_9$b and df_2_9$b
7 2009 c -0.1744245 0.86171283 195.0217 -8.6590932 7.251902 df_1_9$c and df_2_9$c
8 2009 d 0.0839337 0.93319409 197.6654 -7.5768817 8.250526 df_1_9$d and df_2_9$d
9 2010 a 1.9125742 0.05724768 197.7406 -0.2353887 15.378495 df_1_10$a and df_2_10$a
10 2010 b 0.9024489 0.36792603 196.0224 -4.0977460 11.011904 df_1_10$b and df_2_10$b
11 2010 c -0.9735756 0.33145768 197.5899 -12.2641333 4.157135 df_1_10$c and df_2_10$c
12 2010 d 0.8721498 0.38418378 197.8601 -4.5311820 11.717207 df_1_10$d and df_2_10$d