假设这样的数据结构:
ID testA_wave1 testA_wave2 testA_wave3 testB_wave1 testB_wave2 testB_wave3
1 1 3 2 3 6 5 3
2 2 4 4 4 3 6 6
3 3 10 2 1 4 4 4
4 4 5 3 12 2 7 4
5 5 5 3 9 2 4 2
6 6 10 0 2 6 6 5
7 7 6 8 4 6 8 3
8 8 1 5 4 5 6 0
9 9 3 2 7 8 4 4
10 10 4 9 5 11 8 8
我想要实现的是分别为每个测试计算一个配对的t检验(在这种情况下,这意味着testA和testB,但实际上,我有更多的测试)。我想这样做,以便将给定测试的第一波与同一测试的所有其他后续波进行比较(在testA的情况下,这意味着testA_wave1与testA_wave2以及testA_wave1与testA_wave3)。
这样,我能够实现它:
df %>%
gather(variable, value, -ID) %>%
mutate(wave_ID = paste0("wave", parse_number(variable)),
variable = ifelse(grepl("testA", variable), "testA",
ifelse(grepl("testB", variable), "testB", NA_character_))) %>%
group_by(wave_ID, variable) %>%
summarise(value = list(value)) %>%
spread(wave_ID, value) %>%
group_by(variable) %>%
mutate(p_value_w1w2 = t.test(unlist(wave1), unlist(wave2), paired = TRUE)$p.value,
p_value_w1w3 = t.test(unlist(wave1), unlist(wave3), paired = TRUE)$p.value) %>%
select(variable, matches("(p_value)"))
variable p_value_w1w2 p_value_w1w3
<chr> <dbl> <dbl>
1 testA 0.664 0.921
2 testB 0.146 0.418
但是,我希望看到不同/更优雅的解决方案,它们给出相似的结果。我主要是在寻找dplyr
/ tidyverse
解决方案,但是如果有一种完全不同的方法来实现它,我并不反对。
样本数据:
set.seed(123)
df <- data.frame(ID = 1:20,
testA_wave1 = round(rnorm(20, 5, 3), 0),
testA_wave2 = round(rnorm(20, 5, 3), 0),
testA_wave3 = round(rnorm(20, 5, 3), 0),
testB_wave1 = round(rnorm(20, 5, 3), 0),
testB_wave2 = round(rnorm(20, 5, 3), 0),
testB_wave3 = round(rnorm(20, 5, 3), 0))
答案 0 :(得分:10)
自dplyr
0.8.0起,我们可以使用group_split
将数据帧拆分为数据帧列表。
我们gather
将数据帧转换为长格式,然后将separate
列的名称key
分为不同的列(test
和{{1} }。然后,我们使用wave
将数据框基于group_split
列拆分为列表。对于列表中的每个数据帧,我们将其test
转换为宽格式,然后计算spread
的值,然后使用t.test
将其重新绑定为一个数据帧。
map_dfr
我们手动执行上述t检验,因为仅需要计算两个值。如果library(tidyverse)
df %>%
gather(key, value, -ID) %>%
separate(key, c("test", "wave")) %>%
group_split(test) %>% #Previously we had to do split(.$test) here
map_dfr(. %>%
spread(wave, value) %>%
summarise(test = first(test),
p_value_w1w2 = t.test(wave1, wave2, paired = TRUE)$p.value,
p_value_w1w3 = t.test(wave1, wave3, paired = TRUE)$p.value))
# A tibble: 2 x 3
# test p_value_w1w2 p_value_w1w3
# <chr> <dbl> <dbl>
#1 testA 0.664 0.921
#2 testB 0.146 0.418
列的数量更多,则可能变得很麻烦。在这种情况下,我们可以做到
wave...
此处将对每个带有“ wave1”列的“ wave ..”列执行t检验。
由于您也可以使用其他解决方案,因此请尝试使用纯基础R解决方案
df %>%
gather(key, value, -ID) %>%
separate(key, c("test", "wave")) %>%
group_split(test) %>%
map_dfr(function(data)
data %>%
spread(wave, value) %>%
summarise_at(vars(setdiff(unique(data$wave), "wave1")),
function(x) t.test(.$wave1, x, paired = TRUE)$p.value) %>%
mutate(test = first(data$test)))
# wave2 wave3 test
# <dbl> <dbl> <chr>
#1 0.664 0.921 testA
#2 0.146 0.418 testB
我们根据sapply(split.default(df[-1], sub("_.*", "", names(df[-1]))), function(x)
c(p_value_w1w2 = t.test(x[[1]], x[[2]],paired = TRUE)$p.value,
p_value_w1w3 = t.test(x[[1]], x[[3]],paired = TRUE)$p.value))
# testA testB
#p_value_w1w2 0.6642769 0.1456059
#p_value_w1w3 0.9209554 0.4184603
拆分列,并创建一个数据框列表,并将test*
应用于每个数据框的不同列组合。
答案 1 :(得分:5)
这是一种使用purrr
的方法。
library("tidyverse")
set.seed(123)
df <- tibble(
ID = 1:20,
testA_wave1 = round(rnorm(20, 5, 3), 0),
testA_wave2 = round(rnorm(20, 5, 3), 0),
testA_wave3 = round(rnorm(20, 5, 3), 0),
testB_wave1 = round(rnorm(20, 5, 3), 0),
testB_wave2 = round(rnorm(20, 5, 3), 0),
testB_wave3 = round(rnorm(20, 5, 3), 0)
)
pvalues <- df %>%
# From wide tibble to long tibble
gather(test, value, -ID) %>%
separate(test, c("test", "wave")) %>%
# Not stricly necessary; will order the waves alphabetically instead
mutate(wave = parse_number(wave)) %>%
inner_join(., ., by = c("ID", "test")) %>%
# If there are two waves w1 and w2,
# we end up with pairs (w1, w1), (w1, w2), (w2, w1) and (w2, w2),
# so filter out to keep the pairing (w1, w2) only
filter(wave.x == 1, wave.x < wave.y) %>%
nest(ID, value.x, value.y) %>%
mutate(pvalue = data %>%
# Perform the test
map(~t.test(.$value.x, .$value.y, paired = TRUE)) %>%
map(broom::tidy) %>%
# Also not strictly necessary; you might want to keep all
# information about the test: estimate, statistic, etc.
map_dbl(pluck, "p.value"))
pvalues
#> # A tibble: 4 x 5
#> test wave.x wave.y data pvalue
#> <chr> <dbl> <dbl> <list> <dbl>
#> 1 testA 1 2 <tibble [20 x 3]> 0.664
#> 2 testA 1 3 <tibble [20 x 3]> 0.921
#> 3 testB 1 2 <tibble [20 x 3]> 0.146
#> 4 testB 1 3 <tibble [20 x 3]> 0.418
pvalues %>%
# Drop the data in order to pivot the table
select(- data) %>%
unite("waves", wave.x, wave.y, sep = ":") %>%
spread(waves, pvalue)
#> # A tibble: 2 x 3
#> test `1:2` `1:3`
#> <chr> <dbl> <dbl>
#> 1 testA 0.664 0.921
#> 2 testB 0.146 0.418
由reprex package(v0.2.1)于2019-03-08创建
答案 2 :(得分:3)
要提出data.table
解决方案:
library(stringr)
library(data.table)
library(magrittr) ## for the pipe operator
dt_sol <- function(df) {
## create patterns for the melt operation:
## all columns from the same wave should go in one column
grps <- str_extract(names(df)[-1],
"[0-9]+$") %>%
unique() %>%
paste0("wave", ., "$")
grp_names <- sub("\\$", "", grps)
## melt the data table: all test*_wave_i data go into column wave_i
df.m <- melt(df,
measure = patterns(grps),
value.name = grp_names,
variable.name = "test")
## define the names for the new column, we want to extract estimate and p.value
new_cols <- c(outer(c("p.value", "estimate"),
grp_names[-1],
paste, sep = "_"))
## use lapply on .SD which equals to all wave_i columns but the first one
## return estimate and p.value
df.m[,
setNames(unlist(lapply(.SD,
function(col) {
t.test(wave1, col, paired = TRUE)[c("p.value", "estimate")]
}), recursive = FALSE), new_cols),
test, ## group by each test
.SDcols = grp_names[-1]]
}
dt <- copy(df)
setDT(dt)
dt_sol(dt)
# test p.value_wave2 estimate_wave2 p.value_wave3 estimate_wave3
# 1: 1 0.6642769 0.40 0.9209554 -0.1
# 2: 2 0.1456059 -1.45 0.4184603 0.7
基准
将data.table
解决方案与tidyverse
解决方案进行比较,我们得出data.table
解决方案的速度提高了3倍:
dp_sol <- function(df) {
df %>%
gather(test, value, -ID) %>%
separate(test, c("test", "wave")) %>%
inner_join(., ., by = c("ID", "test")) %>%
filter(wave.x == 1, wave.x < wave.y) %>%
nest(ID, value.x, value.y) %>%
mutate(pvalue = data %>%
map(~t.test(.$value.x, .$value.y, paired = TRUE)) %>%
map(broom::tidy) %>%
map_dbl(pluck, "p.value"))
}
library(microbenchmark)
microbenchmark(dplyr = dp_sol(df),
data.table = dt_sol(dt))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr 6.119273 6.897456 7.639569 7.348364 7.996607 14.938182 100 b
# data.table 1.902547 2.307395 2.790910 2.758789 3.133091 4.923153 100 a
输入稍大:
make_df <- function(nr_tests = 2,
nr_waves = 3,
n_per_wave = 20) {
mat <- cbind(seq(1, n_per_wave),
matrix(round(rnorm(nr_tests * nr_waves * n_per_wave), 0),
nrow = n_per_wave))
c_names <- c(outer(1:nr_waves, 1:nr_tests, function(w, t) glue::glue("test{t}_wave{w}")))
colnames(mat) <- c("ID", c_names)
as.data.frame(mat)
}
df2 <- make_df(100, 100, 10)
dt2 <- copy(df2)
setDT(dt2)
microbenchmark(dplyr = dp_sol(df2),
data.table = dt_sol(dt2)
# Unit: seconds
# expr min lq mean median uq max neval cld
# dplyr 3.469837 3.669819 3.877548 3.821475 3.984518 5.268596 100 b
# data.table 1.018939 1.126244 1.193548 1.173175 1.252855 1.743075 100 a
答案 3 :(得分:1)
要将另一种更简洁的app = App()
app.mainloop()
解决方案引入混合,在这种混合中,我们将数据融合为长格式:
data.table