我使用ks.test()将两个向量(data_A_score_src和data_B_score_src)分别与另一个向量score_ref_k1(即K1)进行比较,
得到了如下结果:
# Reference scores (K1) that each source sample is compared against.
score_ref_k1 <- c(0.09651, 0.09543, 0.09122, 0.09458, 0.09382, 0.10158, 0.10339,
0.13594, 0.09458, 0.09296)
# Source scores for data_A.
data_A_score_src <- c(0.09293, 0.09838, 0.09866, 0.10866, 0.09726, 0.10731,
0.09866, 0.09398, 0.10007, 0.10408)
# Source scores for data_B.
data_B_score_src <- c(0.04741, 0.0621, 0.09606, 0.08851, 0.05063, 0.39775, 0.05509,
0.10784, 0.0468, 0.04782)
# One-sided two-sample KS test with x = data_A source scores and
# y = K1 reference scores. `alternative = "g"` abbreviates "greater"
# (alternative hypothesis: the CDF of x lies above that of y), so the
# order of the first two arguments affects the result.
ks.test(data_A_score_src, score_ref_k1, exact = FALSE, alternative = "g")
#> Warning in ks.test(data_A_score_src, score_ref_k1, exact = FALSE,
#> alternative = "g"): p-value will be approximate in the presence of ties
#>
#> Two-sample Kolmogorov-Smirnov test
#>
#> data: data_A_score_src and score_ref_k1
#> D^+ = 0.1, p-value = 0.9048
#> alternative hypothesis: the CDF of x lies above that of y
# Same one-sided test for data_B: x = data_B source scores, y = K1 reference.
ks.test(data_B_score_src, score_ref_k1, exact = FALSE, alternative = "g")
#> Warning in ks.test(data_B_score_src, score_ref_k1, exact = FALSE,
#> alternative = "g"): p-value will be approximate in the presence of ties
#>
#> Two-sample Kolmogorov-Smirnov test
#>
#> data: data_B_score_src and score_ref_k1
#> D^+ = 0.7, p-value = 0.007447
#> alternative hypothesis: the CDF of x lies above that of y
然后我尝试用purrr::map来完成同样的检验。首先是数据:
library(tidyverse)
# Nested example data (looks like dput() output): a 2-row tibble with one row
# per `src` (row 1 = data_B, row 2 = data_A), both with `ref` = K1. The `data`
# list-column holds, for each row, a 10-row tibble with the columns
# `score_ref` (the K1 reference scores) and `score_src` (that source's scores).
all_comb <- structure(list(src = structure(1:2, .Label = c("data_B", "data_A"),
class = "factor"), ref = structure(c(1L, 1L), .Label = "K1", class = "factor"),
data = list(structure(list(score_ref = c(0.09651, 0.09543, 0.09122, 0.09458,
0.09382, 0.10158, 0.10339, 0.13594, 0.09458, 0.09296), score_src = c(0.04741,
0.0621, 0.09606, 0.08851, 0.05063, 0.39775, 0.05509, 0.10784, 0.0468,
0.04782)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"),
.Names = c("score_ref", "score_src")), structure(list(score_ref = c(0.09651,
0.09543, 0.09122, 0.09458, 0.09382, 0.10158, 0.10339, 0.13594, 0.09458,
0.09296), score_src = c(0.09293, 0.09838, 0.09866, 0.10866, 0.09726,
0.10731, 0.09866, 0.09398, 0.10007, 0.10408)), row.names = c(NA, -10L),
class = c("tbl_df", "tbl", "data.frame"), .Names = c("score_ref", "score_src")))),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -2L), .Names = c("src",
"ref", "data"))
# Display only: expand the nested `data` list-column into one row per score
# pair. The result is not assigned, so `all_comb` itself stays nested.
all_comb %>% unnest(data)
#> # A tibble: 20 × 4
#> src ref score_ref score_src
#> <fctr> <fctr> <dbl> <dbl>
#> 1 data_B K1 0.09651 0.04741
#> 2 data_B K1 0.09543 0.06210
#> 3 data_B K1 0.09122 0.09606
#> 4 data_B K1 0.09458 0.08851
#> 5 data_B K1 0.09382 0.05063
#> 6 data_B K1 0.10158 0.39775
#> 7 data_B K1 0.10339 0.05509
#> 8 data_B K1 0.13594 0.10784
#> 9 data_B K1 0.09458 0.04680
#> 10 data_B K1 0.09296 0.04782
#> 11 data_A K1 0.09651 0.09293
#> 12 data_A K1 0.09543 0.09838
#> 13 data_A K1 0.09122 0.09866
#> 14 data_A K1 0.09458 0.10866
#> 15 data_A K1 0.09382 0.09726
#> 16 data_A K1 0.10158 0.10731
#> 17 data_A K1 0.10339 0.09866
#> 18 data_A K1 0.13594 0.09398
#> 19 data_A K1 0.09458 0.10007
#> 20 data_A K1 0.09296 0.10408
然后我用下面的代码进行同样的检验:
# Run the one-sided KS test once per row of `all_comb` on its nested `data`
# tibble, tidy each result with broom, and keep only src, ref and p.value.
# NOTE(review): ks.test() receives score_ref as x and score_src as y here,
# whereas the stand-alone calls pass the source scores as x and the K1
# reference as y. With the one-sided alternative "g" the argument order matters.
all_comb %>%
mutate(vt = purrr::map(data, ~ks.test(.$score_ref, .$score_src, exact=FALSE, alternative="g")),
tidied = purrr::map(vt, broom::tidy)) %>%
tidyr::unnest(tidied) %>%
select(src, ref, p.value)
给出了:
# A tibble: 2 × 3
src ref p.value
<fctr> <fctr> <dbl>
1 data_B K1 0.9048374
2 data_A K1 0.0820850
注意与最上面(直接调用ks.test)得到的结果之间的差异:
#> data: data_A_score_src and score_ref_k1
#> D^+ = 0.1, p-value = 0.9048
#> data: data_B_score_src and score_ref_k1
#> D^+ = 0.7, p-value = 0.007447
为什么会这样?我该如何修正我的purrr/dplyr写法?
答案 0 :(得分:4)
如果你在ks.test中交换score_src和score_ref这两个参数的顺序,就会得到与直接调用相同的值:
# Per-row one-sided KS test on the nested data. score_src is passed to
# ks.test() as x and score_ref as y, i.e. the same argument order as the
# stand-alone ks.test() calls.
all_comb %>%
  mutate(
    vt = purrr::map(
      data,
      function(d) {
        ks.test(d$score_src, d$score_ref, exact = FALSE, alternative = "g")
      }
    ),
    # Tidy each test result so that its p.value can be unnested into a column.
    tidied = purrr::map(vt, broom::tidy)
  ) %>%
  tidyr::unnest(tidied) %>%
  select(src, ref, p.value)
# A tibble: 2 × 3
src ref p.value
<fctr> <fctr> <dbl>
1 data_B K1 0.007446583
2 data_A K1 0.904837418