我有如下data.frame:
# Simulated data: 1000 normal draws (mean 3, sd 2); the first 500 rows are
# labelled "GroupA" and the remaining 500 "GroupB".
x <- rnorm(n = 1000, mean = 3, sd = 2)
groups <- rep(c("GroupA", "GroupB"), times = c(500, 500))
df <- data.frame(x = x, groups = groups)
使用dplyr,我可以从df中随机抽取100行,然后计算GroupA和GroupB均值之间的差异:
# Draw one random sample of 100 rows from df and compute the difference
# between the two group means of x (second group minus first group).
df_difference_means <- df %>%
  # sample_n() draws the rows directly, so neither the deprecated
  # add_rownames() nor a hard-coded 1:1000 index (tied to nrow(df)) is needed.
  sample_n(100) %>%
  group_by(groups) %>%
  summarise(mean.x = mean(x)) %>%
  # The grouped summary lists the groups in sorted order, so row 1 is GroupA
  # and row 2 is GroupB: the result is GroupB minus GroupA.
  summarise(difference.mean.x = mean.x[2] - mean.x[1]) %>%
  mutate(.replicate = 1) %>%
  # Return a plain data.frame, as in the original.
  as.data.frame()
difference.mean.x .replicate
1 -0.7258672 1
如何使用dplyr将此过程重复100次,并将结果输出为一个data.frame?结果data.frame应该类似于df_difference_means_100:
# Mock-up of the desired result: one mean difference per replicate, with
# replicates numbered 1 to 100.
difference.mean.x <- rnorm(n = 100, mean = -0.72, sd = 2)
.replicate <- seq_len(100)
df_difference_means_100 <- data.frame(
  difference.mean.x = difference.mean.x,
  .replicate = .replicate
)
df_difference_means_100
difference.mean.x .replicate
1 -1.74745341 1
2 -1.60671744 2
3 -0.73216685 3
4 2.53595482 4
5 -2.13187162 5
6 0.42921334 6
7 -1.23031115 7
8 2.66900128 8
9 -0.26267355 9
10 0.97573805 10
11 4.38242693 11
12 -2.09175166 12
13 1.17403184 13
14 0.77553541 14
15 -3.61322099 15
16 1.85055915 16
17 0.06395296 17
18 -1.42459781 18
19 2.90383461 19
20 -1.79359430 20
21 -0.43856161 21
22 1.81433832 22
23 3.15741676 23
24 -1.14643453 24
25 -2.14220126 25
26 -0.32972133 26
27 -0.27037302 27
28 2.20310891 28
29 3.05937838 29
30 0.11348566 30
31 0.09080867 31
32 -2.11559132 32
33 -0.50134470 33
34 0.31628255 34
35 0.96801232 35
36 3.42165046 36
37 2.47089399 37
38 -1.34196912 38
39 -1.11181326 39
40 -3.48664556 40
41 -2.49013457 41
42 3.67952537 42
43 -3.80781570 43
44 0.68793508 44
45 0.05869912 45
46 5.25205269 46
47 -3.00920009 47
48 -2.48109066 48
49 -0.22790952 49
50 1.41952375 50
51 0.79675613 51
52 1.13585093 52
53 0.63646903 53
54 0.56779986 54
55 -1.48099201 55
56 -0.24586261 56
57 3.16075196 57
58 -0.55765459 58
59 1.78498217 59
60 3.38490948 60
61 -0.09666898 61
62 -2.38897557 62
63 -0.50976285 63
64 4.25219676 64
65 -1.57526334 65
66 0.58006652 66
67 0.89549514 67
68 -0.17842015 68
69 -2.57422568 69
70 4.14008849 70
71 -3.48424762 71
72 -3.48788857 72
73 -4.22862573 73
74 1.98098272 74
75 0.73889898 75
76 -2.78759887 76
77 -0.75359051 77
78 -0.24062074 78
79 -0.39441863 79
80 -0.58710463 80
81 -2.95208480 81
82 -0.18225793 82
83 0.98356501 83
84 0.77963590 84
85 -1.21736133 85
86 1.36733389 86
87 -0.41273956 87
88 4.58347146 88
89 0.37946472 89
90 -5.02405002 90
91 -0.09883054 91
92 -1.99874326 92
93 -0.77896124 93
94 -0.05878099 94
95 0.82023492 95
96 2.29944232 96
97 -2.24368129 97
98 1.39608682 98
99 -0.61909894 99
100 0.74170204 100
答案 0 :(得分:1)
以下是使用dplyr结合replicate和lapply的一种可行方法:
#' Difference between the group means of `x` in one sample.
#'
#' @param df A data frame with a numeric column `x` and a column `groups`
#'   holding the labels "GroupA" and "GroupB".
#' @return A one-row data frame with the column `difference.mean.x`: the mean
#'   of `x` in GroupB minus the mean of `x` in GroupA. This is the same
#'   direction as `mean.x[2] - mean.x[1]` in the single-sample pipeline.
my_func <- function(df) {
  df %>%
    summarise(
      difference.mean.x = mean(x[groups == "GroupB"]) -
        mean(x[groups == "GroupA"])
    )
}
# Draw 100 independent samples of 100 rows each from df (kept as a list),
# apply my_func() to every sample, stack the one-row results and number the
# replicates, all in a single pipe. The index column is called `.replicate`
# to match the requested layout of df_difference_means_100.
replicate(100, sample_n(df, 100), simplify = FALSE) %>%
  lapply(my_func) %>%
  bind_rows() %>%
  mutate(.replicate = seq_len(n()))
#Source: local data frame [100 x 2]
#
# difference.mean.x .replicate
#1 0.2531246 1
#2 -0.1595892 2
#3 0.1759745 3
#4 -0.1119139 4
#5 -0.1332090 5
#6 -0.8790818 6
#7 0.2170683 7
#8 -0.3484234 8
#9 0.2238635 9
#10 -0.4445486 10
#.. ... ...
答案 1 :(得分:1)
这是将所有逻辑放在单个dplyr管道中的一种方法,但代价是需要在开始时将df复制100次:
set.seed(111)
# Build 100 copies of df, each tagged with its replicate number, and stack
# them into one long data frame.
rep_df <- lapply(seq_len(100), function(i) {
  df[["replicate"]] <- i
  df
})
rep_df <- do.call(rbind, rep_df)
# Within each replicate draw 100 rows and average x per group. The first
# summarize() drops the `groups` grouping level, so the second one runs once
# per replicate and takes the difference of the two group means.
rep_df <- rep_df %>%
  group_by(replicate) %>%
  sample_n(100) %>%
  group_by(replicate, groups) %>%
  summarize(mean_x = mean(x)) %>%
  summarize(mean_x_group_diff = diff(mean_x))
str(rep_df)
PS:您也可以在lapply调用中使用类似的管道。这种写法更紧凑,也不需要将df复制100次,但可读性可能较差:
set.seed(111)
# For each of the 100 replicates: draw 100 rows from df, average x per group,
# take the difference of the two group means and tag the row with the
# replicate number. The index is named `i` so it does not shadow base::rep().
output <- lapply(seq_len(100), function(i) {
  sample_n(df, 100) %>%
    group_by(groups) %>%
    summarize(mean_x = mean(x)) %>%
    summarize(mean_x_group_diff = diff(mean_x)) %>%
    mutate(replicate = i)
})
# Stack the 100 one-row results into a single data frame.
rep_df <- bind_rows(output)
str(rep_df)
答案 2 :(得分:0)
您可以使用下面这个对dplyr友好的辅助函数:
#' Repeat every row of a data frame `n` times (a dplyr-friendly helper).
#'
#' @param df A data frame.
#' @param n Number of times the full set of rows is repeated.
#' @return `df` with its rows stacked `n` times in block order (all rows,
#'   then all rows again, and so on).
row_rep <- function(df, n) {
  # seq_len() gives an empty index for a 0-row input; `1:nrow(df)` would be
  # c(1, 0) there and select spurious all-NA rows.
  # drop = FALSE keeps a one-column input a data frame instead of a vector.
  df[rep(seq_len(nrow(df)), times = n), , drop = FALSE]
}