使用dplyr重复采样data.frame

时间:2015-02-20 12:44:32

标签: r vector dataframe dplyr

我有一个data.frame:

# Simulated example data: 1000 draws from a normal distribution with mean 3
# and sd 2; the first 500 rows are labelled "GroupA", the last 500 "GroupB".
x <- rnorm(n = 1000, mean = 3, sd = 2)
groups <- rep(c("GroupA", "GroupB"), times = c(500, 500))
df <- data.frame(x = x, groups = groups)

使用dplyr,我可以从df中抽样100行,然后计算GroupA和GroupB均值之间的差异:

# Draw 100 random rows of df, average x within each group, and store the
# difference of the two group means as a one-row data.frame tagged as
# replicate 1.
df_difference_means <- df %>%
  # sample_n() replaces the deprecated add_rownames() + filter() detour and
  # no longer hard-codes the number of rows of df (previously 1:1000).
  sample_n(100) %>%
  group_by(groups) %>%
  summarise(mean.x = mean(x)) %>%
  # The grouped summary is ordered by group label, so mean.x[1] belongs to
  # GroupA and mean.x[2] to GroupB: the result is GroupB - GroupA.
  summarise(difference.mean.x = mean.x[2] - mean.x[1]) %>%
  mutate(.replicate = 1) %>%
  as.data.frame()

  difference.mean.x .replicate
1        -0.7258672          1

如何使用dplyr将此过程重复100次,并以data.frame的形式输出结果?结果data.frame应该类似于df_difference_means_100:

# Mock-up of the desired result: one row per replicate, holding a randomly
# generated stand-in for the difference in means and the replicate number.
difference.mean.x <- rnorm(n = 100, mean = -0.72, sd = 2)
.replicate <- seq_len(100)
df_difference_means_100 <- data.frame(
  difference.mean.x = difference.mean.x,
  .replicate = .replicate
)

df_difference_means_100

    difference.mean.x .replicate
1         -1.74745341          1
2         -1.60671744          2
3         -0.73216685          3
4          2.53595482          4
5         -2.13187162          5
6          0.42921334          6
7         -1.23031115          7
8          2.66900128          8
9         -0.26267355          9
10         0.97573805         10
11         4.38242693         11
12        -2.09175166         12
13         1.17403184         13
14         0.77553541         14
15        -3.61322099         15
16         1.85055915         16
17         0.06395296         17
18        -1.42459781         18
19         2.90383461         19
20        -1.79359430         20
21        -0.43856161         21
22         1.81433832         22
23         3.15741676         23
24        -1.14643453         24
25        -2.14220126         25
26        -0.32972133         26
27        -0.27037302         27
28         2.20310891         28
29         3.05937838         29
30         0.11348566         30
31         0.09080867         31
32        -2.11559132         32
33        -0.50134470         33
34         0.31628255         34
35         0.96801232         35
36         3.42165046         36
37         2.47089399         37
38        -1.34196912         38
39        -1.11181326         39
40        -3.48664556         40
41        -2.49013457         41
42         3.67952537         42
43        -3.80781570         43
44         0.68793508         44
45         0.05869912         45
46         5.25205269         46
47        -3.00920009         47
48        -2.48109066         48
49        -0.22790952         49
50         1.41952375         50
51         0.79675613         51
52         1.13585093         52
53         0.63646903         53
54         0.56779986         54
55        -1.48099201         55
56        -0.24586261         56
57         3.16075196         57
58        -0.55765459         58
59         1.78498217         59
60         3.38490948         60
61        -0.09666898         61
62        -2.38897557         62
63        -0.50976285         63
64         4.25219676         64
65        -1.57526334         65
66         0.58006652         66
67         0.89549514         67
68        -0.17842015         68
69        -2.57422568         69
70         4.14008849         70
71        -3.48424762         71
72        -3.48788857         72
73        -4.22862573         73
74         1.98098272         74
75         0.73889898         75
76        -2.78759887         76
77        -0.75359051         77
78        -0.24062074         78
79        -0.39441863         79
80        -0.58710463         80
81        -2.95208480         81
82        -0.18225793         82
83         0.98356501         83
84         0.77963590         84
85        -1.21736133         85
86         1.36733389         86
87        -0.41273956         87
88         4.58347146         88
89         0.37946472         89
90        -5.02405002         90
91        -0.09883054         91
92        -1.99874326         92
93        -0.77896124         93
94        -0.05878099         94
95         0.82023492         95
96         2.29944232         96
97        -2.24368129         97
98         1.39608682         98
99        -0.61909894         99
100        0.74170204        100

3 个答案:

答案 0 :(得分:1)

以下是使用dplyr结合replicate和lapply的一种可行方法:

# Custom function: difference in mean x between the two groups of one sample.
#
# df: data frame with a numeric column `x` and a column `groups` containing
#     the labels "GroupA" and "GroupB".
# Returns a one-row data frame with the single column `difference.mean.x`,
# computed as mean(GroupB) - mean(GroupA). This is the same sign convention
# as `mean.x[2] - mean.x[1]` used for df_difference_means; subtracting in
# the opposite order would flip the sign of every replicate.
my_func <- function(df) {
  df %>%
    summarise(
      difference.mean.x = mean(x[groups == "GroupB"]) -
        mean(x[groups == "GroupA"])
    )
}

# Draw 100 independent samples of 100 rows of df and keep them as a list,
# reduce every sample to one row with my_func(), stack those rows into a
# single table and number them, all in one pipe:

replicate(100, sample_n(df, 100), simplify = FALSE) %>%
  lapply(my_func) %>%
  bind_rows() %>%
  mutate(replicate = seq_len(n()))

#Source: local data frame [100 x 2]
#
#   difference.mean.x replicate
#1          0.2531246         1
#2         -0.1595892         2
#3          0.1759745         3
#4         -0.1119139         4
#5         -0.1332090         5
#6         -0.8790818         6
#7          0.2170683         7
#8         -0.3484234         8
#9          0.2238635         9
#10        -0.4445486        10
#..               ...       ...

答案 1 :(得分:1)

这是将所有逻辑放在单个dplyr管道中的一种方法,但它的代价是在开始时复制df 100次:

# Fix the RNG seed so the sampled rows are reproducible.
set.seed(111)

# Build 100 copies of df, each tagged with its replicate number, and stack
# them into one long table. The index is called `i` rather than `rep` so it
# does not shadow base::rep().
rep_df <- lapply(seq_len(100), function(i) {
  df[["replicate"]] <- i
  df
})
rep_df <- bind_rows(rep_df)

# Within every replicate: sample 100 rows, average x per group, then take
# the difference of the two group means. The first summarize() drops the
# `groups` level of the grouping, so the second one collapses each replicate
# to a single row. rep_df is deliberately overwritten with the result.
rep_df <- rep_df %>%
  group_by(replicate) %>%
  sample_n(100) %>%
  group_by(replicate, groups) %>%
  summarize(mean_x = mean(x)) %>%
  summarize(mean_x_group_diff = diff(mean_x))
str(rep_df)

PS:您也可以在lapply调用中使用类似的管道。这种写法更紧凑,不会将df复制100次,但可读性可能较差:

# Fix the RNG seed so the sampled rows are reproducible.
set.seed(111)

# One pass per replicate: sample 100 rows of df, average x within each group,
# take the difference of the group means and tag the row with the replicate
# index. The per-replicate one-row results are then stacked with rbind().
output <- lapply(seq_len(100), function(i) {
  group_means <- df %>%
    sample_n(100) %>%
    group_by(groups) %>%
    summarize(mean_x = mean(x))

  group_means %>%
    summarize(mean_x_group_diff = diff(mean_x)) %>%
    mutate(replicate = i)
})
rep_df <- do.call(rbind, output)
str(rep_df)

答案 2 :(得分:0)

您可以使用下面的函数创建一个对dplyr友好的辅助函数:

# Stack `n` copies of the data frame `df` on top of each other, i.e. repeat
# the row sequence 1..nrow(df) `n` times.
#
# df: a data frame.
# n:  number of copies; 0 yields a zero-row data frame.
# Returns a data frame with nrow(df) * n rows and the same columns as `df`.
#
# seq_len() keeps a zero-row input empty (1:nrow(df) would expand to c(1, 0)
# and index a non-existent row, producing NA rows), and drop = FALSE keeps a
# one-column input a data frame instead of collapsing it to a vector.
row_rep <- function(df, n) {
  df[rep(seq_len(nrow(df)), times = n), , drop = FALSE]
}

来自https://gist.github.com/mdlincoln/528a53939538b07ade86