R-使用数据帧中的所有非零分数作为用sample()替换自身的概率

时间:2018-07-30 10:59:55

标签: r probability

我有一个叫tibble的R my_data,它由(1)个零或(2)个介于零和一之间的数字组成:

> my_data
# A tibble: 30 x 40
      s1    s2    s3    s4    s5    s6    s7    s8    s9   s10   s11   s12   s13
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     0     0     0     0     0     0 0.969     0     0     0     0     0     0
 2     0     0     0     0     0     0 0         0     0     0     0     0     0
 3     0     0     0     0     0     0 0         0     0     0     0     0     0
 4     0     0     0     0     0     0 0         0     0     0     0     0     0
 5     0     0     0     0     0     0 0         0     0     0     0     0     0
 6     0     0     0     0     0     0 0         0     0     0     0     0     0
 7     0     0     0     0     0     0 0         0     0     0     0     0     0
 8     0     0     0     0     0     0 0         0     0     0     0     0     0
 9     0     0     0     0     0     0 0         0     0     0     0     0     0
10     0     0     0     0     0     0 0         0     0     0     0     0     0
# ... with 20 more rows, and 27 more variables: s14 <dbl>, s15 <dbl>, s16 <dbl>,
#   s17 <dbl>, s18 <dbl>, s19 <dbl>, s20 <dbl>, s21 <dbl>, s22 <dbl>, s23 <dbl>,
#   s24 <dbl>, s25 <dbl>, s26 <dbl>, s27 <dbl>, s28 <dbl>, s29 <dbl>, s30 <dbl>,
#   s31 <dbl>, s32 <dbl>, s33 <dbl>, s34 <dbl>, s35 <dbl>, s36 <dbl>, s37 <dbl>,
#   s38 <dbl>, s39 <dbl>, s40 <dbl>

我想以一定的概率将my_data中所有非零数字(例如s7中的0.969)替换为1,而数字本身是概率,否则它们将替换为0。例如,数字 0.969(在名为s7的列中)被替换为1的概率为0.969,而将其替换为0的概率为0.031。

我尝试了这个,但是不起作用:

# Doesn't work:
my_data %>% 
    mutate_all(function(x) {
        case_when(x == 0 ~ 0, 
                  x > 0 ~ sample(0:1, size = 1, prob = c(1 - x, x)))
    })

我该怎么做?我应该使用purrr::map()(如何?)或其他方式?谢谢!

这是dput()中的my_data

structure(list(s1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s2 = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0.956159271283707, 0), s3 = c(0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.982878394164842, 
0, 0, 0, 0, 0.982878394164842), s4 = c(0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0.959674748019852, 0, 0, 0, 0, 0, 0, 0, 
0, 0.959674748019852, 0, 0, 0, 0, 0), s5 = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.876892497722416, 
0, 0, 0, 0, 0, 0.876892497722416, 0, 0), s6 = c(0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0.989641778880238, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0.989641778880238, 0, 0, 0, 0, 0), s7 = c(0.969355168732184, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.969355168732184, 
0, 0, 0, 0, 0, 0, 0, 0), s8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0.991517098892877, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877, 
0, 0, 0, 0.991517098892877, 0.991517098892877), s9 = c(0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618, 0, 0), s10 = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0), s11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.993637560789263, 
0), s12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0.949340969426271, 0, 0, 0.949340969426271, 
0, 0), s13 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918, 0), s14 = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.937896138681889, 
0, 0, 0.937896138681889, 0, 0, 0, 0, 0), s15 = c(0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877), s16 = c(0.956159271283707, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.956159271283707, 0, 0, 
0, 0, 0, 0, 0, 0.956159271283707, 0, 0, 0, 0, 0, 0, 0.956159271283707, 
0.956159271283707), s17 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0.597187792371775, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0.597187792371775, 0), s18 = c(0.975209130375021, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.975209130375021, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.975209130375021), s19 = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0), s20 = c(0.937234650859115, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.937234650859115, 0, 0, 0, 
0, 0, 0.937234650859115, 0, 0, 0, 0, 0, 0, 0, 0), s21 = c(0.929770500656618, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618, 
0, 0, 0, 0, 0, 0.929770500656618, 0, 0, 0, 0, 0, 0, 0.929770500656618, 
0), s22 = c(0.929770500656618, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.929770500656618, 0, 0, 0, 0, 
0, 0, 0, 0.929770500656618), s23 = c(0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0.921445826350068), s24 = c(0.919919910704918, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0), s25 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.993637560789263, 0), 
    s26 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.942968974602997, 0.942968974602997
    ), s27 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.959674748019852, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s28 = c(0.999498946154851, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s29 = c(0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0.988925875658174, 0), s30 = c(0, 0.975209130375021, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0.975209130375021, 0), s31 = c(0.986350500013957, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.986350500013957, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.986350500013957
    ), s32 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.942968974602997, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.942968974602997), 
    s33 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.927760110879459, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s34 = c(0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918, 
    0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918, 0, 0, 0.919919910704918, 
    0, 0), s35 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.956159271283707, 
    0, 0, 0, 0, 0, 0, 0, 0.956159271283707, 0, 0, 0, 0, 0, 0, 
    0.956159271283707, 0), s36 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0.991517098892877, 0, 0.991517098892877, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0.991517098892877, 0.991517098892877, 
    0), s37 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0.919919910704918, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.919919910704918, 
    0), s38 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), s39 = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0.999972102622884, 0, 0, 0, 0, 0), s40 = c(0.942968974602997, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0.942968974602997, 0, 0, 0, 0, 0, 0, 0, 0)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -30L))

4 个答案:

答案 0 :(得分:3)

您正在尝试从二项分布中进行抽样。幸运的是,rbinom关于其prob参数是向量化的,您可以避免任何R循环(forapplyVectorize等)。

m <- as.matrix(DF)
set.seed(42) #for reproducibility
m[m != 0] <- rbinom(sum(m != 0), 1, m[m != 0])

答案 1 :(得分:2)

您可以尝试:

library(tidyverse)
as_tibble(apply(df, c(1,2), function(x) sample(c(0,1),1,prob=c(1-x,x))))

通常不建议从矩阵转换为data.frame,但是在这里看来您确实有一个格式化为data.frame的矩阵,所以我去了。

为避免转换,您可以执行以下操作:

df %>% mutate_all(~ map_dbl(.,~sample(c(0,1),1,prob=c(1-.x,.x))))

以下内容将在采样之前测试该值,但是我不确定它会更快还是更快:

df %>% mutate_all(~ map_if(.,~. != 0, ~sample(c(0,1),1,prob=c(1-.x,.x))) %>% unlist)

答案 2 :(得分:2)

我会使用runif:

df %>%
  map_df(~ if_else(runif(length(.x)) < .x, 1, 0))

答案 3 :(得分:1)

如果您确实想使用自定义功能(通过case_when使用)

df %>% 
  rowwise() %>%
  mutate_all(function(x) {
    case_when(x == 0 ~ 0L, 
              x > 0 ~ sample(0:1, size = 1, prob = c(1 - x, x)))
  })

f = function(x) {
  case_when(x == 0 ~ 0L, 
            x > 0 ~ sample(0:1, size = 1, prob = c(1 - x, x))) }
f = Vectorize(f)

df %>% mutate_all(f)

您的方法遇到两个问题。

1)您的函数未向量化,正在考虑概率的整个列。错误是Error in mutate_impl(.data, dots) : Evaluation error: incorrect number of probabilities.,使用rowwise或向量化函数可以解决此问题。

2)case_when没有返回相同类型的值。错误为Error in mutate_impl(.data, dots) : Evaluation error: must be type double, not integer.使用由0L插入的0将解决此问题。