如何使用left_join和嵌套在R中计算不同类别的均值?

时间:2017-01-19 21:44:36

标签: r left-join dplyr tidyr

我正在尝试使用left_joinnest计算分箱数据的平均值。

bin.size = 100 

第一个数据帧:

df = data.frame(x =c(300,400), 
                y = c("sca1","sca2"))
    x    y
1 300 sca1
2 400 sca2

第二个数据帧:

df2 = data.frame(snp = c(1,2,10,100,1,2,14,16,399), 
                 sca = c("sca1","sca1","sca1","sca1","sca2","sca2","sca2","sca2","sca2"))

      snp   r2  sca
1   1 0.70 sca1
2   2 0.80 sca1
3  10 0.70 sca1
4 100 0.10 sca1
5   1 0.90 sca2
6   2 0.98 sca2
7  14 0.80 sca2
8  16 0.80 sca2
9 399 0.01 sca2

来自@ r2evans的代码:

output_bin_LD = df %>%
  left_join(nest(df2, snp, .key = "snp"), by = c("y" = "sca")) %>%
  mutate(
    cuts = map(x, ~ seq(0, ., by = bin.size)),
    tbls = pmap(
      .l = list(snp, cuts),
      .f = function(xx, breaks) {
        z <- table(cut(xx$snp, breaks))
        data_frame(cut = names(z), count = z)
      }
    )
  ) %>%
  select(y, tbls) %>%
  unnest()

code正在执行此操作:

     y       cut count
1 sca1   (0,100]     4
2 sca1 (100,200]     0
3 sca1 (200,300]     0
4 sca2   (0,100]     4
5 sca2 (100,200]     0
6 sca2 (200,300]     0
7 sca2 (300,400]     1

最终目标是

     y       cut count  mean
1 sca1   (0,100]     4 0.575
2 sca1 (100,200]     0     0
3 sca1 (200,300]     0     0
4 sca2   (0,100]     4  0.87
5 sca2 (100,200]     0     0
6 sca2 (200,300]     0     0
7 sca2 (300,400]     1   399

到目前为止,我已经尝试过这个:

df %>%
  left_join(nest(df2, snp, r2, .key = "snp"), 
            by = c("y" = "sca")) %>%
  mutate(
    cuts = map(x, ~ seq(0, ., by = 100)),
    tbls = pmap(
      .l = list(snp, cuts),
      .f = function(xx, breaks) {
        z <- table(cut(xx$snp, breaks))
        a <- mean(cut(xx$r2, breaks))
        data_frame(cut = names(z), count = z, mean = a)
      } # .f 
    ) # closing pmap
  ) %>% # mutate
  select(y, tbls) %>%
  unnest()

但是它输出了我NA和一条警告信息:

     y       cut count mean
1 sca1   (0,100]     4   NA
2 sca1 (100,200]     0   NA
3 sca1 (200,300]     0   NA
4 sca2   (0,100]     4   NA
5 sca2 (100,200]     0   NA
6 sca2 (200,300]     0   NA
7 sca2 (300,400]     1   NA
Warning messages:
1: In mean.default(cut(xx$r2, breaks)) :
  argument is not numeric or logical: returning NA
2: In mean.default(cut(xx$r2, breaks)) :
  argument is not numeric or logical: returning NA

我该如何解决这个问题?我是否需要将桌子加倍?

2 个答案:

答案 0 :(得分:3)

这里是一个整齐的选项,它在dplyr上比purrr更重,这使得它更具可读性:

library(tidyverse)

df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%   
    summarise(count = n(),    # For each group, count rows
              mean = mean(r2)) %>%    # and calculate mean
    # Add rows for every level of the cuts. Fill new rows with zeros.
    complete(cut, fill = list(count = 0L, mean = 0)) %>% 
    separate(cut, c('from', 'to'), sep = ',') %>%    # Split cut into two numbers
    mutate_at(vars(from:to), parse_number) %>%    # Extract numbers from strings
    left_join(df, c(sca = 'y')) %>%    # Join to get x value for each group
    filter(to <= x)    # Subset to rows where the max cut is within the range.

#> Source: local data frame [7 x 6]
#> Groups: sca [2]
#> 
#>     sca  from    to count  mean     x
#>   <chr> <dbl> <dbl> <int> <dbl> <dbl>
#> 1  sca1     0   100     4 0.575   300
#> 2  sca1   100   200     0 0.000   300
#> 3  sca1   200   300     0 0.000   300
#> 4  sca2     0   100     4 0.870   400
#> 5  sca2   100   200     0 0.000   400
#> 6  sca2   200   300     0 0.000   400
#> 7  sca2   300   400     1 0.010   400

你可以通过一点正则表达式和子集化来实际避免连接和乱码:

df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>% 
    summarise(count = n(), 
              mean = mean(r2)) %>% 
    complete(cut, fill = list(count = 0L, mean = 0)) %>% 
    filter(as.integer(gsub('.*,(\\d+).*', '\\1', cut)) <= df$x[unique(sca) == df$y])

#> Source: local data frame [7 x 4]
#> Groups: sca [2]
#> 
#>     sca       cut count  mean
#>   <chr>    <fctr> <int> <dbl>
#> 1  sca1   (0,100]     4 0.575
#> 2  sca1 (100,200]     0 0.000
#> 3  sca1 (200,300]     0 0.000
#> 4  sca2   (0,100]     4 0.870
#> 5  sca2 (100,200]     0 0.000
#> 6  sca2 (200,300]     0 0.000
#> 7  sca2 (300,400]     1 0.010

数据

df <- structure(list(x = c(300, 400), y = c("sca1", "sca2")), .Names = c("x", 
    "y"), row.names = c(NA, -2L), class = "data.frame")

df2 <- structure(list(snp = c(1L, 2L, 10L, 100L, 1L, 2L, 14L, 16L, 399L
    ), r2 = c(0.7, 0.8, 0.7, 0.1, 0.9, 0.98, 0.8, 0.8, 0.01), sca = c("sca1", 
    "sca1", "sca1", "sca1", "sca2", "sca2", "sca2", "sca2", "sca2"
    )), .Names = c("snp", "r2", "sca"), row.names = c(NA, -9L), class = "data.frame")

答案 1 :(得分:1)

不确定你的方法,但这里有一个稍微简单的方法..使用product_id包,如果你有兴趣的话。您需要最新版本(目前为1.10.0)才能生效(因为这是一项新功能)。

data.table
  1. 对于require(data.table) ## v1.9.8+ and <- b[a, on=.(sca=y, snp>start, snp<=end), ## 1 .(count=.N, mean=mean(r2, na.rm=TRUE)), ## 2 by=.EACHI] ## 3 中的每一行,找到a中匹配的行索引,同时匹配提供给b参数的条件。

  2. on == length(matching row indices)给出.Ncount给出匹配索引的mean()平均值。

  3. r2中的每一行都会运行(2)中的表达式。

  4. 其中,a是:

    a