我正在尝试使用left_join
和nest
计算分箱数据的平均值。
bin.size = 100
第一个数据帧:
df = data.frame(x =c(300,400),
y = c("sca1","sca2"))
x y
1 300 sca1
2 400 sca2
第二个数据帧:
df2 = data.frame(snp = c(1,2,10,100,1,2,14,16,399),
sca = c("sca1","sca1","sca1","sca1","sca2","sca2","sca2","sca2","sca2"))
snp r2 sca
1 1 0.70 sca1
2 2 0.80 sca1
3 10 0.70 sca1
4 100 0.10 sca1
5 1 0.90 sca2
6 2 0.98 sca2
7 14 0.80 sca2
8 16 0.80 sca2
9 399 0.01 sca2
来自@ r2evans的代码:
output_bin_LD = df %>%
left_join(nest(df2, snp, .key = "snp"), by = c("y" = "sca")) %>%
mutate(
cuts = map(x, ~ seq(0, ., by = bin.size)),
tbls = pmap(
.l = list(snp, cuts),
.f = function(xx, breaks) {
z <- table(cut(xx$snp, breaks))
data_frame(cut = names(z), count = z)
}
)
) %>%
select(y, tbls) %>%
unnest()
此code正在执行此操作:
y cut count
1 sca1 (0,100] 4
2 sca1 (100,200] 0
3 sca1 (200,300] 0
4 sca2 (0,100] 4
5 sca2 (100,200] 0
6 sca2 (200,300] 0
7 sca2 (300,400] 1
最终目标是
y cut count mean
1 sca1 (0,100] 4 0.575
2 sca1 (100,200] 0 0
3 sca1 (200,300] 0 0
4 sca2 (0,100] 4 0.87
5 sca2 (100,200] 0 0
6 sca2 (200,300] 0 0
7 sca2 (300,400] 1 399
到目前为止,我已经尝试过这个:
df %>%
left_join(nest(df2, snp, r2, .key = "snp"),
by = c("y" = "sca")) %>%
mutate(
cuts = map(x, ~ seq(0, ., by = 100)),
tbls = pmap(
.l = list(snp, cuts),
.f = function(xx, breaks) {
z <- table(cut(xx$snp, breaks))
a <- mean(cut(xx$r2, breaks))
data_frame(cut = names(z), count = z, mean = a)
} # .f
) # closing pmap
) %>% # mutate
select(y, tbls) %>%
unnest()
但是它输出了我NA
和一条警告信息:
y cut count mean
1 sca1 (0,100] 4 NA
2 sca1 (100,200] 0 NA
3 sca1 (200,300] 0 NA
4 sca2 (0,100] 4 NA
5 sca2 (100,200] 0 NA
6 sca2 (200,300] 0 NA
7 sca2 (300,400] 1 NA
Warning messages:
1: In mean.default(cut(xx$r2, breaks)) :
argument is not numeric or logical: returning NA
2: In mean.default(cut(xx$r2, breaks)) :
argument is not numeric or logical: returning NA
我该如何解决这个问题?我是否需要将桌子加倍?
答案 0 :(得分:3)
这里是一个整齐的选项,它在dplyr上比purrr更重,这使得它更具可读性:
library(tidyverse)
df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%
summarise(count = n(), # For each group, count rows
mean = mean(r2)) %>% # and calculate mean
# Add rows for every level of the cuts. Fill new rows with zeros.
complete(cut, fill = list(count = 0L, mean = 0)) %>%
separate(cut, c('from', 'to'), sep = ',') %>% # Split cut into two numbers
mutate_at(vars(from:to), parse_number) %>% # Extract numbers from strings
left_join(df, c(sca = 'y')) %>% # Join to get x value for each group
filter(to <= x) # Subset to rows where the max cut is within the range.
#> Source: local data frame [7 x 6]
#> Groups: sca [2]
#>
#> sca from to count mean x
#> <chr> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 sca1 0 100 4 0.575 300
#> 2 sca1 100 200 0 0.000 300
#> 3 sca1 200 300 0 0.000 300
#> 4 sca2 0 100 4 0.870 400
#> 5 sca2 100 200 0 0.000 400
#> 6 sca2 200 300 0 0.000 400
#> 7 sca2 300 400 1 0.010 400
你可以通过一点正则表达式和子集化来实际避免连接和乱码:
df2 %>% group_by(sca, cut = cut(snp, seq(0, max(df$x), bin.size))) %>%
summarise(count = n(),
mean = mean(r2)) %>%
complete(cut, fill = list(count = 0L, mean = 0)) %>%
filter(as.integer(gsub('.*,(\\d+).*', '\\1', cut)) <= df$x[unique(sca) == df$y])
#> Source: local data frame [7 x 4]
#> Groups: sca [2]
#>
#> sca cut count mean
#> <chr> <fctr> <int> <dbl>
#> 1 sca1 (0,100] 4 0.575
#> 2 sca1 (100,200] 0 0.000
#> 3 sca1 (200,300] 0 0.000
#> 4 sca2 (0,100] 4 0.870
#> 5 sca2 (100,200] 0 0.000
#> 6 sca2 (200,300] 0 0.000
#> 7 sca2 (300,400] 1 0.010
df <- structure(list(x = c(300, 400), y = c("sca1", "sca2")), .Names = c("x",
"y"), row.names = c(NA, -2L), class = "data.frame")
df2 <- structure(list(snp = c(1L, 2L, 10L, 100L, 1L, 2L, 14L, 16L, 399L
), r2 = c(0.7, 0.8, 0.7, 0.1, 0.9, 0.98, 0.8, 0.8, 0.01), sca = c("sca1",
"sca1", "sca1", "sca1", "sca2", "sca2", "sca2", "sca2", "sca2"
)), .Names = c("snp", "r2", "sca"), row.names = c(NA, -9L), class = "data.frame")
答案 1 :(得分:1)
不确定你的方法,但这里有一个稍微简单的方法..使用product_id
包,如果你有兴趣的话。您需要最新版本(目前为1.10.0)才能生效(因为这是一项新功能)。
data.table
对于require(data.table) ## v1.9.8+
and <- b[a, on=.(sca=y, snp>start, snp<=end), ## 1
.(count=.N, mean=mean(r2, na.rm=TRUE)), ## 2
by=.EACHI] ## 3
中的每一行,找到a
中匹配的行索引,同时匹配提供给b
参数的条件。
on
== length(matching row indices)
给出.N
,count
给出匹配索引的mean()
平均值。
r2
中的每一行都会运行(2)
中的表达式。
其中,a
是:
a