我正在尝试使用ddply查找两个位置pos
之间的最小距离在两个数据框中相应的色度相同:
head(bps, 10)
chrom pos iteration
1 1 4 1
2 1 14 1
3 1 68 1
4 1 79 1
5 1 200 1
6 1 205 1
7 1 270 1
8 1 304 1
9 2 7 1
10 2 13 1
head(flocs)
chrom pos
1 1 100
2 1 200
3 1 220
4 1 312
5 2 500
6 2 501
例如,对于bps
中的第一行,我想在pos
中找到最接近的flocs
,其中chrom
= 1
,值为-96。
我要尝试执行的伪代码是:
foreach iteration (bps$iteration):
foreach chrom (bps$chrom):
foreach pos (bps$pos):
features_pos = pos in dataframe flocs closest to pos on the same chromosome
min_dist = feature_pos - pos
return features_pos, min_dist
我正在尝试使用ddply来做到这一点:
minDists <- ddply(bp_data, c("chrom", "pos"), function(x) {
index <- which.min(abs(flocs$pos[which(flocs$chrom==x$chrom)] - x$pos))
closestMotif <- flocs$pos[index]
chrom <- as.character(flocs$chrom[index])
dist <- (x$pos - closestMotif)
data.frame(features_pos = closestMotif, pos = x$pos, min_dist = dist, feature = feature)
})
但这并不限制与同一条染色体的比较:
head(minDists, 10)
chrom features_pos pos min_dist feature
1 1 100 4 -96 feature1
2 1 100 14 -86 feature1
3 1 100 68 -32 feature1
4 1 100 79 -21 feature1
5 1 200 200 0 feature1
6 1 200 205 5 feature1
7 1 312 270 -42 feature1
8 1 312 304 -8 feature1
9 2 100 7 -93 feature1 # bps chrom=2, flocs chrom=1
10 2 100 13 -87 feature1 # bps chrom=2, flocs chrom=1
此处的预期输出是:
chrom features_pos pos min_dist feature
1 1 100 4 -96 feature1
2 1 100 14 -86 feature1
3 1 100 68 -32 feature1
4 1 100 79 -21 feature1
5 1 200 200 0 feature1
6 1 200 205 5 feature1
7 1 312 270 -42 feature1
8 1 312 304 -8 feature1
9 2 500 7 -493 feature1 # bp1 chrom=2, flocs chrom=2
10 2 500 13 -487 feature1 # bp1 chrom=2, flocs chrom=2
我认为通过提供列c("chrom", "pos")
本质上对函数调用执行了group_by
。
有什么方法可以改善所写内容,以获得所需的结果?
bps <- structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", "2", "3"
), class = "factor"), pos = c(4L, 14L, 68L, 79L, 200L, 205L,
270L, 304L, 7L, 13L, 23L, 39L, 100L, 150L, 17L, 55L, 75L, 79L,
102L, 109L, 123L, 155L, 157L, 200L, 260L, 299L, 300L, 320L, 323L,
345L, 450L, 550L), iteration = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor")), row.names = c(NA,
-32L), class = "data.frame")
flocs <- structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 3L,
3L), .Label = c("1", "2", "3"), class = "factor"), pos = c(100L,
200L, 220L, 312L, 500L, 501L, 123L, 444L)), row.names = c(NA,
-8L), class = "data.frame")
答案 0 :(得分:4)
data.table
使用滚动连接的方法...
更新的答案
(起初忘记了所有的按引用连接,它更快,而且肯定更短;-))
library( data.table )
#set data as data.table
setDT( bps, key = c("chrom", "pos") )
setDT( flocs, key = c("chrom", "pos") )
#perform by-reference rolling join
bps[, mindist := pos - flocs[bps, x.pos, roll = "nearest"]][]
输出
# chrom pos iteration mindist
# 1: 1 4 1 -96
# 2: 1 14 1 -86
# 3: 1 68 1 -32
# 4: 1 79 1 -21
# 5: 1 200 1 0
# 6: 1 205 1 5
# 7: 1 270 1 -42
# 8: 1 304 1 -8
# 9: 2 7 1 -493
# 10: 2 13 1 -487
# 11: 2 23 1 -477
# 12: 2 39 1 -461
# 13: 2 100 1 -400
# 14: 2 150 1 -350
# 15: 3 17 1 -106
# 16: 3 55 1 -68
# 17: 3 75 1 -48
# 18: 3 79 1 -44
# 19: 3 102 1 -21
# 20: 3 109 1 -14
# 21: 3 123 1 0
# 22: 3 155 1 32
# 23: 3 157 1 34
# 24: 3 200 1 77
# 25: 3 260 1 137
# 26: 3 299 1 -145
# 27: 3 300 1 -144
# 28: 3 320 1 -124
# 29: 3 323 1 -121
# 30: 3 345 1 -99
# 31: 3 450 1 6
# 32: 3 550 1 106
# chrom pos iteration mindist
# Unit: milliseconds
# expr min lq mean median uq max neval
# Ronak_base 2.355879 2.555768 2.973069 2.626415 2.773581 8.016016 100
# Wimpel_data.table 1.697921 2.035788 2.416199 2.209616 2.361001 17.724528 100
# Pawel_tidyverse 14.845354 15.310505 16.333158 15.814819 16.541618 24.077871 100
microbenchmark::microbenchmark(
Ronak_base = {
bps$min_dist <- unlist(mapply(return_min_value, unique(bps$chrom), split(bps$pos, bps$chrom)))
},
Wimpel_data.table = {
setDT( bps, key = c("chrom", "pos") )
setDT( flocs, key = c("chrom", "pos") )
#perform by-reference rolling join
bps[, mindist := pos - flocs[bps, x.pos, roll = "nearest"]][]
},
Pawel_tidyverse = {
bps %>%
select(-iteration) %>%
unite('bps') %>%
crossing(flocs %>% unite('flocks')) %>%
separate(bps, c('chrom_bps', 'pos')) %>%
separate(flocks, c('chrom_flocks', 'features_pos')) %>%
filter(chrom_bps == chrom_flocks) %>%
select(-chrom_flocks) %>%
rename_at(1, ~'chrom') %>%
mutate_all(as.numeric) %>%
mutate(min_dist = pos - features_pos) %>%
group_by(chrom, pos) %>%
filter(abs(min_dist) == min(abs(min_dist)))
}
)
好像我的数据表答案和Ronak Shah的答案非常接近。我相信,当数据集变得更大时,data.table将获得明显的优势(但我尚未测试)。
答案 1 :(得分:2)
我的基本R尝试通过创建辅助函数(return_min_value
)。此函数子集flocs
基于当前chrom
,然后从pos
中减去最小值后返回最小值。我们基于split
pos
chrom
列,并将这些值以及unique
中的chrom
mapply
值传递给return_min_value
函数。
return_min_value <- function(x, y) {
sapply(y, function(p) {
vals = p - flocs$pos[flocs$chrom == x]
vals[which.min(abs(vals))]
})
}
bps$min_dist <- unlist(mapply(return_min_value,
unique(bps$chrom), split(bps$pos, bps$chrom)))
bps
# chrom pos iteration min_dist
#1 1 4 1 -96
#2 1 14 1 -86
#3 1 68 1 -32
#4 1 79 1 -21
#5 1 200 1 0
#6 1 205 1 5
#7 1 270 1 -42
#8 1 304 1 -8
#9 2 7 1 -493
#10 2 13 1 -487
#...
答案 2 :(得分:0)
检查此解决方案:
library(tidyverse)
bps %>%
select(-iteration) %>%
unite('bps') %>%
crossing(flocs %>% unite('flocks')) %>%
separate(bps, c('chrom_bps', 'pos')) %>%
separate(flocks, c('chrom_flocks', 'features_pos')) %>%
filter(chrom_bps == chrom_flocks) %>%
select(-chrom_flocks) %>%
rename_at(1, ~'chrom') %>%
mutate_all(as.numeric) %>%
mutate(min_dist = pos - features_pos) %>%
group_by(chrom, pos) %>%
filter(abs(min_dist) == min(abs(min_dist)))
输出:
chrom pos features_pos min_dist
<dbl> <dbl> <dbl> <dbl>
1 1 4 100 -96
2 1 14 100 -86
3 1 68 100 -32
4 1 79 100 -21
5 1 200 200 0
6 1 205 200 5
7 1 270 312 -42
8 1 304 312 -8
9 2 7 500 -493
10 2 13 500 -487
# ... with 22 more rows