基准测试直到现在

Question

我正在尝试使用ddply查找两个位置pos之间的最小距离在两个数据框中相应的色度相同：

head(bps, 10)
   chrom pos iteration
1      1   4         1
2      1  14         1
3      1  68         1
4      1  79         1
5      1 200         1
6      1 205         1
7      1 270         1
8      1 304         1
9      2   7         1
10     2  13         1

head(flocs)
  chrom pos
1     1 100
2     1 200
3     1 220
4     1 312
5     2 500
6     2 501

例如，对于bps中的第一行，我想在pos中找到最接近的flocs，其中chrom = 1，值为-96。

我要尝试执行的伪代码是：

foreach iteration (bps$iteration):
  foreach chrom (bps$chrom):
    foreach pos (bps$pos):
      features_pos = pos in dataframe flocs closest to pos on the same chromosome
      min_dist     = feature_pos - pos

      return features_pos, min_dist

我正在尝试使用ddply来做到这一点：

minDists <- ddply(bp_data, c("chrom", "pos"), function(x) {
    index <-  which.min(abs(flocs$pos[which(flocs$chrom==x$chrom)] - x$pos))
    closestMotif <- flocs$pos[index]
    chrom <- as.character(flocs$chrom[index])
    dist <- (x$pos - closestMotif)
    data.frame(features_pos = closestMotif, pos = x$pos, min_dist = dist, feature = feature)
  })

但这并不限制与同一条染色体的比较：

head(minDists, 10)

   chrom features_pos pos min_dist  feature
1      1          100   4      -96 feature1
2      1          100  14      -86 feature1
3      1          100  68      -32 feature1
4      1          100  79      -21 feature1
5      1          200 200        0 feature1
6      1          200 205        5 feature1
7      1          312 270      -42 feature1
8      1          312 304       -8 feature1
9      2          100   7      -93 feature1 # bps chrom=2, flocs chrom=1
10     2          100  13      -87 feature1 # bps chrom=2, flocs chrom=1

此处的预期输出是：

   chrom features_pos pos min_dist  feature
1      1          100   4      -96 feature1
2      1          100  14      -86 feature1
3      1          100  68      -32 feature1
4      1          100  79      -21 feature1
5      1          200 200        0 feature1
6      1          200 205        5 feature1
7      1          312 270      -42 feature1
8      1          312 304       -8 feature1
9      2          500   7      -493 feature1 # bp1 chrom=2, flocs chrom=2
10     2          500  13      -487 feature1 # bp1 chrom=2, flocs chrom=2

我认为通过提供列c("chrom", "pos")本质上对函数调用执行了group_by。

有什么方法可以改善所写内容，以获得所需的结果？

bps <- structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", "2", "3"
), class = "factor"), pos = c(4L, 14L, 68L, 79L, 200L, 205L, 
270L, 304L, 7L, 13L, 23L, 39L, 100L, 150L, 17L, 55L, 75L, 79L, 
102L, 109L, 123L, 155L, 157L, 200L, 260L, 299L, 300L, 320L, 323L, 
345L, 450L, 550L), iteration = structure(c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor")), row.names = c(NA, 
-32L), class = "data.frame")

flocs <- structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 
3L), .Label = c("1", "2", "3"), class = "factor"), pos = c(100L, 
200L, 220L, 312L, 500L, 501L, 123L, 444L)), row.names = c(NA, 
-8L), class = "data.frame")

Answer 1

data.table使用滚动连接的方法...

更新的答案

（起初忘记了所有的按引用连接，它更快，而且肯定更短；-)）

library( data.table )
#set data as data.table
setDT( bps, key = c("chrom", "pos") )
setDT( flocs, key = c("chrom", "pos") )
#perform by-reference rolling join
bps[, mindist := pos - flocs[bps, x.pos, roll = "nearest"]][]

输出

#     chrom pos iteration mindist
#  1:     1   4         1     -96
#  2:     1  14         1     -86
#  3:     1  68         1     -32
#  4:     1  79         1     -21
#  5:     1 200         1       0
#  6:     1 205         1       5
#  7:     1 270         1     -42
#  8:     1 304         1      -8
#  9:     2   7         1    -493
# 10:     2  13         1    -487
# 11:     2  23         1    -477
# 12:     2  39         1    -461
# 13:     2 100         1    -400
# 14:     2 150         1    -350
# 15:     3  17         1    -106
# 16:     3  55         1     -68
# 17:     3  75         1     -48
# 18:     3  79         1     -44
# 19:     3 102         1     -21
# 20:     3 109         1     -14
# 21:     3 123         1       0
# 22:     3 155         1      32
# 23:     3 157         1      34
# 24:     3 200         1      77
# 25:     3 260         1     137
# 26:     3 299         1    -145
# 27:     3 300         1    -144
# 28:     3 320         1    -124
# 29:     3 323         1    -121
# 30:     3 345         1     -99
# 31:     3 450         1       6
# 32:     3 550         1     106
#     chrom pos iteration mindist

基准测试直到现在

# Unit: milliseconds
#              expr       min        lq      mean    median        uq       max neval
#        Ronak_base  2.355879  2.555768  2.973069  2.626415  2.773581  8.016016   100
# Wimpel_data.table  1.697921  2.035788  2.416199  2.209616  2.361001 17.724528   100
#   Pawel_tidyverse 14.845354 15.310505 16.333158 15.814819 16.541618 24.077871   100


microbenchmark::microbenchmark(
  Ronak_base = {
    bps$min_dist <- unlist(mapply(return_min_value, unique(bps$chrom), split(bps$pos, bps$chrom)))
  },
  Wimpel_data.table = {
    setDT( bps, key = c("chrom", "pos") )
    setDT( flocs, key = c("chrom", "pos") )
    #perform by-reference rolling join
    bps[, mindist := pos - flocs[bps, x.pos, roll = "nearest"]][]
  },
  Pawel_tidyverse = {
    bps %>%
      select(-iteration) %>%
      unite('bps') %>%
      crossing(flocs %>% unite('flocks')) %>%
      separate(bps, c('chrom_bps', 'pos')) %>%
      separate(flocks, c('chrom_flocks', 'features_pos')) %>%
      filter(chrom_bps == chrom_flocks) %>%
      select(-chrom_flocks) %>%
      rename_at(1, ~'chrom') %>%
      mutate_all(as.numeric) %>%
      mutate(min_dist = pos - features_pos) %>%
      group_by(chrom, pos) %>%
      filter(abs(min_dist) == min(abs(min_dist)))
  }
)

好像我的数据表答案和Ronak Shah的答案非常接近。我相信，当数据集变得更大时，data.table将获得明显的优势（但我尚未测试）。

Answer 2

我的基本R尝试通过创建辅助函数（return_min_value）。此函数子集flocs基于当前chrom，然后从pos中减去最小值后返回最小值。我们基于split pos chrom列，并将这些值以及unique中的chrom mapply值传递给return_min_value函数。

return_min_value <- function(x, y) {
   sapply(y, function(p) {
     vals = p - flocs$pos[flocs$chrom == x]
     vals[which.min(abs(vals))]
  })
}

bps$min_dist <- unlist(mapply(return_min_value,
                       unique(bps$chrom), split(bps$pos, bps$chrom)))


bps
#   chrom pos iteration min_dist
#1      1   4         1      -96
#2      1  14         1      -86
#3      1  68         1      -32
#4      1  79         1      -21
#5      1 200         1        0
#6      1 205         1        5
#7      1 270         1      -42
#8      1 304         1       -8
#9      2   7         1     -493
#10     2  13         1     -487
#...

Answer 3

检查此解决方案：

library(tidyverse)

bps %>%
  select(-iteration) %>%
  unite('bps') %>%
  crossing(flocs %>% unite('flocks')) %>%
  separate(bps, c('chrom_bps', 'pos')) %>%
  separate(flocks, c('chrom_flocks', 'features_pos')) %>%
  filter(chrom_bps == chrom_flocks) %>%
  select(-chrom_flocks) %>%
  rename_at(1, ~'chrom') %>%
  mutate_all(as.numeric) %>%
  mutate(min_dist = pos - features_pos) %>%
  group_by(chrom, pos) %>%
  filter(abs(min_dist) == min(abs(min_dist)))

输出：

  chrom   pos features_pos min_dist
   <dbl> <dbl>        <dbl>    <dbl>
 1     1     4          100      -96
 2     1    14          100      -86
 3     1    68          100      -32
 4     1    79          100      -21
 5     1   200          200        0
 6     1   205          200        5
 7     1   270          312      -42
 8     1   304          312       -8
 9     2     7          500     -493
10     2    13          500     -487
# ... with 22 more rows

ddply使用“ group_by”逻辑

3 个答案:

基准测试直到现在