比较多行数据(快速)

时间:2017-12-13 18:17:19

标签: r data.table

如何快速比较成对的数据行?下面是一个最小可运行示例(MWE),但我的实际数据有近10,000行(amount.of.baskets),这种写法太慢了。

MWE计算从一个篮子到另一个篮子需要添加或减少的水果数量。

library(data.table)
# Reproducible toy data: one row per basket, one column per fruit, each count
# sampled from 0-9 with replacement.
set.seed(123)
amount.of.baskets <- 10
fruit_names <- c(
  "apples", "oranges", "pears", "bananas", "pineapples",
  "avocados", "nectarines", "limes", "cherries", "melons"
)
# lapply() visits the names in order, so the random draws are consumed in the
# same sequence as one hand-written sample() call per column.
the.baskets <- as.data.table(
  lapply(
    setNames(nm = fruit_names),
    function(fruit) sample(x = 0:9, size = amount.of.baskets, replace = TRUE)
  )
)

# Build one row per ordered (from, to) basket pair holding, for every fruit,
# the amount that must be added (positive) or removed (negative) to turn the
# "from" basket into the "to" basket.
#
# Rows are collected in a preallocated list and bound once at the end:
# calling rbind() inside the loop copies the whole accumulated table on every
# iteration. seq_len() is used instead of 1:n so that zero baskets produce no
# iterations rather than the sequence c(1, 0).
pair_rows <- vector("list", amount.of.baskets^2)
pair_pos <- 0L
for (from.i in seq_len(amount.of.baskets)) {
  for (to.i in seq_len(amount.of.baskets)) {
    # Negate the "from" row and stack it on the "to" row; the column sums are
    # then to - from for each fruit.
    tmp.i <- rbind((-1) * the.baskets[from.i, ], the.baskets[to.i, ])
    tmp.sum <- data.table(t(colSums(tmp.i)))
    tmp.sum[, c("from.basket", "to.basket") := list(from.i, to.i)]
    pair_pos <- pair_pos + 1L
    pair_rows[[pair_pos]] <- tmp.sum
  }
}
# Single bind of all per-pair rows, in the same order the loop visited them.
basket.diff.table <- rbindlist(pair_rows)

basket.diff.table是所需的输出。

1 个答案:

答案 0 :(得分:3)

要将函数应用于两个向量的所有组合,通常可以使用outer

# For each fruit column, compute every pairwise difference between baskets.
# outer(v, v, "-")[i, j] is v[i] - v[j]; as.vector() flattens that matrix
# column by column, so the first index varies fastest in the result.
outer_diffs <- lapply(
  the.baskets,
  FUN = function(fruit_counts) {
    diff_matrix <- outer(fruit_counts, fruit_counts, FUN = "-")
    as.vector(diff_matrix)
  }
)

str(outer_diffs)
# List of 10
#  $ apples    : int [1:100] 0 5 2 6 7 -2 3 6 3 2 ...
#  $ oranges   : int [1:100] 0 -5 -3 -4 -8 -1 -7 -9 -6 0 ...
#  $ pears     : int [1:100] 0 -2 -2 1 -2 -1 -3 -3 -6 -7 ...
#  $ bananas   : int [1:100] 0 0 -3 -2 -9 -5 -2 -7 -6 -7 ...
#  $ pineapples: int [1:100] 0 3 3 2 0 0 1 3 1 7 ...
#  $ avocados  : int [1:100] 0 4 7 1 5 2 1 7 8 3 ...
#  $ nectarines: int [1:100] 0 -6 -3 -4 2 -2 2 2 1 -2 ...
#  $ limes     : int [1:100] 0 -1 0 -7 -3 -5 -4 -1 -4 -6 ...
#  $ cherries  : int [1:100] 0 4 2 5 -1 2 7 6 6 -1 ...
#  $ melons    : int [1:100] 0 5 2 5 2 0 6 -1 3 4 ...

# Assemble the long-format result: one column per fruit plus the pair of
# basket ids that each row refers to.
basket.diff.table <- as.data.table(outer_diffs)
basket_indices <- seq_len(nrow(the.baskets))
# The flattened differences cycle through the first outer() index fastest, so
# to.basket repeats the whole id sequence while from.basket holds each id for
# one full cycle.
basket.diff.table[, c("from.basket", "to.basket") := list(
  rep(basket_indices, each = length(basket_indices)),
  rep(basket_indices, times = length(basket_indices))
)]

basket.diff.table[1:5]
#    apples oranges pears bananas pineapples avocados nectarines limes cherries melons from.basket to.basket
# 1:      0       0     0       0          0        0          0     0        0      0           1         1
# 2:      5      -5    -2       0          3        4         -6    -1        4      5           1         2
# 3:      2      -3    -2      -3          3        7         -3     0        2      2           1         3
# 4:      6      -4     1      -2          2        1         -4    -7        5      5           1         4
# 5:      7      -8    -2      -9          0        5          2    -3       -1      2           1         5

**更新**

我保留上述内容作为我的答案,因为它更像是对解决方案的逐步演示。但是,正如@Henrik指出的那样,如果把更多操作交给data.table包来完成,这段代码可以更短、更快。他的版本:

# Compact data.table version: build the per-fruit pairwise differences and the
# basket id pairs inside a single j expression.
n <- nrow(the.baskets)
basket.diff.table2 <- the.baskets[, c(
  # One flattened n * n difference vector per fruit column.
  lapply(.SD, function(x) as.vector(outer(x, x, "-"))),
  # CJ() varies its last argument fastest, which lines up with the
  # column-major order produced by as.vector(outer(...)).
  # seq_len() rather than 1:n so that an empty table yields no id pairs
  # instead of the spurious sequence c(1, 0).
  CJ(from.basket = seq_len(n), to.basket = seq_len(n))
)]