Question

我有两个大型数据集存储在具有相同尺寸的列表（L1，L2）中。在每个列表元素L [[。]]中，都有一个数据帧。我想将两个列表按列相乘，然后将L1 [[i]]的所有组合的一些结果相乘。 L2 [[j]]表示 i 和 j 可能取的任何值。

有关一些说明，请参见下面的代码

#Data Generation-----------------------
set.seed(200)
a = matrix(sample(50,16), 4)
b = matrix(sample(50,16), 4)
c = matrix(sample(50,16), 4)
d = matrix(sample(50,16), 4)

#Creating index and binding------------
r = cbind(rep(1,4),a)
s = cbind(rep(2,4),b)
o = as.data.frame(rbind(r,s))
t = cbind(rep(1,4),c)
u = cbind(rep(2,4),d)
p = as.data.frame(rbind(t,u))

#Splitting the data -------------------
list.1 = split(o, o$V1) #o$V1 is the index column
list.2 = split(p, p$V1) #o$V1 is the index column

现在，我想在R中复制Excel函数SUMPRODUCT。

例如，从上面的数据中，我想要一个简单的数据框，如下所示：

#Col1      #Col2
sum(a*c)   sum(b*c)
sum(a*d)   sum(b*d)

问题在于 a ， b ， c 和 d 现在都在列表内。我尝试了 for-loops ，但无济于事。我尝试使用 split-apply-combine 方法，但我也没有成功。

由于两个列表均具有9000个元素，因此我正在寻找一种优化的做到这一点的方法。我该怎么做？

谢谢。

编辑：根据用户R的要求，在每个列表中扩展3个元素的示例

#Data Generation-----------------------
set.seed(200)
a = matrix(sample(50,16), 4)
b = matrix(sample(50,16), 4)
c = matrix(sample(50,16), 4)
d = matrix(sample(50,16), 4)
e = matrix(sample(50,16), 4)
f = matrix(sample(50,16), 4)

#Creating index and binding------------
r = cbind(rep(1,4),a)
s = cbind(rep(2,4),b)
m = cbind(rep(3,4),c)
o = as.data.frame(rbind(r,s,m))

t = cbind(rep(1,4),d)
u = cbind(rep(2,4),e)
v = cbind(rep(3,4),f)
p = as.data.frame(rbind(t,u,v))

#Splitting the data -------------------
list.1 = split(o, o$V1) #o$V1 is the index column
list.2 = split(p, p$V1) #o$V1 is the index column

现在每个列表都有3个元素，每个元素包含一个4x4矩阵。

我要寻找的结果的结构如下：

#Col1      #Col2      #Col3
sum(a*d)   sum(b*d)   sum(c*d)
sum(a*e)   sum(b*e)   sum(c*e)
sum(a*f)   sum(b*f)   sum(c*f)

对于我真正的问题，结果将是一个具有9000列和9000行的数组（矩阵或数据框）。

编辑：按照用户R的建议添加真实数据

useR建议我显示一些真实数据，以便人们知道我的真实数据的样子：

dput(list.1[1:3])

list(`1` = structure(list(vol = c(1425.76, 272.52, 0, 0, 31912.78, 
6056.18, 8212.88, 3909.3, 0, 761.06, 22.45, 237.18), i_1 = c(2819.81, 
4026.72, 827.2, 4790.52, 12218.1, 3632.64, 6308.66, 4076.71, 
2192.98, 952.94, 112.84, 170.97), i_2 = c(2857.88, 2914.34, 761.87, 
4412.4, 11046.36, 2363.24, 7761.31, 5431.03, 1337.62, 857, 103.46, 
110.33), i_3 = c(1389.12, 932.86, 238.51, 5046, 5298.57, 3087.9, 
8746.02, 7129.57, 708.53, 549.1, 86.58, 163.15), i_4 = c(1626.96, 
936.04, 377.81, 4909.62, 6323.5, 2766.49, 3746.06, 2858.07, 900.29, 
975.21, 102.76, 295.1), i_5 = c(1653.05, 1724.74, 321.59, 3937.2, 
6966.48, 2614.67, 3326.99, 2371.44, 1082.43, 970.25, 123.51, 
491.92), i_6 = c(1584.14, 3399.31, 392.24, 3957.88, 8042.5, 2614.46, 
2371.67, 1896.1, 1201.83, 1314.06, 161.23, 892.91)), row.names = c(NA, 
12L), class = "data.frame"), `2` = structure(list(vol = c(10774.34, 
287.53, 0, 0, 57507.79, 10692.91, 9028.38, 10355.78, 8900.38, 
3253.59, 22.45, 219), i_1 = c(5760.16, 4315.77, 585.28, 2886.11, 
23767.55, 3095.39, 6705.94, 6445.96, 10612.49, 2470.32, 126.65, 
143.46), i_2 = c(5035.23, 1785.77, 405.05, 4492.64, 21509.39, 
3654.16, 10203.03, 9505.1, 6628.42, 1298.06, 111.76, 110.13), 
    i_3 = c(2798.54, 1920.72, 464.92, 7916.61, 13628.15, 8365.88, 
    18425.9, 22368.93, 2253.38, 1078.65, 101.34, 134.98), i_4 = c(2344.65, 
    1407.02, 369.7, 2889.69, 7618.72, 2110.01, 4982.27, 2250.94, 
    1744.4, 1033.89, 105.74, 212.26), i_5 = c(1653.05, 1724.74, 
    321.59, 3937.2, 6966.48, 2614.67, 3326.99, 2371.44, 1082.43, 
    970.25, 123.51, 491.92), i_6 = c(1584.14, 3399.31, 392.24, 
    3957.88, 8042.5, 2614.46, 2371.67, 1896.1, 1201.83, 1314.06, 
    161.23, 892.91)), row.names = 13:24, class = "data.frame"), 
    `3` = structure(list(vol = c(850.15, 218.58, 0, 0, 38959.27, 
    3081.31, 3441.35, 2760.54, 0, 2826.8, 0, 34.12), i_1 = c(6048.28, 
    3545.14, 1566.05, 2866.46, 20149.24, 1459.03, 2051.68, 2047.74, 
    5059.57, 3369.86, 129.37, 361.49), i_2 = c(1728.12, 1530.14, 
    364.37, 4761.97, 6934.24, 1802.96, 5394.15, 3972.96, 510.25, 
    989.3, 109.05, 322.37), i_3 = c(1182.59, 750.55, 311.19, 
    5540.96, 4544.96, 2535.07, 8926.35, 7209.61, 423.66, 446.62, 
    92.21, 282.36), i_4 = c(1179.1, 645.18, 283.97, 4616.66, 
    5063.41, 3110.14, 9240.41, 4752.86, 744.85, 648.44, 100.52, 
    311.67), i_5 = c(1653.05, 1724.74, 321.59, 3937.2, 6966.48, 
    2614.67, 3326.99, 2371.44, 1082.43, 970.25, 123.51, 491.92
    ), i_6 = c(1584.14, 3399.31, 392.24, 3957.88, 8042.5, 2614.46, 
    2371.67, 1896.1, 1201.83, 1314.06, 161.23, 892.91)), row.names = 25:36, class = "data.frame"))

dput(list.2[1:3])

list(`1` = structure(list(Index = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,  1L,
1L, 1L, 1L, 1L), itenm1 = c(998399, 998399, 998399, 998399,  998399,
998399, 998399, 998399, 998399, 998399, 998399, 998399 ), j_1 =
c(-261.62831, -605.82802, -190.35225, -802.27542, -835.07636, 
-709.70814, -444.26492, -207.96871, -986.93606, -968.29324, -7675.97567, 
-1271.43424), j_2 = c(0, -188.67302, 0, -799.17034, 0, 247.70379,  0, 0, 1051.71715, -27.94787, 0, -13.34628), j_3 = c(0, 0, 0,  0, 0,
-207.58736, 0, 0, -2333.43115, -1346.57579, 0, -205.13053 ), j_4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), j_5 = c(0, 0,  0, 0, 0, 0, 0,
0, 0, 0, 0, 0), j_6 = c(0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0)),
row.names = c(NA, 12L), class = "data.frame"), 
    `2` = structure(list(Index = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L), itenm1 = c(998398, 998398, 998398, 998398, 
    998398, 998398, 998398, 998398, 998398, 998398, 998398, 998398
    ), j_1 = c(-106.64606, -203.78915, -76.30121, -310.10454, 
    -321.62536, -227.3462, -160.82221, -70.87354, -286.94001, 
    -137.28382, -3779.42484, -604.71574), j_2 = c(0, -96.94433, 
   0, -297.21757, 0, 67.67053, 0, 0, 309.38773, -8.42931, 0, 
   -6.7299), j_3 = c(0, 0, 0, 0, 0, -56.71107, 0, 0, -686.43453, 
   -406.13843, 0, -103.43761), j_4 = c(0, 0, 0, 0, 0, 0, 0, 
   0, 0, 0, 0, 0), j_5 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
   0), j_6 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = 13:24, class = "data.frame"), 
   `3` = structure(list(Index = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 
   3L, 3L, 3L, 3L, 3L), itenm1 = c(998397, 998397, 998397, 998397, 
   998397, 998397, 998397, 998397, 998397, 998397, 998397, 998397
   ), j_1 = c(-238.10472, -543.97414, -71.04739, -756.58841, 
   -782.7918, -667.84871, -424.38314, -193.82405, -638.12855, 
   -319.65804, -6693.88425, -1189.81911), j_2 = c(0, -182.11783, 
   0, -750.99738, 0, 233.80836, 0, 0, 683.61007, -18.48993, 
   0, -11.46144), j_3 = c(0, 0, 0, 0, 0, -195.94234, 0, 0, -1516.71678, 
   -890.87644, 0, -176.16082), j_4 = c(0, 0, 0, 0, 0, 0, 0, 
   0, 0, 0, 0, 0), j_5 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
   0), j_6 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = 25:36, class = "data.frame"))

Answer 1

下面，我提出了三种方法及其基准：mapply，嵌套的for循环以及RcppArmadillo。

数据：

#Data Generation-----------------------
set.seed(200)
a = matrix(sample(50,16), 4)
b = matrix(sample(50,16), 4)
c = matrix(sample(50,16), 4)
d = matrix(sample(50,16), 4)
e = matrix(sample(50,16), 4)
f = matrix(sample(50,16), 4)

#Creating index and binding------------
r = cbind(rep(1,4),a)
s = cbind(rep(2,4),b)
m = cbind(rep(3,4),c)
o = as.data.frame(rbind(r,s,m))

t = cbind(rep(1,4),d)
u = cbind(rep(2,4),e)
v = cbind(rep(3,4),f)
p = as.data.frame(rbind(t,u,v))

#Splitting the data -------------------
list.1 = split(o[,-1], o$V1) #o$V1 is the index column
list.2 = split(p[,-1], p$V1) #o$V1 is the index column

list.1 = lapply(list.1, as.matrix)
list.2 = lapply(list.2, as.matrix)

在这里，我通过先删除id列并将list.1和list.2的每个元素转换为矩阵来作弊。这样做可以提高我们函数的性能。

初始化功能：

# Rcpp -----------------------------------------------
library(Rcpp)
library(RcppArmadillo)

cppFunction(depends = "RcppArmadillo",
"arma::mat sumprod_Rcpp(List x, List y){
  List xlist(x);
  List ylist(y);
  int n = xlist.size();
  arma::mat m(n,n);

  for(int i=0; i<n; i++) {
    for(int j=0; j<n; j++){
      arma::mat xMat = xlist[i];
      arma::mat yMat = ylist[j];
      arma::vec v = arma::vectorise(xMat*yMat);
      m(j,i) = sum(v);
    }
  }
  return(m);
}
"
)

# Nested For -----------------------------------------
sumprod_for <- function(x, y){
  mat <- matrix(NA,length(list.1),length(list.1))

  for(i in 1:length(list.1)){
    for(j in 1:length(list.1)){
      mat[j,i] <- sum(x[[i]] %*% y[[j]])
    }
  }
  return(mat)
}

# Mapply ---------------------------------------------
sumprod_mapply <- function(x, y){
  matrix(mapply(function(j, k){
    sum(x[[j]] %*% y[[k]])
  }, 
  rep(1:length(list.1), each = length(list.1)), 
  rep(1:length(list.1), length(list.1))
  ), 
  length(list.1), 
  length(list.1)
  )
}  

# Ryan's sapply --------------------------------------
sumprod_sapply <- function(x, y){
  sapply(x, function(j){
    lapply(y, function(k) sum(j %*% k))
  })
}

检查输出是否相同：

identical(sumprod_mapply(list.1, list.2), matrix(unlist(sumprod_sapply(list.1, list.2)), length(list.1), length(list.1)))
# [1] TRUE
identical(sumprod_mapply(list.1, list.2), sumprod_for(list.1, list.2))
# [1] TRUE
identical(sumprod_mapply(list.1, list.2), sumprod_Rcpp(list.1, list.2))
# [1] TRUE

sumprod_Rcpp(list.1, list.2)
#       [,1]  [,2]  [,3]
# [1,] 44882 40505 49670
# [2,] 29750 26897 32260
# [3,] 45898 41248 50847

sumprod_for(list.1, list.2)
#       [,1]  [,2]  [,3]
# [1,] 44882 40505 49670
# [2,] 29750 26897 32260
# [3,] 45898 41248 50847

sumprod_mapply(list.1, list.2)
#       [,1]  [,2]  [,3]
# [1,] 44882 40505 49670
# [2,] 29750 26897 32260
# [3,] 45898 41248 50847

sumprod_sapply(list.1, list.2)
#   1     2     3    
# 1 44882 40505 49670
# 2 29750 26897 32260
# 3 45898 41248 50847

基准化：

library(microbenchmark)
microbenchmark(sumprod_mapply(list.1, list.2), 
               sumprod_sapply(list.1, list.2),
               sumprod_for(list.1, list.2),
               sumprod_Rcpp(list.1, list.2),
               times = 10000L)

结果：

Unit: microseconds
                           expr    min     lq      mean median     uq      max neval
 sumprod_mapply(list.1, list.2) 34.345 39.082 51.274501 41.846 62.373 2448.292 10000
 sumprod_sapply(list.1, list.2) 37.108 42.635 56.119414 45.398 67.504 2324.733 10000
    sumprod_for(list.1, list.2) 10.264 13.422 17.685540 15.001 22.502  120.008 10000
   sumprod_Rcpp(list.1, list.2)  2.369  3.948  5.247494  4.738  6.317   88.032 10000

寻找性能提升时，Rcpp的实现不会出错。但是令人惊讶的是，sumprod_for比sumprod_mapply和sumprod_sapply快得多，这可能是由于mapply隐式地将列表输出强制为向量。随时提出更多解决方案，我将它们添加到基准中。

在R？

1 个答案: