我有两个大型数据集存储在具有相同尺寸的列表(L1,L2)中。在每个列表元素L [[。]]中,都有一个数据帧。我想将两个列表按列相乘,然后将L1 [[i]]的所有组合的一些结果相乘。 L2 [[j]]表示 i 和 j 可能取的任何值。
有关一些说明,请参见下面的代码
#Data Generation-----------------------
set.seed(200)
a = matrix(sample(50,16), 4)
b = matrix(sample(50,16), 4)
c = matrix(sample(50,16), 4)
d = matrix(sample(50,16), 4)
#Creating index and binding------------
r = cbind(rep(1,4),a)
s = cbind(rep(2,4),b)
o = as.data.frame(rbind(r,s))
t = cbind(rep(1,4),c)
u = cbind(rep(2,4),d)
p = as.data.frame(rbind(t,u))
#Splitting the data -------------------
list.1 = split(o, o$V1) #o$V1 is the index column
list.2 = split(p, p$V1) #o$V1 is the index column
现在,我想在R中复制Excel函数SUMPRODUCT。
例如,从上面的数据中,我想要一个简单的数据框,如下所示:
#Col1 #Col2
sum(a*c) sum(b*c)
sum(a*d) sum(b*d)
问题在于 a , b , c 和 d 现在都在列表内。我尝试了 for-loops ,但无济于事。我尝试使用 split-apply-combine 方法,但我也没有成功。
由于两个列表均具有9000个元素,因此我正在寻找一种优化的 做到这一点的方法。我该怎么做?
谢谢。
编辑:根据用户R的要求,在每个列表中扩展3个元素的示例
#Data Generation-----------------------
set.seed(200)
a = matrix(sample(50,16), 4)
b = matrix(sample(50,16), 4)
c = matrix(sample(50,16), 4)
d = matrix(sample(50,16), 4)
e = matrix(sample(50,16), 4)
f = matrix(sample(50,16), 4)
#Creating index and binding------------
r = cbind(rep(1,4),a)
s = cbind(rep(2,4),b)
m = cbind(rep(3,4),c)
o = as.data.frame(rbind(r,s,m))
t = cbind(rep(1,4),d)
u = cbind(rep(2,4),e)
v = cbind(rep(3,4),f)
p = as.data.frame(rbind(t,u,v))
#Splitting the data -------------------
list.1 = split(o, o$V1) #o$V1 is the index column
list.2 = split(p, p$V1) #o$V1 is the index column
现在每个列表都有3个元素,每个元素包含一个4x4矩阵。
我要寻找的结果的结构如下:
#Col1 #Col2 #Col3
sum(a*d) sum(b*d) sum(c*d)
sum(a*e) sum(b*e) sum(c*e)
sum(a*f) sum(b*f) sum(c*f)
对于我真正的问题,结果将是一个具有9000列和9000行的数组(矩阵或数据框)。
编辑:按照用户R的建议添加真实数据
useR建议我显示一些真实数据,以便人们知道我的真实数据的样子:
dput(list.1[1:3])
list(`1` = structure(list(vol = c(1425.76, 272.52, 0, 0, 31912.78,
6056.18, 8212.88, 3909.3, 0, 761.06, 22.45, 237.18), i_1 = c(2819.81,
4026.72, 827.2, 4790.52, 12218.1, 3632.64, 6308.66, 4076.71,
2192.98, 952.94, 112.84, 170.97), i_2 = c(2857.88, 2914.34, 761.87,
4412.4, 11046.36, 2363.24, 7761.31, 5431.03, 1337.62, 857, 103.46,
110.33), i_3 = c(1389.12, 932.86, 238.51, 5046, 5298.57, 3087.9,
8746.02, 7129.57, 708.53, 549.1, 86.58, 163.15), i_4 = c(1626.96,
936.04, 377.81, 4909.62, 6323.5, 2766.49, 3746.06, 2858.07, 900.29,
975.21, 102.76, 295.1), i_5 = c(1653.05, 1724.74, 321.59, 3937.2,
6966.48, 2614.67, 3326.99, 2371.44, 1082.43, 970.25, 123.51,
491.92), i_6 = c(1584.14, 3399.31, 392.24, 3957.88, 8042.5, 2614.46,
2371.67, 1896.1, 1201.83, 1314.06, 161.23, 892.91)), row.names = c(NA,
12L), class = "data.frame"), `2` = structure(list(vol = c(10774.34,
287.53, 0, 0, 57507.79, 10692.91, 9028.38, 10355.78, 8900.38,
3253.59, 22.45, 219), i_1 = c(5760.16, 4315.77, 585.28, 2886.11,
23767.55, 3095.39, 6705.94, 6445.96, 10612.49, 2470.32, 126.65,
143.46), i_2 = c(5035.23, 1785.77, 405.05, 4492.64, 21509.39,
3654.16, 10203.03, 9505.1, 6628.42, 1298.06, 111.76, 110.13),
i_3 = c(2798.54, 1920.72, 464.92, 7916.61, 13628.15, 8365.88,
18425.9, 22368.93, 2253.38, 1078.65, 101.34, 134.98), i_4 = c(2344.65,
1407.02, 369.7, 2889.69, 7618.72, 2110.01, 4982.27, 2250.94,
1744.4, 1033.89, 105.74, 212.26), i_5 = c(1653.05, 1724.74,
321.59, 3937.2, 6966.48, 2614.67, 3326.99, 2371.44, 1082.43,
970.25, 123.51, 491.92), i_6 = c(1584.14, 3399.31, 392.24,
3957.88, 8042.5, 2614.46, 2371.67, 1896.1, 1201.83, 1314.06,
161.23, 892.91)), row.names = 13:24, class = "data.frame"),
`3` = structure(list(vol = c(850.15, 218.58, 0, 0, 38959.27,
3081.31, 3441.35, 2760.54, 0, 2826.8, 0, 34.12), i_1 = c(6048.28,
3545.14, 1566.05, 2866.46, 20149.24, 1459.03, 2051.68, 2047.74,
5059.57, 3369.86, 129.37, 361.49), i_2 = c(1728.12, 1530.14,
364.37, 4761.97, 6934.24, 1802.96, 5394.15, 3972.96, 510.25,
989.3, 109.05, 322.37), i_3 = c(1182.59, 750.55, 311.19,
5540.96, 4544.96, 2535.07, 8926.35, 7209.61, 423.66, 446.62,
92.21, 282.36), i_4 = c(1179.1, 645.18, 283.97, 4616.66,
5063.41, 3110.14, 9240.41, 4752.86, 744.85, 648.44, 100.52,
311.67), i_5 = c(1653.05, 1724.74, 321.59, 3937.2, 6966.48,
2614.67, 3326.99, 2371.44, 1082.43, 970.25, 123.51, 491.92
), i_6 = c(1584.14, 3399.31, 392.24, 3957.88, 8042.5, 2614.46,
2371.67, 1896.1, 1201.83, 1314.06, 161.23, 892.91)), row.names = 25:36, class = "data.frame"))
dput(list.2[1:3])
list(`1` = structure(list(Index = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), itenm1 = c(998399, 998399, 998399, 998399, 998399,
998399, 998399, 998399, 998399, 998399, 998399, 998399 ), j_1 =
c(-261.62831, -605.82802, -190.35225, -802.27542, -835.07636,
-709.70814, -444.26492, -207.96871, -986.93606, -968.29324, -7675.97567,
-1271.43424), j_2 = c(0, -188.67302, 0, -799.17034, 0, 247.70379, 0, 0, 1051.71715, -27.94787, 0, -13.34628), j_3 = c(0, 0, 0, 0, 0,
-207.58736, 0, 0, -2333.43115, -1346.57579, 0, -205.13053 ), j_4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), j_5 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), j_6 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)),
row.names = c(NA, 12L), class = "data.frame"),
`2` = structure(list(Index = c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), itenm1 = c(998398, 998398, 998398, 998398,
998398, 998398, 998398, 998398, 998398, 998398, 998398, 998398
), j_1 = c(-106.64606, -203.78915, -76.30121, -310.10454,
-321.62536, -227.3462, -160.82221, -70.87354, -286.94001,
-137.28382, -3779.42484, -604.71574), j_2 = c(0, -96.94433,
0, -297.21757, 0, 67.67053, 0, 0, 309.38773, -8.42931, 0,
-6.7299), j_3 = c(0, 0, 0, 0, 0, -56.71107, 0, 0, -686.43453,
-406.13843, 0, -103.43761), j_4 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), j_5 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), j_6 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = 13:24, class = "data.frame"),
`3` = structure(list(Index = c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), itenm1 = c(998397, 998397, 998397, 998397,
998397, 998397, 998397, 998397, 998397, 998397, 998397, 998397
), j_1 = c(-238.10472, -543.97414, -71.04739, -756.58841,
-782.7918, -667.84871, -424.38314, -193.82405, -638.12855,
-319.65804, -6693.88425, -1189.81911), j_2 = c(0, -182.11783,
0, -750.99738, 0, 233.80836, 0, 0, 683.61007, -18.48993,
0, -11.46144), j_3 = c(0, 0, 0, 0, 0, -195.94234, 0, 0, -1516.71678,
-890.87644, 0, -176.16082), j_4 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), j_5 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), j_6 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = 25:36, class = "data.frame"))
答案 0 :(得分:1)
下面,我提出了三种方法及其基准:mapply
,嵌套的for
循环以及RcppArmadillo
。
数据:
#Data Generation-----------------------
set.seed(200)
a = matrix(sample(50,16), 4)
b = matrix(sample(50,16), 4)
c = matrix(sample(50,16), 4)
d = matrix(sample(50,16), 4)
e = matrix(sample(50,16), 4)
f = matrix(sample(50,16), 4)
#Creating index and binding------------
r = cbind(rep(1,4),a)
s = cbind(rep(2,4),b)
m = cbind(rep(3,4),c)
o = as.data.frame(rbind(r,s,m))
t = cbind(rep(1,4),d)
u = cbind(rep(2,4),e)
v = cbind(rep(3,4),f)
p = as.data.frame(rbind(t,u,v))
#Splitting the data -------------------
list.1 = split(o[,-1], o$V1) #o$V1 is the index column
list.2 = split(p[,-1], p$V1) #o$V1 is the index column
list.1 = lapply(list.1, as.matrix)
list.2 = lapply(list.2, as.matrix)
在这里,我通过先删除id列并将list.1
和list.2
的每个元素转换为矩阵来作弊。这样做可以提高我们函数的性能。
初始化功能:
# Rcpp -----------------------------------------------
library(Rcpp)
library(RcppArmadillo)
cppFunction(depends = "RcppArmadillo",
"arma::mat sumprod_Rcpp(List x, List y){
List xlist(x);
List ylist(y);
int n = xlist.size();
arma::mat m(n,n);
for(int i=0; i<n; i++) {
for(int j=0; j<n; j++){
arma::mat xMat = xlist[i];
arma::mat yMat = ylist[j];
arma::vec v = arma::vectorise(xMat*yMat);
m(j,i) = sum(v);
}
}
return(m);
}
"
)
# Nested For -----------------------------------------
sumprod_for <- function(x, y){
mat <- matrix(NA,length(list.1),length(list.1))
for(i in 1:length(list.1)){
for(j in 1:length(list.1)){
mat[j,i] <- sum(x[[i]] %*% y[[j]])
}
}
return(mat)
}
# Mapply ---------------------------------------------
sumprod_mapply <- function(x, y){
matrix(mapply(function(j, k){
sum(x[[j]] %*% y[[k]])
},
rep(1:length(list.1), each = length(list.1)),
rep(1:length(list.1), length(list.1))
),
length(list.1),
length(list.1)
)
}
# Ryan's sapply --------------------------------------
sumprod_sapply <- function(x, y){
sapply(x, function(j){
lapply(y, function(k) sum(j %*% k))
})
}
检查输出是否相同:
identical(sumprod_mapply(list.1, list.2), matrix(unlist(sumprod_sapply(list.1, list.2)), length(list.1), length(list.1)))
# [1] TRUE
identical(sumprod_mapply(list.1, list.2), sumprod_for(list.1, list.2))
# [1] TRUE
identical(sumprod_mapply(list.1, list.2), sumprod_Rcpp(list.1, list.2))
# [1] TRUE
sumprod_Rcpp(list.1, list.2)
# [,1] [,2] [,3]
# [1,] 44882 40505 49670
# [2,] 29750 26897 32260
# [3,] 45898 41248 50847
sumprod_for(list.1, list.2)
# [,1] [,2] [,3]
# [1,] 44882 40505 49670
# [2,] 29750 26897 32260
# [3,] 45898 41248 50847
sumprod_mapply(list.1, list.2)
# [,1] [,2] [,3]
# [1,] 44882 40505 49670
# [2,] 29750 26897 32260
# [3,] 45898 41248 50847
sumprod_sapply(list.1, list.2)
# 1 2 3
# 1 44882 40505 49670
# 2 29750 26897 32260
# 3 45898 41248 50847
基准化:
library(microbenchmark)
microbenchmark(sumprod_mapply(list.1, list.2),
sumprod_sapply(list.1, list.2),
sumprod_for(list.1, list.2),
sumprod_Rcpp(list.1, list.2),
times = 10000L)
结果:
Unit: microseconds
expr min lq mean median uq max neval
sumprod_mapply(list.1, list.2) 34.345 39.082 51.274501 41.846 62.373 2448.292 10000
sumprod_sapply(list.1, list.2) 37.108 42.635 56.119414 45.398 67.504 2324.733 10000
sumprod_for(list.1, list.2) 10.264 13.422 17.685540 15.001 22.502 120.008 10000
sumprod_Rcpp(list.1, list.2) 2.369 3.948 5.247494 4.738 6.317 88.032 10000
寻找性能提升时,Rcpp
的实现不会出错。但是令人惊讶的是,sumprod_for
比sumprod_mapply
和sumprod_sapply
快得多,这可能是由于mapply
隐式地将列表输出强制为向量。随时提出更多解决方案,我将它们添加到基准中。