我有一个dim n x 140000的矩阵(名为rating)和另一个dim nxn的矩阵(名为trust),其中n在我改变组时会发生变化,n可能有1-15000的值。我需要通过信任乘以每列评级。例如:
trust= rating=
a1 a2 a3 a4 a5 1 2 3 4 5 6 7 8
b1 b2 b3 b4 b5 2 5 7 8 9 2 1 6
c1 c2 c3 c4 c5 3 5 3 6 8 1 2 5
d1 d2 d3 d4 d5 4 7 8 2 4 5 6 7
e1 e2 e3 e4 e5 5 2 5 7 8 9 1 4
answer1= answer2=
a1.1 a2.2 a3.3 a4.4 a5.5 a1.2 a2.5 a3.5 a4.7 a5.2
b1.1 b2.2 b3.3 b4.4 b5.5 b1.2 b2.5 b3.5 b4.7 b5.2
c1.1 c2.2 c3.3 c4.4 c5.5 c1.2 c2.5 c3.5 c4.7 c5.2
d1.1 d2.2 d3.3 d4.4 d5.5 d1.2 d2.5 d3.5 d4.7 d5.2
e1.1 e2.2 e3.3 e4.4 e5.5 e1.2 e2.5 e3.5 e4.7 e5.2
和answer3必须乘以第3列,依此类推。然后添加answer1,answer2,...的每一行并存储到向量中。然后将每个向量存储到列表中以供将来使用。
for (k in 1:ncol(rating)) {
clmy <- as.matrix(rating[, k])
answer <- sweep(trust, MARGIN = 2, clmy, '*')
sumtrustbyrating <- rowSums(answer)
LstsumRbyT[[k]] <- sumtrustbyrating
sumtrustbyrating = NULL
}
如果我将ncol(rating)
更改为较小的值(约100),它的工作正常。但对于实际数据,我有140000列。这需要时间,我无法获得最终的执行结果。请帮助我提高代码的性能,以获取庞大的数据集。
答案 0 :(得分:2)
矩阵产品怎么样?或者这太慢了?
rating <- matrix(c(1, 2, 3, 4, 5,2, 5, 5, 6, 3, 3, 4, 1, 2, 1), ncol=3)
trust <- matrix(rep(1:5, rep(5, 1)), 5, byrow=TRUE)
运行上面的代码会产生
LstsumRbyT
[[1]]
[1] 55 55 55 55 55
[[2]]
[1] 66 66 66 66 66
[[3]]
[1] 27 27 27 27 27
与
相同 trust %*% rating
[,1] [,2] [,3]
[1,] 55 66 27
[2,] 55 66 27
[3,] 55 66 27
[4,] 55 66 27
[5,] 55 66 27
如果这还不够,那我可以在RCppArmadillo中稍微改进一下。
添加到基准测试讨论中。如果上面的for循环重命名为f()
,那么我得到
microbenchmark(trust %*% rating, f())
Unit: microseconds
expr min lq mean median uq max neval cld
trust %*% rating 1.418 1.7010 2.97663 2.7215 3.5965 14.452 100 a
f() 593.890 700.9775 764.00515 766.5535 792.6375 1511.104 100 b
这是普通矩阵产品的相当大的加速。
答案 1 :(得分:1)
我会将所有内容矢量化:
library(data.table)
set.seed(666)#in order to have reproducible results
n<-10#number of cols and rows
(trust<-matrix(runif(n*n),ncol=n,nrow=n))
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 0.77436849 0.77589308 0.98422408 0.4697785 0.2444375 0.06913359 0.7748744 0.60379428 0.7659585 0.13247078
[2,] 0.19722419 0.01637905 0.60134555 0.3976166 0.5309707 0.08462063 0.8120639 0.32826395 0.7758464 0.07851311
[3,] 0.97801384 0.09574478 0.03834435 0.8046367 0.1183959 0.12994557 0.2606025 0.66611781 0.3125150 0.37822385
[4,] 0.20132735 0.14216354 0.14149569 0.5088974 0.9833834 0.74613202 0.6515950 0.87478750 0.8422173 0.57962476
[5,] 0.36124443 0.21112624 0.80638553 0.6349154 0.8977528 0.03887918 0.9238039 0.06887527 0.3141499 0.53642512
[6,] 0.74261194 0.81125644 0.26668568 0.4942517 0.7385738 0.68563542 0.2661061 0.79346301 0.7565639 0.10853192
[7,] 0.97872844 0.03654720 0.04270205 0.2801309 0.3773107 0.14397736 0.2661330 0.57142701 0.9675244 0.74031515
[8,] 0.49811371 0.89163741 0.61217452 0.9087104 0.6061688 0.89107996 0.9109179 0.04894407 0.1694229 0.45178964
[9,] 0.01331584 0.48323641 0.55334840 0.7841162 0.5121943 0.08963612 0.5905635 0.98035135 0.6968752 0.64610821
[10,] 0.25994613 0.46666453 0.85350077 0.5589970 0.9892467 0.03773272 0.9181476 0.91453735 0.8726508 0.74929873
(rating<-matrix(sample(n*n),ncol=n,nrow=n))
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 58 19 13 25 23 96 38 100 47 93
[2,] 37 22 45 41 4 18 52 83 89 39
[3,] 87 36 15 40 94 11 31 63 35 10
[4,] 59 88 81 64 68 27 92 56 49 46
[5,] 24 90 8 44 43 82 14 57 79 66
[6,] 95 74 48 70 7 33 34 42 60 50
[7,] 26 65 73 61 32 12 97 98 9 69
[8,] 21 86 1 99 6 72 75 20 71 62
[9,] 29 85 55 30 53 80 77 2 28 51
[10,] 67 91 76 16 5 3 84 54 78 17
功能:
prod1<-function(m1,m2){
res<-NULL
if(dim(m1)[1]==dim(m2)[1])
res<-rbindlist(data.table(rbindlist(data.table(lapply(seq_along(1:nrow(m2)),function(y) {lapply(seq_along(1:nrow(m1)[1]),function(x){m1[,x]*m2[y,x]})})))$V1))
return(res)
}
将产生:(answer1<-prod1(trust,rating))#sequence of arguments DOES matter
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
1: 44.9133724 14.7419685 12.7949130 11.744463 5.622062 6.636824 29.445226 60.379428 36.000049 12.319782
2: 11.4390031 0.3112020 7.8174921 9.940414 12.212325 8.123580 30.858427 32.826395 36.464780 7.301719
3: 56.7248030 1.8191509 0.4984765 20.115918 2.723107 12.474775 9.902897 66.611781 14.688207 35.174818
4: 11.6769863 2.7011073 1.8394440 12.722435 22.617819 71.628674 24.760610 87.478750 39.584213 53.905103
5: 20.9521768 4.0113985 10.4830118 15.872884 20.648315 3.732401 35.104546 6.887527 14.765046 49.887537
6: 43.0714926 15.4138724 3.4669138 12.356293 16.987197 65.821000 10.112033 79.346301 35.558503 10.093469
7: 56.7662495 0.6943967 0.5551267 7.003272 8.678146 13.821827 10.113054 57.142701 45.473646 68.849309
8: 28.8905951 16.9411108 7.9582688 22.717759 13.941883 85.543676 34.614880 4.894407 7.962877 42.016436
9: 0.7723185 9.1814918 7.1935292 19.602904 11.780468 8.605067 22.441414 98.035135 32.753133 60.088064
10: 15.0768755 8.8666260 11.0955099 13.974926 22.752673 3.622341 34.889611 91.453735 41.014587 69.684782
最后,answer2
通过函数
prod2<-function(m1,m2){
res<-NULL
if(dim(m1)[1]==dim(m2)[1])
res<-rbindlist(data.table(rbindlist(data.table(lapply(seq_along(2:nrow(m2)),function(y) {lapply(seq_along(2:nrow(m1)[1]),function(x){m1[,x]*m2[y,x+1]})})))$V1))
return(res)
}
,特别是answer2<-prod2(trust,rating)
,产生:
V1 V2 V3 V4 V5 V6 V7 V8 V9
1: 14.7130013 10.0866100 24.6056020 10.804906 23.46600 2.627076 77.48744 28.378331 71.23414
2: 3.7472596 0.2129277 15.0336387 9.145181 50.97318 3.215584 81.20639 15.428406 72.15371
3: 18.5822630 1.2446822 0.9586087 18.506645 11.36601 4.937932 26.06025 31.307537 29.06390
4: 3.8252197 1.8481260 3.5373923 11.704640 94.40481 28.353017 65.15950 41.115012 78.32621
5: 6.8636441 2.7446411 20.1596381 14.603053 86.18427 1.477409 92.38039 3.237138 29.21594
6: 14.1096269 10.5463338 6.6671419 11.367790 70.90308 26.054146 26.61061 37.292761 70.36044
7: 18.5958403 0.4751135 1.0675513 6.443011 36.22183 5.471140 26.61330 26.857069 89.97977
8: 9.4641605 11.5912864 15.3043631 20.900338 58.19221 33.861038 91.09179 2.300371 15.75633
9: 0.2530009 6.2820733 13.8337100 18.034672 49.17065 3.406172 59.05635 46.076514 64.80939
10: 4.9389764 6.0666389 21.3375191 12.856932 94.96768 1.433843 91.81476 42.983255 81.15652
library(microbenchmark)
library("ggplot2")
set.seed(666)
global_func<-function(n){
trust<-matrix(runif(n*n),ncol=n,nrow=n)
rating<-matrix(sample(n*n),ncol=n,nrow=n)
prod1<-function(m1,m2){
res<-NULL
if(dim(m1)[1]==dim(m2)[1])
res<-rbindlist(data.table(rbindlist(data.table(lapply(seq_along(1:nrow(m2)),function(y) {lapply(seq_along(1:nrow(m1)[1]),function(x){m1[,x]*m2[y,x]})})))$V1))
return(res)
}
prod2<-function(m1,m2){
res<-NULL
if(dim(m1)[1]==dim(m2)[1])
res<-rbindlist(data.table(rbindlist(data.table(lapply(seq_along(2:nrow(m2)),function(y) {lapply(seq_along(2:nrow(m1)[1]),function(x){m1[,x]*m2[y,x+1]})})))$V1))
return(res)
}
return(list(prod1(trust,rating),prod2(trust,rating)))
}
让我们比较时间与列数/行数(n
)---谨慎使用
tm<-microbenchmark(global_func(10),
global_func(50),
global_func(100),
global_func(500),
times = 100
)
autoplot(tm)