我有一个2列矩阵,具有唯一的整数坐标和分数:
> data1<-data.matrix(data.frame("coord"=sample(1:100,50),"scores"=rnorm(25)))
> data1
coord scores
[1,] 22 -0.73799827
[2,] 76 -0.78022310
[3,] 62 0.45633095
[4,] 77 0.56617413
[5,] 60 -0.94876368
[6,] 83 -1.20792643
[7,] 85 -1.13890957
[8,] 78 0.63959763
[9,] 28 0.28039908
[10,] 68 -1.04277456
[11,] 27 0.48755194
[12,] 66 0.09612861
[13,] 69 -1.60932063
[14,] 6 -0.66797103
[15,] 10 -0.56594989
[16,] 50 -0.79548555
[17,] 39 1.13064066
[18,] 75 0.21617203
[19,] 34 -0.13480437
[20,] 54 -1.64825097
[21,] 48 -0.97955118
[22,] 58 0.55307028
[23,] 11 -0.99319227
[24,] 42 -0.58430293
[25,] 37 1.76576096
[26,] 67 -0.73799827
[27,] 65 -0.78022310
[28,] 47 0.45633095
[29,] 72 0.56617413
[30,] 97 -0.94876368
[31,] 57 -1.20792643
[32,] 38 -1.13890957
[33,] 16 0.63959763
[34,] 15 0.28039908
[35,] 86 -1.04277456
[36,] 33 0.48755194
[37,] 80 0.09612861
[38,] 2 -1.60932063
[39,] 93 -0.66797103
[40,] 73 -0.56594989
[41,] 40 -0.79548555
[42,] 26 1.13064066
[43,] 13 0.21617203
[44,] 96 -0.13480437
[45,] 41 -1.64825097
[46,] 59 -0.97955118
[47,] 46 0.55307028
[48,] 43 -0.99319227
[49,] 94 -0.58430293
[50,] 21 1.76576096
和唯一坐标向量:
> centers
[1] 39 31 61 16 48 82 42 76 71 43 93 35 6 100 67 81 70 79 45 17 96 78 69 95 29
我想创建一个矩阵,用于映射data1相对于中心的分数,其中每个中心是矩阵的中间,每行一个中心。换句话说,在矩阵中,我想看到每个“中心”附近有坐标的分数。我采取了以下方法:
> score_matrix<-matrix(nrow=length(centers),ncol=10)
> for(i in 1:length(centers)){
+ data2 <- data1
+ data2[,1] <- data2[,1] - centers[i] + ncol(score_matrix)/2
+ region_scores <- subset(data2,data2[,1] > 0 & data2[,1] <= ncol(score_matrix))
+ score_matrix[i,region_scores[,1]]<-region_scores[,2]
+ }
> print(score_matrix)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] -0.8688788 0.4524561 1.4594981 -1.0552725 -0.1594024 NA -0.4122056 NA NA NA
[2,] -1.0552725 1.5064965 NA -1.8956159 NA NA NA 0.7000265 NA NA
[3,] NA NA NA -0.7334736 NA NA -1.8381591 -1.8381591 -0.7334736 NA
[4,] NA NA -0.3910595 1.5064965 NA -0.1006090 0.1064373 0.4524561 NA NA
[5,] NA NA 0.8967748 NA NA NA NA 0.8458699 -0.1006090 NA
[6,] NA NA -1.8381591 -1.8381591 -0.7334736 NA NA NA NA NA
[7,] -1.3803871 -1.5606603 NA 0.8967748 -0.7036330 NA NA NA NA -1.6780760
[8,] NA NA NA NA 0.8458699 -0.1006090 NA -1.5606603 NA NA
[9,] NA NA NA -1.3673480 1.8448811 1.1304699 NA -0.8317189 0.1064373 -1.4426410
[10,] 0.8967748 -0.7036330 NA NA NA NA -1.6780760 -0.3910595 NA NA
[11,] 1.1304699 NA NA -1.0552725 1.5064965 NA -1.8956159 NA NA NA
[12,] NA NA -1.6780760 -0.7036330 NA NA 0.8967748 NA NA NA
[13,] NA NA NA -1.6780760 -0.7036330 NA NA 0.8967748 NA NA
[14,] NA 0.8458699 -0.1006090 NA -1.5606603 NA NA 0.8458699 -0.1594024 NA
[15,] -0.1006090 NA -1.5606603 NA NA 0.8458699 -0.1594024 NA NA -0.3910595
[16,] 1.8448811 1.1304699 NA -0.8317189 0.1064373 -1.4426410 NA 1.8448811 NA -1.4426410
[17,] NA NA -1.0552725 1.5064965 NA -1.8956159 NA NA NA 0.7000265
[18,] NA NA NA NA NA -1.3673480 1.8448811 1.1304699 NA -0.8317189
[19,] NA NA NA NA -0.7334736 NA NA -1.8381591 -1.8381591 -0.7334736
[20,] NA NA 0.7000265 NA NA NA -0.8688788 0.4524561 1.4594981 -1.0552725
[21,] NA -1.3803871 -1.5606603 NA 0.8967748 -0.7036330 NA NA NA NA
[22,] -0.7334736 NA NA NA NA NA -1.3673480 1.8448811 1.1304699 NA
[23,] NA NA NA NA -1.3673480 1.8448811 1.1304699 NA -0.8317189 0.1064373
[24,] NA 1.4594981 0.7000265 NA -1.3673480 -0.8688788 1.1304699 NA NA -1.0552725
[25,] -0.8317189 0.1064373 -1.4426410 NA 1.8448811 NA -1.4426410 -1.8956159 NA 1.4594981
但是,我应用它的数据集非常大,脚本需要大约24小时才能完成。有没有办法更有效地完成同样的事情?
谢谢,
丹
答案 0 :(得分:2)
我在Rcpp中实现你的功能:
data1 <-data.matrix(data.frame("coord"=sample(1:100,50),"scores"=rnorm(25)))
centers <- unique(data1[,1])
score_matrix<-matrix(nrow=length(centers),ncol=10)
for(i in 1:length(centers)){
data2 <- data1
data2[,1] <- data2[,1] - centers[i] + ncol(score_matrix)/2
region_scores <- subset(data2,data2[,1] > 0 & data2[,1] <= ncol(score_matrix))
score_matrix[i,region_scores[,1]]<-region_scores[,2]
}
library(inline)
library(Rcpp)
src <- '
NumericMatrix data1(Rdata1);
NumericVector centers(Rcenters);
NumericMatrix score_matrix(Rscore_matrix);
NumericVector data2(data1.nrow());
for(int i = 0;i < centers.size();i++) {
data2 = data1.column(0);
data2 = data2 - centers(i) + score_matrix.ncol() / 2;
for(int j = 0, k = 0;j < data2.size();j++) { // subset part
if (data2(j) <= 0)
continue;
if (data2(j) > score_matrix.ncol())
continue;
score_matrix(i, data2(j) - 1) = data1(j,1);
}
}
return score_matrix;
'
f <- cxxfunction(sig=c(Rdata1 = "numeric", Rcenters = "numeric", Rscore_matrix = "numeric"),
plugin="Rcpp", body=src)
score_matrix2<-matrix(nrow=length(centers),ncol=10)
score_matrix2 <- f(data1, centers, score_matrix2)
all.equal(score_matrix, score_matrix2)
library(rbenchmark)
benchmark({
score_matrix<-matrix(nrow=length(centers),ncol=10)
for(i in 1:length(centers)){
data2 <- data1
data2[,1] <- data2[,1] - centers[i] + ncol(score_matrix)/2
region_scores <- subset(data2,data2[,1] > 0 & data2[,1] <= ncol(score_matrix))
score_matrix[i,region_scores[,1]]<-region_scores[,2]
}
}, {
score_matrix2<-matrix(nrow=length(centers),ncol=10)
score_matrix2 <- f(data1, centers, score_matrix2)
})
在上次测试中,Rcpp比我的机器上的R快一倍。
如果速度不够快,您可能希望并行化算法。
尝试R包snow
并重新设计您的算法以获得除法数据和合并结果。
答案 1 :(得分:1)
R重新实施提供了47倍的加速。这是我原始代码的实现
f0 <- function(data1, centers) {
score_matrix <- matrix(nrow=length(centers), ncol=10)
for(i in seq_along(centers)) {
data2 <- data1
data2[,1] <- data2[,1] - centers[i] + ncol(score_matrix) / 2
idx <- data2[,1] > 0 & data2[,1] <= ncol(score_matrix)
region_scores <- data2[idx,]
score_matrix[i,region_scores[,1]] <- region_scores[,2]
}
score_matrix
}
我提取了常见的计算(centers[i] - ncol(score_matrix) / 2
和子集data1
)来获取
f1 <- function(data1, centers, ncol=10) {
score_matrix <- matrix(NA_real_, length(centers), ncol)
ccenters <- centers - ncol / 2
d1 <- data1[,1]
d2 <- data1[,2]
for (i in seq_along(ccenters)) {
score <- d1 - ccenters[i]
idx <- score > 0 & score <= ncol
score_matrix[i, score[idx]] <- d2[idx]
}
score_matrix
}
for循环应该编译好
library(compiler)
f1c <- cmpfun(f1)
和
library(rbenchmark)
data1 <- data.frame(coord=sample(100,50), scores=rnorm(25))
centers <- sort(scan(textConnection("39 31 61 16 48 82 42 76 71 43 93 35 6
100 67 81 70 79 45 17 96 78 69 95 29")))
给出
> identical(f0(data1, centers), f1(data1, centers))
[1] TRUE
> identical(f0(data1, centers), f1c(data1, centers))
[1] TRUE
> benchmark(f0(data1, centers), f1(data1, centers), f1c(data1, centers),
+ replications=10, columns=c("test", "elapsed", "relative"))
test elapsed relative
1 f0(data1, centers) 0.139 46.333
3 f1c(data1, centers) 0.003 1.000
2 f1(data1, centers) 0.005 1.667
原始问题中的目标似乎有些不完整 - 你打算用10,000 x 10矩阵做什么?