Hello, I am trying to run a program that finds the closest pair using the brute-force technique, as in the PDF here: Caching Performance Stanford.
My original code is:
float compare_points_BF(int N, point *P){
    int i, j;
    float distance = 0, min_dist = FLT_MAX;
    point *p1, *p2;
    unsigned long long calc = 0;

    for (i = 0; i < (N-1); i++){
        for (j = i+1; j < N; j++){
            if ((distance = (P[i].x - P[j].x) * (P[i].x - P[j].x) +
                            (P[i].y - P[j].y) * (P[i].y - P[j].y)) < min_dist){
                min_dist = distance;
                p1 = &P[i];
                p2 = &P[j];
            }
        }
    }
    return sqrt(min_dist);
}
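(For reference, a minimal timing harness along the following lines can produce this kind of measurement. It is only a sketch with assumed details: the point layout, the rand()-based coordinates and the clock() timing are not taken from the original post.)

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <float.h>
    #include <math.h>

    typedef struct { float x, y; } point;      /* assumed layout; the question does not show the definition */

    float compare_points_BF(int N, point *P);  /* the function above */

    int main(void)
    {
        int N = 8192;
        int i;
        float d;
        clock_t start, end;
        point *P = malloc(N * sizeof *P);

        if (P == NULL)
            return 1;

        /* fill the array with random coordinates in [0, 1] */
        srand(1);
        for (i = 0; i < N; i++) {
            P[i].x = (float)rand() / RAND_MAX;
            P[i].y = (float)rand() / RAND_MAX;
        }

        start = clock();
        d = compare_points_BF(N, P);
        end = clock();

        printf("N = %d  min dist = %f  run time: %.3f sec\n",
               N, d, (double)(end - start) / CLOCKS_PER_SEC);

        free(P);
        return 0;
    }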
The program gives roughly these run times:
N        8192    16384   32768   65536   131072   262144   524288    1048576
seconds  0.070   0.280   1.130   5.540   18.080   72.838   295.660   1220.576
         0.080   0.330   1.280   5.190   20.290   80.880   326.460   1318.631
The cached (tiled) version of the above program is:
float compare_points_BF(register int N, register int B, point *P){
    register int i, j, ib, jb, num_blocks = (N + (B-1)) / B;
    register point *p1, *p2;
    register float distance = 0, min_dist = FLT_MAX, regx, regy;

    //break array data in N/B blocks, ib is index for i cached block and jb is index for j strided cached block
    //each i block is compared with the j block (which j block is always after the i block)
    for (i = 0; i < num_blocks; i++){
        for (j = i; j < num_blocks; j++){
            //reads the moving frame block to compare with the i cached block
            for (jb = j * B; jb < ( ((j+1)*B) < N ? ((j+1)*B) : N ); jb++){
                //avoid float comparisons that occur when i block = j block
                //Register Allocated
                regx = P[jb].x;
                regy = P[jb].y;
                for (i == j ? (ib = jb + 1) : (ib = i * B); ib < ( ((i+1)*B) < N ? ((i+1)*B) : N ); ib++){
                    //calculate distance of current points
                    if ((distance = (P[ib].x - regx) * (P[ib].x - regx) +
                                    (P[ib].y - regy) * (P[ib].y - regy)) < min_dist){
                        min_dist = distance;
                        p1 = &P[ib];
                        p2 = &P[jb];
                    }
                }
            }
        }
    }
    return sqrt(min_dist);
}
And some results:
Block_size = 256 N = 8192 Run time: 0.090 sec
Block_size = 512 N = 8192 Run time: 0.090 sec
Block_size = 1024 N = 8192 Run time: 0.090 sec
Block_size = 2048 N = 8192 Run time: 0.100 sec
Block_size = 4096 N = 8192 Run time: 0.090 sec
Block_size = 8192 N = 8192 Run time: 0.090 sec
Block_size = 256 N = 16384 Run time: 0.357 sec
Block_size = 512 N = 16384 Run time: 0.353 sec
Block_size = 1024 N = 16384 Run time: 0.360 sec
Block_size = 2048 N = 16384 Run time: 0.360 sec
Block_size = 4096 N = 16384 Run time: 0.370 sec
Block_size = 8192 N = 16384 Run time: 0.350 sec
Block_size = 16384 N = 16384 Run time: 0.350 sec
Block_size = 128 N = 32768 Run time: 1.420 sec
Block_size = 256 N = 32768 Run time: 1.420 sec
Block_size = 512 N = 32768 Run time: 1.390 sec
Block_size = 1024 N = 32768 Run time: 1.410 sec
Block_size = 2048 N = 32768 Run time: 1.430 sec
Block_size = 4096 N = 32768 Run time: 1.430 sec
Block_size = 8192 N = 32768 Run time: 1.400 sec
Block_size = 16384 N = 32768 Run time: 1.380 sec
Block_size = 256 N = 65536 Run time: 5.760 sec
Block_size = 512 N = 65536 Run time: 5.790 sec
Block_size = 1024 N = 65536 Run time: 5.720 sec
Block_size = 2048 N = 65536 Run time: 5.720 sec
Block_size = 4096 N = 65536 Run time: 5.720 sec
Block_size = 8192 N = 65536 Run time: 5.530 sec
Block_size = 16384 N = 65536 Run time: 5.550 sec
Block_size = 256 N = 131072 Run time: 22.750 sec
Block_size = 512 N = 131072 Run time: 23.130 sec
Block_size = 1024 N = 131072 Run time: 22.810 sec
Block_size = 2048 N = 131072 Run time: 22.690 sec
Block_size = 4096 N = 131072 Run time: 22.710 sec
Block_size = 8192 N = 131072 Run time: 21.970 sec
Block_size = 16384 N = 131072 Run time: 22.010 sec
Block_size = 256 N = 262144 Run time: 90.220 sec
Block_size = 512 N = 262144 Run time: 92.140 sec
Block_size = 1024 N = 262144 Run time: 91.181 sec
Block_size = 2048 N = 262144 Run time: 90.681 sec
Block_size = 4096 N = 262144 Run time: 90.760 sec
Block_size = 8192 N = 262144 Run time: 87.660 sec
Block_size = 16384 N = 262144 Run time: 87.760 sec
Block_size = 256 N = 524288 Run time: 361.151 sec
Block_size = 512 N = 524288 Run time: 379.521 sec
Block_size = 1024 N = 524288 Run time: 379.801 sec
As we can see, the run times are slower than for the non-cached code. Is this due to compiler optimization? Is the code bad, or is it just that the algorithm does not benefit from tiling? I compiled a 32-bit executable with VS 2010. Thanks in advance!
Answer 0 (score: 1)
Tiling may be an old concept, but it is still very relevant today. In the original code, for each i you could reuse most of the P[j] elements while they are still cached, but only if the inner loop is short enough for them to fit there. The actual size should be determined by the cache level you want to tile for: L1 gives the best performance, but since it is also the smallest you would need small blocks and the tiling overhead might be too high; L2 allows bigger tiles at somewhat lower performance, and so on.
Note that you do not need 2-D tiling here; this is not matrix multiplication, you are traversing the same array. You can simply tile the inner loop, since that is the one that overflows the cache; once that is done, the outer loop (i) can run all the way over the currently cached block of inner-loop elements. There is really no point in 2-D tiling, because nobody is going to reuse the elements of the outer loop (unlike matrix mul). A sketch of this single-loop tiling is shown below.
So, assuming Point is 64 bits large, you can safely fit 512 such array elements in a 32 KB L1, or 4096 elements in a 256 KB L2. You will miss once per block on P[i] when i is outside the range of the current j block, but that is negligible.
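For illustration, here is a minimal sketch of that single-loop tiling. It is not code from the question or from this answer; it assumes point has float members x and y, and that B is chosen so one block of P fits in the cache level being targeted.

    #include <float.h>   /* FLT_MAX */
    #include <math.h>    /* sqrt    */

    float compare_points_blocked_1d(int N, int B, point *P)
    {
        int i, j, jb, jend, jstart;
        float xi, yi, dx, dy, d, min_dist = FLT_MAX;

        for (jb = 0; jb < N; jb += B) {                /* one block of the inner (j) index        */
            jend = (jb + B < N) ? (jb + B) : N;        /* this block of P stays resident in cache */

            for (i = 0; i < jend - 1; i++) {           /* outer index streams over P once per block */
                jstart = (i + 1 > jb) ? (i + 1) : jb;  /* keep only pairs with i < j              */
                xi = P[i].x;
                yi = P[i].y;

                for (j = jstart; j < jend; j++) {
                    dx = P[j].x - xi;
                    dy = P[j].y - yi;
                    d  = dx * dx + dy * dy;
                    if (d < min_dist)
                        min_dist = d;
                }
            }
        }
        return sqrt(min_dist);
    }

With this layout the block P[jb..jend) is loaded once and reused for every i, while P[i] is streamed through, which is exactly the "miss once per block on P[i]" behaviour described above.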
By the way, this explanation may itself become outdated, since a good enough compiler might try to do all of this for you. It is fairly complicated though, so I am a bit skeptical that any of the common ones would even try, but it should be easy to prove here that the reordering is safe. One could of course argue that a "good enough compiler" is a paradox, but that is off topic...
Answer 1 (score: 1)
This is an interesting case. The compiler did a poor job of loop-invariant hoisting in the two inner loops. That is, the two inner for loops evaluate the following conditions on every iteration:
((j+1)*B) < N ? ((j+1)*B) : N
and
((i+1)*B) < N ? ((i+1)*B) : N
Both the evaluation and the branching are expensive; but they are in fact loop invariants for the two inner for loops. Once I manually hoisted them out of the two inner for loops, I was able to make the cache-optimized version beat the unoptimized one (by 10% when N == 524288 and by 30% when N == 1048576).
Here is the modified code (really a simple change, look for u1 and u2):
//break array data in N/B blocks, ib is index for i cached block and jb is index for j strided cached block
//each i block is compared with the j block (which j block is always after the i block)
for (i = 0; i < num_blocks; i++){
    for (j = i; j < num_blocks; j++){
        int u1 = (((j+1)*B) < N ? ((j+1)*B) : N);
        int u2 = (((i+1)*B) < N ? ((i+1)*B) : N);
        //reads the moving frame block to compare with the i cached block
        for (jb = j * B; jb < u1; jb++){
            //avoid float comparisons that occur when i block = j block
            //Register Allocated
            regx = P[jb].x;
            regy = P[jb].y;
            for (i == j ? (ib = jb + 1) : (ib = i * B); ib < u2; ib++){
                //calculate distance of current points
                if ((distance = (P[ib].x - regx) * (P[ib].x - regx) +
                                (P[ib].y - regy) * (P[ib].y - regy)) < min_dist){
                    min_dist = distance;
                    p1 = &P[ib];
                    p2 = &P[jb];
                }
            }
        }
    }
}