Question

我有一段代码可以从“double ** times”计算一个值。假设“times”的维度为[nsims] [N]（使用malloc ...创建），其中int N = 40且int nsims = 50000。

结果存储在“double ** moments”中。所以我们有5个嵌套的for循环。

然而问题是速度，因为这段代码需要运行大约100万次。

我已经在使用线程（此处未显示）将最内层的for循环拆分为10个并行线程，这已经节省了大量时间。

有没有人看到其他优化可能性，特别是关于不同的数据结构或类似的东西？

即使我没有“interm = ...”公式，它仍然需要花费太多时间。

/*
 * For every pair (j, k) with 2 <= j <= k <= N, accumulate in `moment` the
 * sum over all pairs (i, l) with 2 <= i <= l <= N of
 *     i * l * (1/nsims) * sum_a times[a][k]*times[a][j]*times[a][i]*times[a][l]
 * and store the result in moments[0][pcount], advancing pcount once per
 * (j, k) pair.
 *
 * NOTE(review): `interm` is reset to 0 after each (i, l) pair but is not
 * initialised anywhere in this excerpt -- presumably zeroed before the
 * loops; confirm.
 */
for(j=2;j<=N;j++) {     
    for(k=j;k<=N;k++) {
        moment=0;
        for(i=2;i<=N;i++) {
            for(l=i;l<=N;l++) {
                /* mmethod is not modified inside these loops, so this
                 * comparison yields the same result on every iteration. */
                if(strcmp(mmethod, "emp")==0) {
                    /* Sum of the four-way product over all nsims samples. */
                    for(a=0;a<nsims;a++) {
                        interm=interm + (double) times[a][k] *
                                        times[a][j]*times[a][i] *
                                        times[a][l];    
                    }
                    /* Turn the sum into a sample mean, weight it by i*l,
                     * then clear the accumulator for the next (i, l) pair. */
                    interm = (double) interm/nsims;
                    moment = moment + (interm*i*l);
                    interm=0;
                }
            }
        }
        /* NOTE(review): the i and l loops have already finished here, so
         * i == N+1 and l == N+1; changed_times[l] and changed_times[i]
         * therefore always read index N+1. Verify that this is intended
         * and within the bounds of changed_times. */
        if(!(changed_times[k]==0
             && changed_times[j]==0
             && changed_times[l]==0
             && changed_times[i]==0))
        {
            moments[0][pcount]=(double) moment;
        } else {
            /* Self-assignment: leaves the stored value unchanged. */
            moments[0][pcount]=moments[0][pcount];
        }
        pcount++;
    }
}

Answer 1

请注意，在最内层循环中，每次迭代都要重新查找并计算乘积times[a][k]*times[a][j]*times[a][i]，尽管对于给定的a，这个乘积在l的所有取值下都是相同的。这些重复的乘法和内存访问开销可能很大。（也许编译器足够聪明，能够把它优化掉，我不确定。）你可以尝试在l循环之外把这些值缓存起来，如下所示：

  ...
  /* Per-sample cache of the three-way product, which does not depend on l.
   * NOTE(review): akji is an automatic array of nsims doubles; with the
   * nsims = 50000 quoted in the question that is about 400 kB of stack --
   * confirm this is acceptable or allocate it elsewhere. */
  double akji[nsims];
  for (a = 0; a < nsims; ++a) { akji[a] = times[a][k]*times[a][j]*times[a][i]; }
  for(l=i;l<=N;l++) {
/* Sum over samples of the cached product times the l-th factor. */
interm=0;
for(a=0;a<nsims;a++) {
  interm += akji[a]*times[a][l]; 
}
/* The factor i and the division by nsims are applied once, below. */
moment += (interm*l);
  }
  /* NOTE(review): this rescales all of `moment` by i/nsims. If `moment`
   * already holds contributions from earlier values of i (as it does in
   * the question's code), those are rescaled too; a separate per-i partial
   * sum may be needed -- confirm against the elided surrounding code. */
  moment = moment * i / nsims;
  ...

Answer 2

我想应该从更高层次的问题描述开始。

但作为次要选项，我建议交换数组索引，以便更容易编写一个速度极快的SSE内部循环，它结合了四个（可能是不同的）向量：

 /* Sketch with the array indices swapped: each row times[n] holds nsims
  * consecutive doubles, and tend points one past the last row. */
 double times[N+1][nsims], *tend = times[N+1];
 /* j, k, i, l are now pointers to the start of a row rather than integer
  * indices; adding nsims advances a pointer to the next row. */
 double *j,*k,*i,*l;
 for (j=times[2];j<tend;j+=nsims)
  for (k=j;k<tend;k+=nsims)
   if (strcmp( )) ... /* One _can_ move this elsewhere, but why bother? */
   for (i=times[2];i<tend;i+=nsims)
    for (l=i;l<tend;l+=nsims) {
      /* efficient_sse_implementation is a placeholder that is not defined
       * here; presumably it combines the four rows element-wise over nsims
       * entries and returns the sum -- confirm. */
      interm = efficient_sse_implementation(j,k,i,l, nsims);
      ...
    }

通过为不同数组少于4个的情况编写专门的内核，还可以实现一些小的优化。（在这种情况下，每一步可以省去一次内存操作。）

修改

在这种情况下，for(j=2;j<=N;j++) for (k=j;k<=N;k++)这一循环模式重复出现了两次，仅这一点就意味着存在更高层次优化的可能——这段代码执行的究竟是什么运算？在琢磨这个问题的同时，这种模式还提示了另一种方法：缓存780个（左右）子乘积，同时进行循环分块（loop blocking）。这种方法应该不会出现我在给gcbenison先生的评论中提到的那个问题。

/*
 * Loop-blocking sketch: the samples are processed in chunks of 100.
 * For each chunk, the pairwise products times[i][.]*times[j][.] for all
 * 2 <= i <= j <= N are stored in precalc[k][.], and then every pair of
 * precalc rows is multiplied element-wise and summed into partial_product.
 * The declarations of A, i, j, a, precalc, times and partial_product are
 * outside this excerpt.
 */
for (A=0;A<50000;A+=100) {
    int k=0;
    for (i=2;i<=N;i++)
        for (j=i;j<=N;j++,k++)
            for (a=0;a<100;a++)
                precalc[k][a]=times[i][A+a]*times[j][A+a];
    for (i=0;i<k;i++)                 // Now i loops from 0..779 or so
        for (j=0;j<k;j++) {
            for (a=0;a<100;a++)
                partial_product+=precalc[i][a]*precalc[j][a];
            // accumulate also partial_product to moment
        }
}

免责声明：上述代码未经验证，不过应该存在某个最佳的块大小（不一定是100）（而且这种方法也可能比之前的更差）。另请注意，此方法的预计算表会占用大量内存。（块大小取100时需要624000字节内存，听起来还可以接受。若要低于256k，块长度最多只能取42。）

编辑2 ：

//请注意，EDIT_1中的循环计算P[2][a]*P[3][a]和P[3][a]*P[2][a]。

/*
 * Upper-triangle variant of the pairwise loop: j starts at i, and each
 * partial product is added both to moment[i] and to the entry selected by
 * lower_triangle(i). precalc, moment, partial_product and lower_triangle
 * are defined outside this excerpt.
 */
for (i=0;i<k;i++)           // Now i loops from 0..779 or so, but... we can limit the
    for (j=i;j<k;j++) {     // calculation to the upper triangle of the 780^2 matrix
        for (a=0;a<100;a++)
            partial_product+=precalc[i][a]*precalc[j][a];
        moment[i]+=partial_product;
        moment[lower_triangle(i)]+=partial_product;  // <-- 50% speed increase
    }

编辑3：这里有一些尝试：

gcc -O4 -DCACHELEVEL=2 -DPOPULATE=1 -DARRAY_OPT=1 && time ./a.out

POPULATE初始化数组（假设非零内容很重要）

ARRAY_OPT=1将数组索引切换为（可能）更好的顺序

CACHELEVEL=2或3用于选择缓存哪些中间结果。

源代码中还有一个STRCMP宏，可用来比较memcmp、strcmp与常量'1'的开销。

NOT TODO 1 ：带缓存值的LOOP_BLOCKING - 降低性能
TODO 2 ：仅计算上三角部分
TODO 3 ：弄清changed_times[n]和moments[0][p]的含义——就目前的代码来看，没有任何计算结果被保存！

/*
 * Benchmark harness for the nested-loop moment computation.
 *
 * Compile-time switches (pass with -D):
 *   POPULATE   - when defined, init() fills the table with non-zero values.
 *   ARRAY_OPT  - when non-zero, the table is laid out as times[n][a]
 *                instead of times[a][n].
 *   CACHELEVEL - 2: cache1 holds the j*k product per (j, k) and cache2 is
 *                   built from cache1 for every i;
 *                3: cache2 is computed for every i directly from the table;
 *                any other value: the innermost loop multiplies all four
 *                factors itself (cache2 is still filled but not read).
 */
#include <stddef.h>
#include <stdio.h>

#define N 40
#define nsims 8000

#if ARRAY_OPT
#define TIMES(n,a) times[n][a]
double times[N+1][nsims]; // [nsims];
#else
#define TIMES(n,a) times[a][n]
double times[nsims][N+1];
#endif

/* Condition tested in front of the innermost loop. */
#define STRCMP 1
// vs.
// #define STRCMP1 strcmp(mmethod, "emp")==0

/* Fill the table with a deterministic non-zero pattern when POPULATE is defined. */
void init(void)
{
#ifdef POPULATE
    int i,a;
    for (i=0;i<=N;i++)
        for (a=0;a<nsims;a++)
            TIMES(i,a) = (double)((i^a)&7) - 3.5;
#endif
}

double moments[4000] = { 0 };
double cache1[nsims];   /* TIMES(j,a)*TIMES(k,a), filled when CACHELEVEL == 2 */
double cache2[nsims];   /* product of the j, k and i factors */

int main(void)
{
    int j,k,i,l,a, pcount=0;
    init();
    int changed_times[N+1]={0};
    char *mmethod="emp";
    /* interm is accumulated with += before any plain assignment, so it has
     * to start at zero; moment is zeroed as well so the final printf never
     * reads an indeterminate value. */
    double moment=0,interm=0;

    for(j=2;j<=N;j++) {
        for(k=j;k<=N;k++) {
#if CACHELEVEL == 2
            for (a=0;a<nsims;a++)
                cache1[a]=TIMES(j,a)*TIMES(k,a);
#endif
            moment=0;
            for(i=2;i<=N;i++) {
#if CACHELEVEL == 3
                for (a=0;a<nsims;a++)
                    cache2[a]=TIMES(j,a)*TIMES(k,a)*TIMES(i,a);
#else
                for (a=0;a<nsims;a++)
                    cache2[a]=cache1[a]*TIMES(i,a);
#endif
                for(l=i;l<=N;l++) {
                    if(STRCMP) {
                        for(a=0;a<nsims;a++) {
#if CACHELEVEL >= 2
                            interm += (double) cache2[a]*TIMES(l,a);
#else
                            interm=interm + (double) TIMES(k,a) * TIMES(j,a)
                                          * TIMES(i,a) * TIMES(l,a);
#endif
                        }
                        /* Sample mean, weighted by i*l; then reset the accumulator. */
                        interm = (double) interm/(double)nsims;
                        moment = moment + (interm*i*l);
                        interm=0;
                    }
                }
            }
            //if(!(changed_times[k]==0
            //     && changed_times[j]==0
            //     && changed_times[l]==0
            //     && changed_times[i]==0))
            //{
            //    moments[0][pcount]=(double) moment;
            //    changed_times[k]++;changed_times[j]++; /* or what? */
            //    changed_times[l]++;changed_times[i]++;
            //} else {
            //    moments[0][pcount]=moments[0][pcount];
            //}
            pcount++;
        }
    }
    /* Only the number of (j, k) pairs and the last moment are reported. */
    printf("%d %f\n",pcount, moment);
    return 0;
}

Answer 3

第一个明显的优化是将strcmp()移出循环。

字符串比较可能需要相当长的时间（实际上并不多，但重复此调用这么多次会产生很大的不同）。此外，此调用可能永远不会被编译器优化，而其结果在整个处理过程中是恒定的。因此，在进入嵌套循环之前将结果存储在临时布尔变量中，并仅测试循环内的布尔值。

另外，和往常一样，在优化代码时，请确保以发布（release）配置进行编译（不带调试信息），并打开所有可用的编译器优化选项。

5嵌套for循环，速度优化

3 个答案: