Question

我实现了一个递归扫描（前缀和）算法，我在下面包含了这个算法。在这里，我简单地生成大小功率为2的随机列表，直到第27次幂，检查简单的顺序扫描是否准确。它有效。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <mkl.h>

int *pscan(int *x, int n, int z, int chunk_size);
int reduce(int *x, int n);

int main(int argc, char **argv)
{
  int n;
  int i, j, k;
  int *x, *seq, *r;
  double begin, end;

  srand48(time(0));

  /* Randomly generate array of size n. */

  for (k = 2; k < 28; k++) {
    n = (int) pow(2, k);

    seq = (int *) malloc(sizeof(int) * n);
    x = (int *) malloc(sizeof(int) * n);

    for (i = 0; i < n; i++) {
      x[i] = lrand48() % 100 - 50;
      seq[i] = x[i];
    }

    /* Parallel scan. */

    begin = dsecnd();

    r = pscan(x, n, 0, 2);

    end = dsecnd();

    printf("%d %lf\n", n, end - begin);

    /* Sequential check. */

    for (i = 1; i < n; i++) {
      seq[i] = seq[i - 1] + seq[i];
    }

    for (i = 0; i < n; i++) {
      if (r[i] != seq[i]) {
        fprintf(stderr, "AGGGHHH!!! ERROR.  Found with vector: \n");
        for (j = 0; j < n; j++) {
          printf("%d ", x[i]);
        }
        printf("\n");
        exit(1);
      }
    }
    free(r);
    free(x);
    free(seq);
  }

  return 0;
}

/* Perform parallel scan. */
int *pscan(int *x, int n, int z, int chunk_size)
{
  int i, j;
  int *sums, *sumscan, *scan, **fsum, *rv;

  /* Base case, serially scan a chunk. */
  if (n <= chunk_size) {
    scan = (int *) malloc(sizeof(int) * n);

    scan[0] = x[0] + z;
    for (i = 1; i < n; i++) {
      scan[i] = x[i] + scan[i - 1];
    }

    return scan;
  }

  sums = (int *) malloc(sizeof(int) * (n / chunk_size));

  /* Reduce each chunk of the array. */
  for (i = 0; i < n / chunk_size; i++) {
    sums[i] = reduce(&x[i * chunk_size], chunk_size);
  }

  /* Perform a scan on the sums. */
  sumscan = pscan(sums, n / chunk_size, 0, chunk_size);

  free(sums);

  fsum = (int **) malloc(sizeof(int *) * (n / chunk_size));

  /* Perform a recursive scan on each chunk, using
     the appropriate offset from the sums scan. */
  for (i = 0; i < n / chunk_size; i++) {
    if (i > 0) {
      fsum[i] = pscan(&x[i * chunk_size], chunk_size, sumscan[i - 1], chunk_size);
    } else {
      fsum[i] = pscan(&x[i * chunk_size], chunk_size, 0, chunk_size);
    }
  }

  free(sumscan);

  rv = (int *) malloc(sizeof(int) * n);

  /* Join the arrays. */
  for (i = 0; i < n / chunk_size; i++) {
    for (j = 0; j < chunk_size; j++) {
      rv[i * chunk_size + j] = fsum[i][j];
    }
  }

  for (i = 0; i < n / chunk_size; i++) {
    free(fsum[i]);
  }

  free(fsum);

  return rv;
}

/* Serial reduction. */
int reduce(int *x, int n)
{
  int i;
  int sum;

  sum = 0;

  for (i = 0; i < n; i++) {
    sum += x[i];
  }

  return sum;
}

现在，我想将它并行化。因为我感觉有点时髦，我已经破解了Cilk的实现。我只是将两个主要for循环替换为并行化1）减少和2）每个块的递归扫描，使用块减少的适当扫描作为偏移。看起来就是这样。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cilk/cilk.h>
#include <mkl.h>

int *pscan(int *x, int n, int z, int chunk_size);
int reduce(int *x, int n);

int main(int argc, char **argv)
{
  int n;
  int i, j, k;
  int *x, *seq, *r;
  double begin, end;

  srand48(time(0));

  /* Randomly generate array of size n. */

  for (k = 2; k < 28; k++) {
    n = (int) pow(2, k);

    seq = (int *) malloc(sizeof(int) * n);
    x = (int *) malloc(sizeof(int) * n);

    for (i = 0; i < n; i++) {
      x[i] = lrand48() % 100 - 50;
      seq[i] = x[i];
    }

    /* Parallel scan. */

    begin = dsecnd();

    r = pscan(x, n, 0, 2);

    end = dsecnd();

    printf("%d %lf\n", n, end - begin);

    /* Sequential check. */

    for (i = 1; i < n; i++) {
      seq[i] = seq[i - 1] + seq[i];
    }

    for (i = 0; i < n; i++) {
      if (r[i] != seq[i]) {
        fprintf(stderr, "AGGGHHH!!! ERROR.  Found with vector: \n");
        for (j = 0; j < n; j++) {
          printf("%d ", x[i]);
        }
        printf("\n");
        exit(1);
      }
    }
    free(r);
    free(x);
    free(seq);
  }

  return 0;
}

/* Perform parallel scan. */
int *pscan(int *x, int n, int z, int chunk_size)
{
  int i, j;
  int *sums, *sumscan, *scan, **fsum, *rv;

  /* Base case, serially scan a chunk. */
  if (n <= chunk_size) {
    scan = (int *) malloc(sizeof(int) * n);

    scan[0] = x[0] + z;
    for (i = 1; i < n; i++) {
      scan[i] = x[i] + scan[i - 1];
    }

    return scan;
  }

  sums = (int *) malloc(sizeof(int) * (n / chunk_size));

  /* Reduce each chunk of the array. */
  cilk_for (i = 0; i < n / chunk_size; i++) {
    sums[i] = reduce(&x[i * chunk_size], chunk_size);
  }

  /* Perform a scan on the sums. */
  sumscan = pscan(sums, n / chunk_size, 0, chunk_size);

  free(sums);

  fsum = (int **) malloc(sizeof(int *) * (n / chunk_size));

  /* Perform a recursive scan on each chunk, using
     the appropriate offset from the sums scan. */
  cilk_for (i = 0; i < n / chunk_size; i++) {
    if (i > 0) {
      fsum[i] = pscan(&x[i * chunk_size], chunk_size, sumscan[i - 1], chunk_size);
    } else {
      fsum[i] = pscan(&x[i * chunk_size], chunk_size, 0, chunk_size);
    }
  }

  free(sumscan);

  rv = (int *) malloc(sizeof(int) * n);

  /* Join the arrays. */
  for (i = 0; i < n / chunk_size; i++) {
    for (j = 0; j < chunk_size; j++) {
      rv[i * chunk_size + j] = fsum[i][j];
    }
  }

  for (i = 0; i < n / chunk_size; i++) {
    free(fsum[i]);
  }

  free(fsum);

  return rv;
}

/* Serial reduction. */
int reduce(int *x, int n)
{
  int i;
  int sum;

  sum = 0;

  for (i = 0; i < n; i++) {
    sum += x[i];
  }

  return sum;
}

它有效！好吧，它返回正确的结果。它没有达到我所希望的性能。最初的表现是

4 0.000004
8 0.000001
16 0.000002
32 0.000003
64 0.000005
128 0.000011
256 0.000019
512 0.000035
1024 0.000068
2048 0.000130
4096 0.000257
8192 0.000512
16384 0.001129
32768 0.002262
65536 0.004519
131072 0.009065
262144 0.018297
524288 0.037416
1048576 0.078307
2097152 0.157448
4194304 0.313855
8388608 0.625689
16777216 1.251949
33554432 2.589439
67108864 5.084731
134217728 10.402186

用于单线程应用程序，但Cilk版本执行更糟，具有以下运行时

4 0.005383
8 0.000011
16 0.000009
32 0.000111
64 0.000055
128 0.000579
256 0.000339
512 0.000544
1024 0.000701
2048 0.001086
4096 0.001265
8192 0.001742
16384 0.002283
32768 0.003891
65536 0.005398
131072 0.009255
262144 0.020736
524288 0.058156
1048576 0.103893
2097152 0.215460
4194304 0.419988
8388608 0.749368
16777216 1.650938
33554432 2.960451
67108864 5.799836
134217728 11.294398

我有一台24核机器，所以我们显然没有看到我们希望在这里加速的速度。我的第一个想法是Cilk错误地处理递归，导致超额订阅，但Cilk特别应该很好地处理递归。关于如何正确实现这一点的任何提示？我尝试将cilk_for添加到底部for循环（释放所有内容）和倒数第二组循环的内部for循环（加入数组），但这进一步降低了性能。

任何建议都很受欢迎。

但是，请不要告诉我切换到所讨论的Belloch的并行扫描算法here。我已经在Cilk中实现了它，并且它运行得很好。我想看看我是否可以将其性能与这种递归解决方案相匹配。

Answer 1

我通过找到每个问题的最佳块大小来修复我的性能问题。在该块大小，（相同）并行版本比顺序版本执行得更好。

总之，我的一般方法，特别是两个块大小，有一些问题：

我的基准测试方法。在具有调整参数的代码中，使用调整参数的相同值绘制运行时与问题大小没有多大意义，因为最佳值取决于问题大小。
大小为2的块可能存在问题，因为虽然它最大化了并行性，但它也最大化了递归级别的数量，同样也增加了随之而来的开销。
大小为2的块会阻止矢量化。
正如Leeor建议的那样，大小为2的块可能也会导致缓存中的错误共享。

向Leeor道具，引导我朝着正确的方向前进。

如何使用递归扫描算法实现Cilk并行？

1 个答案: