Question

更新了，原帖在线下方：

我需要计算中位数，并希望使用O（N）quickselect算法。然而事实证明，当数组不再是平面数组的双精度数，而是结构数组（其中一个元素是用于中值计算的元素）时，运行时间不再与O（N）成比例。

以下平面阵列版本具有近似线性运行时：

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;

double quickselect(unsigned long k, unsigned long n, double *arr)
{
  unsigned long i, ir, j, l, mid;
  double a, temp;

  l=1;
  ir=n-1;
  for (;;) {
    if (ir <= l+1) {
      if (ir == l+1 && arr[ir] < arr[l]) {
        SWAP(arr[l],arr[ir])
      }
      return arr[k];
    } else {
      mid=(l+ir) >> 1;
      SWAP(arr[mid],arr[l+1])
      if (arr[l] > arr[ir]) {
        SWAP(arr[l],arr[ir])
      }
      if (arr[l+1] > arr[ir]) {
        SWAP(arr[l+1],arr[ir])
      }
      if (arr[l] > arr[l+1]) {
        SWAP(arr[l],arr[l+1])
      }
      i=l+1;
      j=ir;
      a=arr[l+1];
      for (;;) {
        do i++; while (arr[i] < a);
        do j--; while (arr[j] > a);
        if (j < i) break;
        SWAP(arr[i],arr[j])
      }
      arr[l+1]=arr[j];
      arr[j]=a;
      if (j >= k) ir=j-1;
      if (j <= k) l=i;
    }
  }
}

int main()
{
  unsigned long i, j, k, l, m;
  unsigned long ntest = 1e2;
  unsigned long N[5] = {1e3, 1e4, 1e5, 1e6, 1e7};
  clock_t start, diff;

  int seed = 215342512; //time(NULL);
  srand(seed);

  double *arr = (double*) malloc(N[4] * sizeof(double));

  for (i=0; i<5; i++)
  {
    start = clock();

    for (j=0; j<ntest; j++)
    {
      for (k=0; k<N[i]; k++)
      {
        arr[k] = (double) rand() / (double) RAND_MAX;
      }

      quickselect(N[i] / 2, N[i], arr);
    }

    diff = clock() - start;

    printf("%lu %.5f\n", N[i], (double) diff / CLOCKS_PER_SEC);
  }
}

给出：

1000 0.00228
10000 0.02014
100000 0.19868
1000000 2.01272
10000000 20.41286

但是，以下带有结构的版本具有非线性运行时：

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;

typedef struct {
  double x;
  double y;
  double z;
  int id;
} point_t;

point_t* quickselect(unsigned long k, unsigned long n, point_t **arr)
{
  unsigned long i, ir, j, l, mid;
  point_t *a, *temp;

  l=1;
  ir=n-1;
  for (;;) {
    if (ir <= l+1) {
      if (ir == l+1 && arr[ir]->x < arr[l]->x) {
        SWAP(arr[l],arr[ir])
      }
      return arr[k];
    } else {
      mid=(l+ir) >> 1;
      SWAP(arr[mid],arr[l+1])
      if (arr[l]->x > arr[ir]->x) {
        SWAP(arr[l],arr[ir])
      }
      if (arr[l+1]->x > arr[ir]->x) {
        SWAP(arr[l+1],arr[ir])
      }
      if (arr[l]->x > arr[l+1]->x) {
        SWAP(arr[l],arr[l+1])
      }
      i=l+1;
      j=ir;
      a=arr[l+1];
      for (;;) {
        do i++; while (arr[i]->x < a->x);
        do j--; while (arr[j]->x > a->x);
        if (j < i) break;
        SWAP(arr[i],arr[j])
      }
      arr[l+1]=arr[j];
      arr[j]=a;
      if (j >= k) ir=j-1;
      if (j <= k) l=i;
    }
  }
}

int main()
{
  unsigned long i, j, k, l, m;
  unsigned long ntest = 1e2;
  unsigned long N[5] = {1e3, 1e4, 1e5, 1e6, 1e7};
  clock_t start, diff;

  int seed = 215342512; //time(NULL);
  srand(seed);

  point_t **ap, *a;

  ap = (point_t**) malloc(N[4] * sizeof(point_t*));

  if (ap == NULL) printf("Error in ap\n");

  a = (point_t*) malloc(N[4] * sizeof(point_t));

  if (a == NULL) printf("Error in a\n");

  for (i=0; i<N[4]; i++)
  {
    ap[i] = a+i;
  }

  for (i=0; i<5; i++)
  {
    start = clock();

    for (j=0; j<ntest; j++)
    {
      for (k=0; k<N[i]; k++)
      {
        ap[k]->x = (double) rand() / (double) RAND_MAX;
      }

      quickselect(N[i] / 2, N[i], ap);
    }

    diff = clock() - start;

    printf("%lu %.5f\n", N[i], (double) diff / CLOCKS_PER_SEC);
  }
}

给出：

1000 0.00224
10000 0.02587
100000 0.37574
1000000 7.18962
10000000 96.34863

两个版本都使用gcc -O2编译（但-O0给出相同的缩放比例）。

缩放的变化来自何处以及如何修复？

请注意，虽然我可以更改结构，但我不能只调整y的中位数，因为我还需要知道与中间点对应的其他参数。另外，我需要对结果数组进行快速选择行为（例如，对于a.y <= m.y左侧的所有a m以及b.y > m.y b权利m }）。

我需要计算中位数，并希望使用O（N）quickselect算法。然而事实证明，当数组不再是平面数组的双精度数，而是结构数组（其中一个元素是用于中值计算的元素）时，运行时间不再与O（N）成比例。

我使用以下实现：

#define SWAP(a,b) temp=(a); (a)=(b); (b)=temp;

typedef struct point_t point_t;

struct point_t {
  double y;

//  unsigned long something;
//
//  double *something_else;
//
//  double yet_another thing;
//
//  point_t* again_something;
};

void median(point_t *arr, unsigned long n)
{
  unsigned long k = n / 2;
  unsigned long i, ir, j, l, mid;
  point_t a, temp;

  l=0;
  ir=n-1;
  for (;;)
  {
    if (ir <= l+1)
    {
      if (ir == l+1 && arr[ir].y < arr[l].y)
      {
        SWAP(arr[l], arr[ir])
      }
      return arr + k;
    }
    else
    {
      mid = (l + ir) >> 1;
      SWAP(arr[mid], arr[l+1])
      if (arr[l].y > arr[ir].y)
      {
        SWAP(arr[l], arr[ir])
      }
      if (arr[l+1].y > arr[ir].y)
      {
        SWAP(arr[l+1], arr[ir])
      }
      if (arr[l].y > arr[l+1].y)
      {
        SWAP(arr[l], arr[l+1])
      }
      i = l+1;
      j = ir;
      a = arr[l+1];
      for (;;)
      {
        do i++; while (arr[i].y < a.y);
        do j--; while (arr[j].y > a.y);
        if (j < i) break;
        SWAP(arr[i],  arr[j])
      }
      arr[l+1] = arr[j];
      arr[j] = a;
      if (j >= k) ir = j-1;
      if (j <= k) l = i;
    }
  }
}

使用-O2结构被优化掉（我认为，至少缩放看起来与普通数组相同）并且缩放是线性的。但是，当取消注释结构的其他组件时，缩放不再是线性的。怎么会这样？怎么能解决这个问题？

请注意，虽然我可以更改结构，但我不能只调整y的中位数，因为我还需要知道与中间点对应的其他参数。另外，我需要对结果数组进行快速选择行为（例如，对于a.y <= m.y左侧的所有a m以及b.y > m.y b权利m }）。

Answer 1

我认为内存缓存mishits解释了执行时间的非线性增长。在我的x86_64架构PC（Linux + gcc）中，sizeof(double)为8，sizeof(point_t)为32，因此较少的元素适合高速缓存。但是，非线性增长的更大原因是通过代码中的指针数组对point_t结构的内存访问将很快高度随机化，并且由于这个原因而发生越来越多的缓存未命中...

我更改了代码如下：

--- test2.c
+++ test3.c
         -80,14 +80,12
     if (a == NULL)
         printf("Error in an");

-    for (i = 0; i < N[4]; i++) {
-        ap[i] = a + i;
-    }
- 
     for (i = 0; i < 5; i++) {
         start = clock();

         for (j = 0; j < ntest; j++) {
            for (k = 0; k < N[i]; k++) {
+                ap[k] = a + k;
                 ap[k]->x = (double) rand() / (double) RAND_MAX;
             }

并且执行时间增长更加线性。

带有quicselect()数组的原始double：

    1000 0.00000
   10000 0.04000
  100000 0.22000
 1000000 1.98000
10000000 20.73000

带有quickselect()数组的原始point_t *：

    1000 0.01000
   10000 0.02000
  100000 0.71000
 1000000 8.64000
10000000 157.77000

与上面的quickselect()数组完全相同的point_t *，但是在通过应用上述补丁调用quickselect()之前确保数组中的指针是连续的顺序：

    1000 0.00000
   10000 0.02000
  100000 0.40000
 1000000 4.71000
10000000 49.80000

请注意，即使修改后的版本在时序循环中进行了额外的排序，它仍然会更快。

我正在运行3.2GHz Pentium（R）双核CPU E6700,64位Linux，gcc 4.6，优化-O2。（我的机器没有闲置，所以我的基准测试数据有一些波动 - 如果我正在制定更严格的基准来计算内核未安排基准测试的时间，我会考虑使用clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...)来提高Linux系统的准确性要运行的过程。）

更新：例如，valgrind（如果您的平台支持）可用于分析缓存命中的影响。我修改了程序以获取两个参数，数组大小（对应于数组N[]的元素）和测试计数（对应于ntest）。没有valgrind的执行时间，其中test2基本上是问题中列出的未经修改的程序，test4是修改版本，在调用ap[]之前重新排列quickselect()数组功能：

bash$ ./test2 10000000 100
Array of 10000000 elements, 100 times: 154.40000 seconds
bash$ ./test4 10000000 100
Array of 10000000 elements, 100 times: 48.45000 seconds

以下是使用cachegrind工具运行valgrind的结果：

bash$ valgrind --tool=cachegrind ./test2 10000000 100
==23563== Cachegrind, a cache and branch-prediction profiler
==23563== Copyright (C) 2002-2010, and GNU GPL'd, by Nicholas Nethercote et al.
==23563== Using Valgrind-3.6.1-Debian and LibVEX; rerun with -h for copyright info
==23563== Command: ./test2 10000000 100
==23563== 
Array of 10000000 elements, 100 times: 1190.24000 seconds
==23563== 
==23563== I   refs:      80,075,384,594
==23563== I1  misses:             1,091
==23563== LLi misses:             1,087
==23563== I1  miss rate:           0.00%
==23563== LLi miss rate:           0.00%
==23563== 
==23563== D   refs:      36,670,292,139  (25,550,518,762 rd   + 11,119,773,377 wr)
==23563== D1  misses:     4,218,722,190  ( 3,223,975,942 rd   +    994,746,248 wr)
==23563== LLd misses:     4,190,889,241  ( 3,198,934,125 rd   +    991,955,116 wr)
==23563== D1  miss rate:           11.5% (          12.6%     +            8.9%  )
==23563== LLd miss rate:           11.4% (          12.5%     +            8.9%  )
==23563== 
==23563== LL refs:        4,218,723,281  ( 3,223,977,033 rd   +    994,746,248 wr)
==23563== LL misses:      4,190,890,328  ( 3,198,935,212 rd   +    991,955,116 wr)
==23563== LL miss rate:             3.5% (           3.0%     +            8.9%  )

和

bash$ valgrind --tool=cachegrind ./test4 10000000 100
==24436== Cachegrind, a cache and branch-prediction profiler
==24436== Copyright (C) 2002-2010, and GNU GPL'd, by Nicholas Nethercote et al.
==24436== Using Valgrind-3.6.1-Debian and LibVEX; rerun with -h for copyright info
==24436== Command: ./test4 10000000 100
==24436== 
Array of 10000000 elements, 100 times: 862.89000 seconds
==24436== 
==24436== I   refs:      82,985,384,485
==24436== I1  misses:             1,093
==24436== LLi misses:             1,089
==24436== I1  miss rate:           0.00%
==24436== LLi miss rate:           0.00%
==24436== 
==24436== D   refs:      36,640,292,192  (24,530,518,829 rd   + 12,109,773,363 wr)
==24436== D1  misses:     2,814,232,350  ( 2,189,229,679 rd   +    625,002,671 wr)
==24436== LLd misses:     2,796,287,872  ( 2,171,294,250 rd   +    624,993,622 wr)
==24436== D1  miss rate:            7.6% (           8.9%     +            5.1%  )
==24436== LLd miss rate:            7.6% (           8.8%     +            5.1%  )
==24436== 
==24436== LL refs:        2,814,233,443  ( 2,189,230,772 rd   +    625,002,671 wr)
==24436== LL misses:      2,796,288,961  ( 2,171,295,339 rd   +    624,993,622 wr)
==24436== LL miss rate:             2.3% (           2.0%     +            5.1%  )

有关如何阅读这些统计信息，请参阅Valgrind manual。关键点在于：“在现代机器上，L1未命中通常需要大约10个周期，LL未命中可能需要多达200个周期”。计算两种情况之间的LLd（最后一级数据缓存）misshit成本差异（每次3.2s9 CPU的每3.2e9周期/秒的假设200个周期的每个差异）给出

bash$ echo $(( (4190889241 - 2796287872) * 200 / 3200000000 )) seconds
87 seconds

D1失误在这里贡献很少（如果D1失误成本与LLd失误成本无关，则总共91秒）;由于我们所有的不准确（最明显的是这台计算机中LLd misshit的实际成本），D1失误可能会被忽略。

test2和test4的运行时差异大约为 106秒，相当接近上述86秒。这一切都可以更加准确，但这似乎已经证明了测试安排中缓存未命中的影响。

P.S。 valgrind写一个日志文件，我可以在其中验证它是否正确检测到L2和L1缓存大小和类型。

Answer 2

看起来你正在排序和交换实际结构，这意味着你正在移动更多的东西。您应该尝试使用指针数组对结构进行排序，然后交换它们。

所以你的中位函数会有一个原型：

void median(point_t **arr, unsigned long n);

然后你必须分配并释放结构并在调用之前将它们的指针放在数组中，而不是直接用结构填充数组的元素。

使用结构数组进行快速选择具有非线性运行时间

2 个答案: