Question

我在堆排序例程中发现了一个奇怪的行为（见下文）。

void hpsort(unsigned long n, double *data)
{
  unsigned long i, ir, j, l;
  double rra;

  if (n < 2) return;
  l = (n - 2) / 2 + 1;
  ir = n - 1;

  for (;;)
    {
      if (l > 0) rra = data[--l];
      else
        {
          rra = data[ir];
          data[ir] = data[0];
          if (--ir == 0) { data[0] = rra; break; }
        }

      i = l;
      j = l + l + 1;
      while (j <= ir)
        {
          if (j < ir && data[j] < data[j+1]) ++j;
          if (rra < data[j])
            {
              data[i] = data[j];
              i = j;
              j += j + 1;
            }
          else break;
        }
      data[i] = rra;
    }

  return;
}

如果我像这样做一个基准调用这个例程

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array);

需要X秒。但如果我只添加一行

void hpsort(unsigned int n, double *data)
{
   ++data;

并做基准测试

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array-1);

大约需要0.96X秒（即快4％）。从运行到运行，这种性能差异是稳定的。

感觉g++编译器确实在第一种情况下检查边界，而在第二种情况下我可以以某种方式欺骗它。但是我从来没有听说过C数组的边界检查......

为什么我会在性能上遇到这种奇怪的差异？

P.S。使用g++ -O2完成编译。顺便说一句，将unsigned long更改为long int也会使效果降低约3％至4％。

p.p.s。＆＃34;定义的行为＆＃34;版本还显示了性能改进

void hpsort(unsigned int n, double *data)
{
   --data;

和基准

double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array+1);

p.p.p.s。绩效比较

Size of array   Faster  Slower
        10      1.46    1.60
       100      1.41    1.62
      1000      1.84    1.96
     10000      1.78    1.87
    100000      1.72    1.80
   1000000      1.76    1.83
  10000000      1.98    2.03

这是我的hpsort.cpp

代码

 void hpsort1(unsigned long n, double *data)
 {
   unsigned long i, ir, j, l;
   double rra;

   if (n < 2) return;
   l = (n - 2) / 2 + 1;
   ir = n - 1;

   for (;;)
     {
       if (l > 0) rra = data[--l];
       else
         {
           rra = data[ir];
           data[ir] = data[0];
           if (--ir == 0)
             {
               data[0] = rra;
               break;
             }
         }

       i = l;
       j = l + l + 1;
       while (j <= ir)
         {
           if (j < ir && data[j] < data[j+1]) ++j;
           if (rra < data[j])
             {
               data[i] = data[j];
               i = j;
               j += j + 1;
             }
           else break;
         }
       data[i] = rra;
     }
   return;
 }

 void hpsort2(unsigned long n, double *data)
 {
   unsigned long i, ir, j, l;
   double rra;

   --data;

   if (n < 2) return;
   l = (n - 2) / 2 + 1;
   ir = n - 1;

   for (;;)
     {
       if (l > 0) rra = data[--l];
       else
         {
           rra = data[ir];
           data[ir] = data[0];
           if (--ir == 0)
             {
               data[0] = rra;
               break;
             }
         }

       i = l;
       j = l + l + 1;
       while (j <= ir)
         {
           if (j < ir && data[j] < data[j+1]) ++j;
           if (rra < data[j])
             {
               data[i] = data[j];
               i = j;
               j += j + 1;
             }
           else break;
         }
       data[i] = rra;
     }
   return;
 }

这是我的基准代码heapsort-benchmark.cpp

 #include <vector>
 #include <alloca.h>
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <time.h>
 #include <math.h>

 using namespace std;

 void hpsort1(unsigned long n, double *data);
 void hpsort2(unsigned long n, double *data);

 typedef double element_t;
 typedef void(*Test)(element_t*, element_t*, int);

 const int sizes [] = {10, 100, 1000, 10000, 100000, 1000000, 10000000};
 const int largest_size = sizes[sizeof(sizes)/sizeof(int)-1];

 vector<double> result_times; // results are pushed into this vector

 clock_t start_time;

 void do_row(int size) // print results for given size of processed array
 {
   printf("%10d  \t", size);
   for (int i=0; i<result_times.size(); ++i) printf("%.2f\t", result_times[i]);
   printf("\n");
   result_times.clear();
 }

 inline void start_timer() { start_time = clock(); }

 inline double timer()
 {
   clock_t end_time = clock();
   return (end_time - start_time)/double(CLOCKS_PER_SEC);
 }

 void run(Test f, element_t* first, element_t* last, int number_of_times)
 {
   start_timer();
   while (number_of_times-- > 0) f(first,last,number_of_times);
   result_times.push_back(timer());
 }

 void random_shuffle(double *first, double *last)
 {
   size_t i, j, n;
   double tmp;
   n = last-first;

   srand((unsigned int)0);

   for (i=n-1; i>0; --i)
     {
       j = rand() % (i+1);
       tmp = first[i];
       first[i] = first[j];
       first[j] = tmp;
     }
   return;
 }

 void hpsort1_test(element_t* first, element_t* last, int number_of_times)
 {
   size_t num_elements = (last-first);
   element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
   memcpy(array, first, sizeof(element_t)*num_elements);
   hpsort1(num_elements, array);
   free(array);
 }

 void hpsort2_test(element_t* first, element_t* last, int number_of_times)
 {
   size_t num_elements = (last-first);
   element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
   memcpy(array, first, sizeof(element_t)*num_elements);
   hpsort2(num_elements, array+1);
   free(array);
 }

 void initialize(element_t* first, element_t* last)
 {
   element_t x = 0.;
   while (first != last) { *first++ = x; x += 1.; }
 }

 double logtwo(double x) { return log(x)/log((double) 2.0); }

 int number_of_tests(int size)
 {
   double n = (double)size;
   double largest_n = (double)largest_size;
   return int(floor((largest_n * logtwo(largest_n)) / (n * logtwo(n))));
 }

 void run_tests(int size)
 {
   const int n = number_of_tests(size);

   element_t *buffer = (element_t *)malloc(size * sizeof(element_t));
   element_t* buffer_end = &buffer[size];

   initialize(buffer, buffer + size); // fill in the elements

   for (int i = 0; i < size/2; ++i) buffer[size/2 + i] = buffer[i]; // fill in the second half with values of the first half
   //random_shuffle(buffer, buffer_end); // shuffle if you do not want an ordered array

   run(hpsort2_test, buffer, buffer_end, n);
   run(hpsort1_test, buffer, buffer_end, n);

   do_row(size);

   free(buffer);
 }


 int main()
 {
   const int n = sizeof(sizes)/sizeof(int);
   for (int i = 0; i < n; ++i) run_tests(sizes[i]);
 }

我编译并以

运行它

g++ -O2 -c heapsort-benchmark.cpp
g++ -O2 -c hpsort.cpp
g++ -O2 -o heapsort-benchmark heapsort-benchmark.o hpsort.o 
./heapsort-benchmark

第一列将是更快的版本

Answer 1

无法像OP一样获得一致的结果。

IMO OP的微小差异不是代码差异的一部分，而是测试的工件。

void hpsort(unsigned long n, double *data) {
  unsigned long i, ir, j, l;
  double rra;
  ...
}

void hpsort1(unsigned long n, double *data) {
  --data;
  unsigned long i, ir, j, l;
  double rra;
  ...
}

测试代码

#include <time.h>
#include <stdlib.h>

void test(const char *s, int code, size_t n) {
  srand(0);
  double* array = (double*) malloc(sizeof(double) * n * 2);
  // make 2 copies of same random data
  for (size_t i = 0; i < n; i++) {
    array[i] = rand();
    array[i+n] = array[i];
  }

  double dt0;
  double dt1;
  clock_t c0 = clock();
  clock_t c1,c2;
  if (code) {
    hpsort1(n, array + 1);
    c1 = clock();
    hpsort(n, &array[n]);
    c2 = clock();
    dt0 = (double) (c2 - c1)/CLOCKS_PER_SEC;
    dt1 = (double) (c1 - c0)/CLOCKS_PER_SEC;
  } else {
    hpsort(n, array);
    c1 = clock();
    hpsort1(n, &array[n]+1);
    c2 = clock();
    dt0 = (double) (c1 - c0)/CLOCKS_PER_SEC;
    dt1 = (double) (c2 - c1)/CLOCKS_PER_SEC;
  }
  free(array);
  const char *cmp = dt0==dt1 ? "==" : (dt0<dt1 ? "<" : ">");
  printf("%s %f %2s %f  Diff:% f%%\n", s, dt0, cmp,  dt1, 100*(dt1-dt0)/dt0);
}

int main() {
  //srand((unsigned) time(0));
  size_t n = 3000000;
  for (int i=0; i<10; i++) {
    test("heap  first", 0, n);
    test("heap1 first", 1, n);
    fflush(stdout);
  }
}

输出

heap  first 1.263000  > 1.201000  Diff:-4.908947%
heap1 first 1.295000  < 1.326000  Diff: 2.393822%
heap  first 1.342000  > 1.295000  Diff:-3.502235%
heap1 first 1.279000  < 1.295000  Diff: 1.250977%
heap  first 1.279000 == 1.279000  Diff: 0.000000%
heap1 first 1.280000  > 1.279000  Diff:-0.078125%
heap  first 1.295000  > 1.294000  Diff:-0.077220%
heap1 first 1.280000  > 1.279000  Diff:-0.078125%
heap  first 1.279000 == 1.279000  Diff: 0.000000%
heap1 first 1.295000  > 1.279000  Diff:-1.235521%
heap  first 1.263000  < 1.295000  Diff: 2.533650%
heap1 first 1.280000  > 1.279000  Diff:-0.078125%
heap  first 1.295000  > 1.263000  Diff:-2.471042%
heap1 first 1.295000  < 1.310000  Diff: 1.158301%
heap  first 1.310000  < 1.326000  Diff: 1.221374%
heap1 first 1.326000  < 1.342000  Diff: 1.206637%
heap  first 1.279000 == 1.279000  Diff: 0.000000%
heap1 first 1.264000  < 1.295000  Diff: 2.452532%
heap  first 1.279000  > 1.264000  Diff:-1.172791%
heap1 first 1.279000  > 1.264000  Diff:-1.172791%

带有数组指针的C ++简单技巧提高了性能

1 个答案: