我在堆排序例程中发现了一个奇怪的行为(见下文)。
void hpsort(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;
if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;
for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0) { data[0] = rra; break; }
}
i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}
如果我像这样做一个基准调用这个例程
double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array);
需要X秒。但如果我只添加一行
void hpsort(unsigned int n, double *data)
{
++data;
并做基准测试
double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array-1);
大约需要0.96X秒(即快4%)。从运行到运行,这种性能差异是稳定的。
感觉g++
编译器确实在第一种情况下检查边界,而在第二种情况下我可以以某种方式欺骗它。但是我从来没有听说过C数组的边界检查......
为什么我会在性能上遇到这种奇怪的差异?
P.S。使用g++ -O2
完成编译。顺便说一句,将unsigned long
更改为long int
也会使效果降低约3%至4%。
p.p.s。 &#34;定义的行为&#34;版本还显示了性能改进
void hpsort(unsigned int n, double *data)
{
--data;
和基准
double* array = (double*)malloc(sizeof(double) * N);
... fill in the array ...
hpsort(N, array+1);
p.p.p.s。绩效比较
Size of array Faster Slower
10 1.46 1.60
100 1.41 1.62
1000 1.84 1.96
10000 1.78 1.87
100000 1.72 1.80
1000000 1.76 1.83
10000000 1.98 2.03
这是我的hpsort.cpp
void hpsort1(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;
if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;
for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0)
{
data[0] = rra;
break;
}
}
i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}
void hpsort2(unsigned long n, double *data)
{
unsigned long i, ir, j, l;
double rra;
--data;
if (n < 2) return;
l = (n - 2) / 2 + 1;
ir = n - 1;
for (;;)
{
if (l > 0) rra = data[--l];
else
{
rra = data[ir];
data[ir] = data[0];
if (--ir == 0)
{
data[0] = rra;
break;
}
}
i = l;
j = l + l + 1;
while (j <= ir)
{
if (j < ir && data[j] < data[j+1]) ++j;
if (rra < data[j])
{
data[i] = data[j];
i = j;
j += j + 1;
}
else break;
}
data[i] = rra;
}
return;
}
这是我的基准代码heapsort-benchmark.cpp
#include <vector>
#include <alloca.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
using namespace std;
void hpsort1(unsigned long n, double *data);
void hpsort2(unsigned long n, double *data);
typedef double element_t;
typedef void(*Test)(element_t*, element_t*, int);
const int sizes [] = {10, 100, 1000, 10000, 100000, 1000000, 10000000};
const int largest_size = sizes[sizeof(sizes)/sizeof(int)-1];
vector<double> result_times; // results are pushed into this vector
clock_t start_time;
void do_row(int size) // print results for given size of processed array
{
printf("%10d \t", size);
for (int i=0; i<result_times.size(); ++i) printf("%.2f\t", result_times[i]);
printf("\n");
result_times.clear();
}
inline void start_timer() { start_time = clock(); }
inline double timer()
{
clock_t end_time = clock();
return (end_time - start_time)/double(CLOCKS_PER_SEC);
}
void run(Test f, element_t* first, element_t* last, int number_of_times)
{
start_timer();
while (number_of_times-- > 0) f(first,last,number_of_times);
result_times.push_back(timer());
}
void random_shuffle(double *first, double *last)
{
size_t i, j, n;
double tmp;
n = last-first;
srand((unsigned int)0);
for (i=n-1; i>0; --i)
{
j = rand() % (i+1);
tmp = first[i];
first[i] = first[j];
first[j] = tmp;
}
return;
}
void hpsort1_test(element_t* first, element_t* last, int number_of_times)
{
size_t num_elements = (last-first);
element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
memcpy(array, first, sizeof(element_t)*num_elements);
hpsort1(num_elements, array);
free(array);
}
void hpsort2_test(element_t* first, element_t* last, int number_of_times)
{
size_t num_elements = (last-first);
element_t* array = (element_t*)malloc(sizeof(element_t)*num_elements);
memcpy(array, first, sizeof(element_t)*num_elements);
hpsort2(num_elements, array+1);
free(array);
}
void initialize(element_t* first, element_t* last)
{
element_t x = 0.;
while (first != last) { *first++ = x; x += 1.; }
}
double logtwo(double x) { return log(x)/log((double) 2.0); }
int number_of_tests(int size)
{
double n = (double)size;
double largest_n = (double)largest_size;
return int(floor((largest_n * logtwo(largest_n)) / (n * logtwo(n))));
}
void run_tests(int size)
{
const int n = number_of_tests(size);
element_t *buffer = (element_t *)malloc(size * sizeof(element_t));
element_t* buffer_end = &buffer[size];
initialize(buffer, buffer + size); // fill in the elements
for (int i = 0; i < size/2; ++i) buffer[size/2 + i] = buffer[i]; // fill in the second half with values of the first half
//random_shuffle(buffer, buffer_end); // shuffle if you do not want an ordered array
run(hpsort2_test, buffer, buffer_end, n);
run(hpsort1_test, buffer, buffer_end, n);
do_row(size);
free(buffer);
}
int main()
{
const int n = sizeof(sizes)/sizeof(int);
for (int i = 0; i < n; ++i) run_tests(sizes[i]);
}
我编译并以
运行它g++ -O2 -c heapsort-benchmark.cpp
g++ -O2 -c hpsort.cpp
g++ -O2 -o heapsort-benchmark heapsort-benchmark.o hpsort.o
./heapsort-benchmark
第一列将是更快的版本
答案 0 :(得分:1)
无法像OP一样获得一致的结果。
IMO OP的微小差异不是代码差异的一部分,而是测试的工件。
void hpsort(unsigned long n, double *data) {
unsigned long i, ir, j, l;
double rra;
...
}
void hpsort1(unsigned long n, double *data) {
--data;
unsigned long i, ir, j, l;
double rra;
...
}
测试代码
#include <time.h>
#include <stdlib.h>
void test(const char *s, int code, size_t n) {
srand(0);
double* array = (double*) malloc(sizeof(double) * n * 2);
// make 2 copies of same random data
for (size_t i = 0; i < n; i++) {
array[i] = rand();
array[i+n] = array[i];
}
double dt0;
double dt1;
clock_t c0 = clock();
clock_t c1,c2;
if (code) {
hpsort1(n, array + 1);
c1 = clock();
hpsort(n, &array[n]);
c2 = clock();
dt0 = (double) (c2 - c1)/CLOCKS_PER_SEC;
dt1 = (double) (c1 - c0)/CLOCKS_PER_SEC;
} else {
hpsort(n, array);
c1 = clock();
hpsort1(n, &array[n]+1);
c2 = clock();
dt0 = (double) (c1 - c0)/CLOCKS_PER_SEC;
dt1 = (double) (c2 - c1)/CLOCKS_PER_SEC;
}
free(array);
const char *cmp = dt0==dt1 ? "==" : (dt0<dt1 ? "<" : ">");
printf("%s %f %2s %f Diff:% f%%\n", s, dt0, cmp, dt1, 100*(dt1-dt0)/dt0);
}
int main() {
//srand((unsigned) time(0));
size_t n = 3000000;
for (int i=0; i<10; i++) {
test("heap first", 0, n);
test("heap1 first", 1, n);
fflush(stdout);
}
}
输出
heap first 1.263000 > 1.201000 Diff:-4.908947%
heap1 first 1.295000 < 1.326000 Diff: 2.393822%
heap first 1.342000 > 1.295000 Diff:-3.502235%
heap1 first 1.279000 < 1.295000 Diff: 1.250977%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.295000 > 1.294000 Diff:-0.077220%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.295000 > 1.279000 Diff:-1.235521%
heap first 1.263000 < 1.295000 Diff: 2.533650%
heap1 first 1.280000 > 1.279000 Diff:-0.078125%
heap first 1.295000 > 1.263000 Diff:-2.471042%
heap1 first 1.295000 < 1.310000 Diff: 1.158301%
heap first 1.310000 < 1.326000 Diff: 1.221374%
heap1 first 1.326000 < 1.342000 Diff: 1.206637%
heap first 1.279000 == 1.279000 Diff: 0.000000%
heap1 first 1.264000 < 1.295000 Diff: 2.452532%
heap first 1.279000 > 1.264000 Diff:-1.172791%
heap1 first 1.279000 > 1.264000 Diff:-1.172791%