这是我的测试代码,在 Linux 中使用以下命令编译:
g++ main.cpp -O3 -o stest
我尝试了两种合并数据的方法(test2 和 test3),但这两种方法都没有像我预期的那样带来更好的性能。我认为,合并后的数据应该比独立数组具有更好的性能,因为上一级缓存是按块(缓存行)从下一级缓存加载数据的,所以合并后的数据更有可能在一次内存访问中被全部加载,而独立数组(test1)则需要三次内存访问。然而,测试结果表明 test1 的性能最好。这让我感到很奇怪,我也不知道原因。如果您知道,请告诉我。先谢谢了。
#include <iostream>
#include <cstdlib>
#include <unistd.h>
#include <string.h>
#include <sstream>
#include <sys/times.h>
#include <cmath>
using namespace std;
// Process-time snapshots filled by times() at the beginning and end of the
// timed region of each test function (test1, test2 and test3 all share them).
tms start, tEnd;
// Benchmark with three independent int arrays of n elements each.
//
// Timed work (three separate passes over the data):
//   1. store the index into the first two arrays,
//   2. store their element-wise sum into the third array,
//   3. accumulate the third array into a 64-bit total.
// Allocation and deallocation happen outside the timed region.
//
// Prints "test1: <elapsed>ms result=<total> " and returns the total.
// NOTE: the elapsed value is (user + system ticks) / sysconf(_SC_CLK_TCK),
// i.e. seconds, even though the printed label says "ms".
long long test1(int n) {
    int *lhs = new int[n];
    int *rhs = new int[n];
    int *out = new int[n];

    times(&start);

    for (int k = 0; k < n; ++k) {
        rhs[k] = k;
        lhs[k] = k;
    }

    for (int k = 0; k < n; ++k) {
        out[k] = lhs[k] + rhs[k];
    }

    long long total = 0;
    for (int k = 0; k < n; ++k) {
        total += out[k];
    }

    times(&tEnd);

    // CPU time consumed between the two snapshots, in clock ticks.
    const clock_t user_ticks = tEnd.tms_utime - start.tms_utime;
    const clock_t sys_ticks = tEnd.tms_stime - start.tms_stime;
    const double elapsed = static_cast<double>(user_ticks + sys_ticks) / sysconf(_SC_CLK_TCK);

    std::cout << "test1: " << elapsed << "ms result=" << total << " " << std::endl;

    delete[] lhs;
    delete[] rhs;
    delete[] out;
    return total;
}
// One record of the array-of-structures layout: three ints stored
// contiguously, in the order a, b, c.
struct D {
    int a;
    int b;
    int c;
};
long long test2(int n) {
struct D *d = new D[n];
times(&start);
for (int i = 0; i < n; i++) {
struct D &di = d[i];
di.a = di.b = i;
}
for (int i = 0; i < n; i++) {
struct D &di = d[i];
di.c = di.a + di.b;
}
long long sum = 0;
for (int i = 0; i < n; i++) {
sum += d[i].c;
}
times(&tEnd);
double elap_time = double(tEnd.tms_utime - start.tms_utime + tEnd.tms_stime - start.tms_stime) / sysconf(_SC_CLK_TCK);
cout << "test2: " << elap_time << "ms result=" << sum << " " << endl;
delete [] d;
return sum;
}
// Benchmark with one flat int array of 3 * n elements, laid out as
// consecutive (a, b, c) triples: element i occupies slots 3*i, 3*i+1, 3*i+2.
//
// Timed work (three separate passes over the data):
//   1. store the index into slots 3*i and 3*i+1,
//   2. store their sum into slot 3*i+2,
//   3. accumulate slot 3*i+2 into a 64-bit total.
// Allocation and deallocation happen outside the timed region.
//
// Prints "test3: <elapsed>ms result=<total> " and returns the total.
// NOTE: the elapsed value is (user + system ticks) / sysconf(_SC_CLK_TCK),
// i.e. seconds, even though the printed label says "ms".
long long test3(int n) {
    // All offsets are computed in size_t: in plain int, 3 * n (and 3 * i)
    // overflows as soon as n exceeds INT_MAX / 3, which is undefined behaviour
    // for sizes the other two tests still handle.
    const std::size_t slots = 3 * static_cast<std::size_t>(n);
    int *abc = new int[slots];

    times(&start);

    for (int i = 0; i < n; i++) {
        const std::size_t base = 3 * static_cast<std::size_t>(i);
        abc[base] = abc[base + 1] = i;
    }

    for (int i = 0; i < n; i++) {
        const std::size_t base = 3 * static_cast<std::size_t>(i);
        abc[base + 2] = abc[base] + abc[base + 1];
    }

    long long sum = 0;
    for (int i = 0; i < n; i++) {
        sum += abc[3 * static_cast<std::size_t>(i) + 2];
    }

    times(&tEnd);

    double elap_time = double(tEnd.tms_utime - start.tms_utime + tEnd.tms_stime - start.tms_stime) / sysconf(_SC_CLK_TCK);
    cout << "test3: " << elap_time << "ms result=" << sum << " " << endl;

    delete [] abc;
    return sum;
}
int main(int argc, char *argv[]) {
int n = 9999999;
sscanf(argv[1], "%d", &n);
test1(n);
test2(n);
test3(n);
cout<<"after changing order"<<endl;
test2(n);
test3(n);
test1(n);
cout<<"after changing order"<<endl;
test3(n);
test1(n);
test2(n);
return 0;
}
我在一台配有四核 i5-4460 CPU 和 8GB 内存的计算机上测试了 stest。下面是我用来测试程序的命令,并且我确定使用参数 399999999 时,计算机不会耗尽内存:
q@q-lab:~/Desktop$ ./stest 399999999
test1: 1.61ms result=159999998800000002
test2: 2.38ms result=159999998800000002
test3: 2.37ms result=159999998800000002
after changing order
test2: 2.38ms result=159999998800000002
test3: 2.38ms result=159999998800000002
test1: 1.61ms result=159999998800000002
after changing order
test3: 2.38ms result=159999998800000002
test1: 1.61ms result=159999998800000002
test2: 2.39ms result=159999998800000002
答案 0 :(得分:-1)
合并数据后,寻址或计算数据偏移量需要更多的时间。而且,CPU 缓存的行为不可预测,也难以优化。最好不要尝试针对 CPU 缓存进行优化。