Question

这是我用命令在Linux中编译的测试代码

g++ main.cpp -O3 -o stest

我尝试了两种合并数据的方式（test2 和 test3），但它们的性能都没有像我预期的那样更好。我原本认为，合并后的数据应该比独立数组有更好的性能，因为上层缓存是按块（缓存行）从下层缓存加载数据的，所以合并后的 a、b、c 更有可能通过一次内存访问就全部加载进来，而独立数组（test1）则需要三次内存访问。然而，测试结果表明 test1 的性能最好。这让我觉得很奇怪，而且我不知道原因。如果您知道，请告诉我。预先感谢。

#include <iostream>
#include <cstdlib>
#include <unistd.h>
#include <string.h>
#include <sstream>
#include <sys/times.h>
#include <cmath>
using namespace std;

// File-scope tms records intended to hold the process-time snapshots that
// times() produces at the beginning (start) and end (tEnd) of a timed region.
tms start, tEnd;

/**
 * Benchmark for the "independent arrays" layout: a, b and c are stored in
 * three separate arrays.
 *
 * Fills a[i] = b[i] = i, computes c[i] = a[i] + b[i], then sums every c[i].
 * The three passes are timed together (user + system CPU time reported by
 * times()) and the elapsed time plus the sum are printed to std::cout.
 *
 * @param n number of elements in each of the three arrays
 * @return the sum of all c[i]
 */
long long test1(int n) {
    // Plain new[] leaves the memory untouched before the timed region starts.
    int *a = new int[n];
    int *b = new int[n];
    int *c = new int[n];

    // Function-local snapshots keep the measurement independent of file-level state.
    tms t_begin;
    tms t_end;

    times(&t_begin);
    for (int i = 0; i < n; i++) {
        a[i] = b[i] = i;
    }

    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }

    long long sum = 0;
    for (int i = 0; i < n; i++) {
        sum += c[i];
    }
    times(&t_end);

    // tms fields are clock ticks; dividing by ticks-per-second yields seconds,
    // so the value is labelled "s" rather than "ms".
    const double elap_time = double(t_end.tms_utime - t_begin.tms_utime + t_end.tms_stime - t_begin.tms_stime) / sysconf(_SC_CLK_TCK);
    std::cout << "test1: " << elap_time << "s  result=" << sum << " " << std::endl;

    delete[] a;
    delete[] b;
    delete[] c;
    return sum;
}

// One record of the combined (array-of-structs) layout: the two operands and
// their sum are stored side by side.
struct D {
    int a;  // first operand
    int b;  // second operand
    int c;  // slot for a + b
};

long long test2(int n) {
    struct D *d = new D[n];

    times(&start);
    for (int i = 0; i < n; i++) {
        struct D &di = d[i];
        di.a = di.b = i;
    }

    for (int i = 0; i < n; i++) {
        struct D &di = d[i];
        di.c = di.a + di.b;
    }
    long long sum = 0;
    for (int i = 0; i < n; i++) {
        sum += d[i].c;
    }

    times(&tEnd);
    double elap_time = double(tEnd.tms_utime - start.tms_utime + tEnd.tms_stime - start.tms_stime) / sysconf(_SC_CLK_TCK);
    cout << "test2: " << elap_time << "ms  result=" << sum << " " << endl;
    delete [] d;
    return sum;
}

/**
 * Benchmark for the "interleaved flat array" layout: element i occupies
 * abc[3*i] (a), abc[3*i + 1] (b) and abc[3*i + 2] (c).
 *
 * Fills a = b = i, computes c = a + b, then sums every c. The three passes
 * are timed together (user + system CPU time reported by times()) and the
 * elapsed time plus the sum are printed to std::cout.
 *
 * @param n number of logical elements (the array holds 3 * n ints)
 * @return the sum of all c slots
 */
long long test3(int n) {
    // Widen before multiplying: 3 * n evaluated in int overflows once n
    // exceeds INT_MAX / 3.
    const std::size_t count = static_cast<std::size_t>(n);
    int *abc = new int[3 * count];

    // Function-local snapshots keep the measurement independent of file-level state.
    tms t_begin;
    tms t_end;

    times(&t_begin);
    for (int i = 0; i < n; i++) {
        const std::size_t base = 3 * static_cast<std::size_t>(i);
        abc[base] = abc[base + 1] = i;
    }

    for (int i = 0; i < n; i++) {
        const std::size_t base = 3 * static_cast<std::size_t>(i);
        abc[base + 2] = abc[base] + abc[base + 1];
    }

    long long sum = 0;
    for (int i = 0; i < n; i++) {
        sum += abc[3 * static_cast<std::size_t>(i) + 2];
    }
    times(&t_end);

    // tms fields are clock ticks; dividing by ticks-per-second yields seconds,
    // so the value is labelled "s" rather than "ms".
    const double elap_time = double(t_end.tms_utime - t_begin.tms_utime + t_end.tms_stime - t_begin.tms_stime) / sysconf(_SC_CLK_TCK);
    std::cout << "test3: " << elap_time << "s  result=" << sum << " " << std::endl;

    delete[] abc;
    return sum;
}


/**
 * Runs the three layout benchmarks three times, each round in a different
 * order, so that the position of a test in the sequence can be ruled out as
 * the cause of timing differences.
 *
 * @param argc argument count
 * @param argv optional argv[1]: number of elements per test (decimal)
 * @return 0 on completion
 */
int main(int argc, char *argv[]) {
    int n = 9999999;  // used when no valid element count is supplied

    // Only touch argv[1] when it exists; without an argument it is a null pointer.
    if (argc > 1) {
        char *end = argv[1];
        const long parsed = std::strtol(argv[1], &end, 10);
        const int narrowed = static_cast<int>(parsed);
        const bool has_digits = (end != argv[1]);
        const bool fits_int = (static_cast<long>(narrowed) == parsed);
        if (has_digits && parsed >= 0 && fits_int) {
            n = narrowed;
        } else {
            // Keep the best-effort behaviour: fall back to the default size.
            std::cerr << "ignoring invalid element count, using " << n << std::endl;
        }
    }

    test1(n);
    test2(n);
    test3(n);

    std::cout << "after changing order" << std::endl;

    test2(n);
    test3(n);
    test1(n);

    std::cout << "after changing order" << std::endl;

    test3(n);
    test1(n);
    test2(n);

    return 0;
}

我在一台配备四核 i5-4460 CPU 和 8GB 内存的计算机上测试了 stest。下面是我用来运行程序的命令；我确定在参数为 399999999 时，计算机不会内存不足：

q@q-lab:~/Desktop$ ./stest 399999999
test1: 1.61ms  result=159999998800000002 
test2: 2.38ms  result=159999998800000002 
test3: 2.37ms  result=159999998800000002 
after changing order
test2: 2.38ms  result=159999998800000002 
test3: 2.38ms  result=159999998800000002 
test1: 1.61ms  result=159999998800000002 
after changing order
test3: 2.38ms  result=159999998800000002 
test1: 1.61ms  result=159999998800000002 
test2: 2.39ms  result=159999998800000002

Answer 1

访问合并后的数据时，需要花更多时间进行寻址或计算数据偏移量。此外，CPU 缓存的行为难以预测，也难以针对它进行优化，因此最好不要尝试针对 CPU 缓存做优化。

为什么 C++ 中的组合数据结构比独立数组的性能要低？

1 个答案: