Question

我们正在尝试优化Java中的大量内存操作，并遇到了一些异常情况。根据我们的数据，我们提出了以下假设：一个数组/内存块可能由于被大量访问而被加载到CPU缓存中，但是在多次克隆该数组之后，缓存会被填满，初始数组将被逐出缓存、移回RAM。

为了测试这一点，我们建立了一个基准。它执行以下操作：

创建具有给定大小的数组
将一些数据写入字段
读取/迭代一百万次（以将其推送到CPU缓存中）
将其克隆到一个新数组中
将该新数组再克隆到另一个新数组中，并把得到的数组作为下一次克隆的源，如此重复给定的次数

此外，在每个步骤之后，将数组迭代3次，并测量每次迭代所需的时间。这是代码：

/**
 * Sums every element of {@code array}, {@code count} times over, optionally
 * recording how long each pass took.
 *
 * <p>The accumulated sum is printed to standard output once all passes are done.
 *
 * @param array    the bytes to read on every pass
 * @param count    how many full passes to make over {@code array}
 * @param logTimes whether to record the duration of each pass
 * @return the per-pass durations in nanoseconds (one entry per pass), or
 *         {@code null} when {@code logTimes} is {@code false}
 */
private static long[] read(byte[] array, int count, boolean logTimes) {
    final long[] elapsedPerPass = logTimes ? new long[count] : null;
    int checksum = 0;
    int pass = 0;
    while (pass < count) {
        // The start timestamp is taken on every pass, whether or not it is logged.
        final long begin = System.nanoTime();
        for (byte value : array) {
            checksum += value;
        }
        if (elapsedPerPass != null) {
            elapsedPerPass[pass] = System.nanoTime() - begin;
        }
        pass++;
    }
    System.out.println(checksum);
    return elapsedPerPass;
}

/**
 * Benchmark driver. Times three read passes over {@code array} at each stage:
 * right after allocation, after filling it, after 1,000,000 untimed read passes,
 * after the first clone, and after all clones have been made.
 *
 * @param args {@code args[0]} is parsed as the array size and {@code args[1]}
 *             as the number of clones; neither value is validated here
 */
public static void main(String[] args) {
    int arraySize = Integer.parseInt(args[0]);
    int clones = Integer.parseInt(args[1]);
    byte[] array = new byte[arraySize];
    // Baseline: the array has only been allocated, so every element is still zero.
    long[] initialReadTimes = read(array, 3, true);
    // Fill with some non-zero content
    for (int i = 0; i < array.length; i++) {
        array[i] = (byte) i;
    }
    long[] afterWriteTimes = read(array, 3, true);

    // Make this array important, so it lands in CPU Cache
    read(array, 1_000_000, false);
    long[] afterReadTimes = read(array, 3, true);

    // Stays null when clones < 1, because the loop below never reaches i == 1.
    long[] afterFirstCloneReadTimes = null;
    byte[] copy = new byte[array.length];
    System.arraycopy(array, 0, copy, 0, array.length);
    for (int i = 1; i <= clones; i++) {
        // Clone the previous clone into a freshly allocated array and keep only the newest one.
        byte[] copy2 = new byte[copy.length];
        System.arraycopy(copy, 0, copy2, 0, copy.length);
        copy = copy2;
        if (i == 1) {
            // The timed passes always read the original array, never a clone.
            afterFirstCloneReadTimes = read(array, 3, true);
        }
    }

    long[] afterAllClonesReadTimes = read(array, 3, true);

    // Write to CSV
    // NOTE(review): the CSV-writing code is elided in this listing; presumably it consumes the five timing arrays above.
    ...
    System.out.println("Finished.");
}

我们在具有16 GB RAM的第二代i5上以arraysize = 10,000和克隆= 10,000,000运行了该基准测试：

不过结果波动很大：第二轮和第三轮的时间有时并不相同，最后一组读取基准的第二轮和第三轮有时还会出现峰值。

这些结果令人困惑。我认为这可能表明：数组初始化之后并不会立即被加载到CPU缓存中，因为最初的读取时间相对较长；写入之后似乎也没有任何变化；只有在进行大量迭代之后，访问时间才变得更快，而每组中的第一轮总是更慢（是因为两次读取之间运行的测量代码带来的开销吗？）。同样，克隆数组/用新数组填充内存似乎也完全没有影响。谁能解释这些结果？

我们认为其中某些原因可能来自于Java特定的内存管理，因此我们尝试在C ++中重新实现基准：

/// Sums every byte of `array` `count` times and, for short runs, times each pass.
///
/// @param array    Buffer to read; must hold at least `length` readable bytes.
/// @param length   Number of bytes summed on each pass.
/// @param count    Number of passes. Runs with `count <= 3` are measurement
///                 runs: each pass is timed, printed to stdout as "<n> ns" and
///                 appended to `logTimes`. Longer runs are untimed warm-up runs
///                 and leave `logTimes` untouched.
/// @param logTimes Receives one duration in nanoseconds per timed pass.
void read(unsigned char array[], int length, int count, std::vector<long int> & logTimes) {
    // steady_clock is monotonic, so t2 - t1 can never be negative;
    // high_resolution_clock carries no such guarantee.
    using Clock = std::chrono::steady_clock;
    const bool timed = count <= 3;

    // Storing the sum in a volatile object makes it observable, so the compiler
    // may not discard the summation loop as dead code in optimised builds.
    volatile unsigned int sink = 0;

    for (int c = 0; c < count; c++) {
        // Unsigned accumulator: wrap-around is well defined, whereas signed
        // overflow would be undefined behaviour for large inputs.
        unsigned int sum = 0;
        Clock::time_point t1;
        if (timed) {
            t1 = Clock::now();
        }
        for (int i = 0; i < length; i++) {
            sum += array[i];
        }
        // Publish the result before the end timestamp is taken so the work
        // stays inside the measured interval.
        sink = sum;
        if (timed) {
            const Clock::time_point t2 = Clock::now();
            const long int duration = static_cast<long int>(
                std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count());
            std::cout << duration << " ns\n";
            logTimes.push_back(duration);
        }
    }
}

/// Benchmark driver. Times three read passes over `array` at each stage:
/// right after allocation, after filling it, after 1,000,000 untimed read
/// passes, after the first clone, and after all clones have been made.
/// The collected timings are handed to writeTimesToCSV.
///
/// @param argc Unused.
/// @param args Unused.
int main(int argc, char ** args)
{
    (void)argc;
    (void)args;

    const int ARRAYSIZE = 10000;
    const int CLONES = 10000000;
    std::vector<long int> initialTimes, afterWritingTimes, afterReadTimes, afterFirstCloneTimes, afterCloneTimes, discarded;

    // A vector replaces the runtime-sized stack array, which is not standard
    // C++. It is also zero-initialised, so the first timed read no longer sums
    // uninitialised bytes.
    std::vector<unsigned char> array(ARRAYSIZE);
    read(array.data(), ARRAYSIZE, 3, initialTimes);
    for (int i = 0; i < ARRAYSIZE; i++) {
        // Explicit narrowing: the stored value is i modulo 256.
        array[i] = static_cast<unsigned char>(i);
    }
    std::cout << "Reads after writing:\n";
    read(array.data(), ARRAYSIZE, 3, afterWritingTimes);

    // Untimed warm-up run (count > 3), so nothing is appended to `discarded`.
    read(array.data(), ARRAYSIZE, 1000000, discarded);
    std::cout << "Reads after 1M Reads:\n";
    read(array.data(), ARRAYSIZE, 3, afterReadTimes);

    std::vector<unsigned char> copy(array.size());
    std::memcpy(copy.data(), array.data(), array.size());
    for (int i = 0; i < CLONES; i++) {
        // Each clone gets its own buffer that outlives the iteration. Keeping a
        // pointer to a loop-scoped array would dangle on the next iteration and
        // make the following memcpy read a dead object.
        // NOTE(review): the allocator may hand back a recently freed block, and
        // the clones are never read, so an optimising build could drop them.
        // Confirm the copies really happen before drawing cache conclusions.
        std::vector<unsigned char> next(copy.size());
        std::memcpy(next.data(), copy.data(), copy.size());
        copy.swap(next);
        if (i == 0) {
            // The timed passes always read the original array, never a clone.
            read(array.data(), ARRAYSIZE, 3, afterFirstCloneTimes);
        }
    }
    std::cout << "Reads after cloning:\n";
    read(array.data(), ARRAYSIZE, 3, afterCloneTimes);

    writeTimesToCSV(initialTimes, afterWritingTimes, afterReadTimes, afterFirstCloneTimes, afterCloneTimes);
    std::cout << "Finished.\n";
}

使用相同的参数，我们得到以下结果：

因此在C++中，各次时间彼此相近，只是在第二次运行中出现了一些奇怪的峰值。这似乎表明，上面观察到的更快的计时是由Java的优化（或者说最初几次读取时不太理想的处理）引起的。这是否意味着根本不涉及CPU缓存？

CPU缓存/内存访问时间异常

0 个答案: