Question

我的问题是基于另一个SO问题：Why does _mm_stream_ps produce L1/LL cache misses?

在阅读并被它吸引之后，我尝试复制结果并亲眼看看哪个更快：天真循环，展开的幼稚循环，_mm_stream_ps（展开），_mm_store_ps（展开）最后但并非最不重要memset_pattern4。（最后一个采用4字节模式，例如浮点数，并在目标数组上填充它，这应该与所有其他函数相同，但它可能是OS X独占的。）

我确保将数组的开头与高速缓存行对齐（64字节，我检查过）并在参数中传递数组以及上一个问题中提到的任何其他性能调整。 / p>

其他人想在gamedev上知道同样的事情：http://www.gamedev.net/topic/532112-fast-memset/

该线程的结论反映了我自己：当目标阵列小于最大（L3）缓存时，_mm_store_ps比_mm_stream_ps快。当目标数组较大时，_mm_stream_ps更快。我并不完全确定为什么__mm_store_ps在第一种情况下更快，因为我从不在缓存中使用这些值，但我明白为什么_mm_stream_ps在后一种情况下胜出。它是针对这种情况做的：将字节写入你不需要立即（或永远）的内存。

以下是目标数组比L3缓存大256倍（在我的情况下为1.5GB），使用gcc 4.8编译的结果：

gcc-4.8 stream.c -o stream -std=c11 -O3 -g3 -ftree-vectorize -march=native -minline-all-stringops && ./stream

bench L3-MASS, array 1610612736 bytes (402653184 floats, 0 remainder, 0x104803040 pointer)
warm up round...
      6% (  20.81148 ms) : MEMSET CHEAT
      8% (  28.49419 ms) : MEMSET PATTER
    100% ( 371.40385 ms) : NAIVE  NORMAL
     54% ( 202.01147 ms) : NAIVE  UNROLL
     31% ( 113.53433 ms) : STREAM NORMAL
     30% ( 111.41691 ms) : STREAM UNROLL
     51% ( 190.70412 ms) : STORE  NORMAL
     51% ( 189.15338 ms) : STORE  UNROLL
     51% ( 189.36182 ms) : STORE  PREFET

那么我们从中学到了什么？ memset_pattern4快得令人难以置信。我包括了沼泽标准memset，尽管它只使用1字节模式进行比较。实质上，memset作弊，但memset_pattern4没有作弊，而且它仍然是邪恶的。

我已经尝试在程序集中查看我认为是OS X字符串库中memset_pattern4的源代码：

Apple的libc，memset_pattern4：http://www.opensource.apple.com/source/Libc/Libc-825.25/string/memset_pattern.c?txt
这引用了所谓的bcopy函数。让我们挖掘一下：字符串库：http://www.opensource.apple.com/source/Libc/Libc-763.13/x86_64/string/
在我的案例中，最有可能使用SSE 4.2版本： http://www.opensource.apple.com/source/Libc/Libc-763.13/x86_64/string/bcopy_sse42.s

我对asm的了解（到现在为止）足够远，以至于我看到他们在重要的movdqa部分使用LAlignedLoop指令，这基本上是一个SSE的举动整数指令（非浮点数），内在：_mm_store_si128。这不应该在这里，比特和字节，对吗？

memset_pattern4似乎还有一个纯粹的asm实现，它看起来有所不同，因为它没有调用bcopy：http://www.opensource.apple.com/source/Libc/Libc-763.13/x86_64/string/memset.s（ EDIT ：这是正确的，通过在gdb下运行验证）

...该死的，这个似乎使用非时间（_mm_stream_ps存储用于非常长的数组=＆gt; movntdq %xmm0,(%rdi,%rcx)...，查看函数的LVeryLong部分，这正是我的工作！那怎么能更快呢？也许那不是memset_pattern4我正在寻找的。

那么，memset_pattern4在幕后做什么，为什么它比我最好的尝试快5倍？尽管我一直在努力学习足够的x86程序集能够剖析这个功能我担心它现在有点不在我的联盟中来调试优化到死亡功能中的性能问题。

注意：对于那些好奇的人来说，这个微基准测试还可以说明clang及其高级矢量化（-fslp-vectorize）的绝对优秀，它设法使天真循环成为最快的一个保存对于几乎所有情况下的memset。它似乎与_mm_store_ps和_mm_stream_ps的最佳组合一样好。

CODE ：这是我用来执行基准测试的代码（如gist：https://gist.github.com/6571379）：

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

/**
 * compile and run:
 *
 * OSX:
 *    clang stream.c -o stream -std=c11 -O3 -g -ftree-vectorize -fslp-vectorize -march=native -minline-all-stringops && ./stream
 *    gcc-4.8 stream.c -o stream -std=c11 -O3 -g3 -ftree-vectorize -march=native -minline-all-stringops && ./stream
 *
 * linux:
 *    clang stream.c -o stream -lrt -std=c11 -O3 -ftree-vectorize -fslp-vectorize -march=native && ./stream
 *    gcc-4.8 stream.c -o stream -lrt -std=c11 -O3 -ftree-vectorize -march=native && ./stream
 *
 * to generate the assembly:
 *    gcc-4.8 -S stream.c -o stream.s -std=c11 -O3 -g3 -ftree-vectorize -march=native -minline-all-stringops
 *    gobjdump -dS stream > stream.obj.s
 *
 * clang is the (very clear) winner here, the SLP vectorizer is absolutely killer, it even turns the
 * plain naive loop into something hyper-performant
 */

/* posix headers */
#include <sys/time.h>

/* intrinsics */
#include <x86intrin.h>

#define ARRAY_SIZE(x) ((sizeof(x)/sizeof(0[x])) / ((size_t)(!(sizeof(x) % sizeof(0[x])))))


/**
 * some stats from my system
 *
 * sudo sysctl -a | grep cache
 *
 * hw.cachelinesize = 64
 * hw.l1icachesize = 32768
 * hw.l1dcachesize = 32768
 * hw.l2cachesize = 262144
 * hw.l3cachesize = 6291456
 */

/* most processors these days (2013) have a 64 byte cache line */
#define FACTOR          1024
#define CACHE_LINE      64
#define FLOATS_PER_LINE (CACHE_LINE / sizeof(float))
#define L1_CACHE_BYTES  32768
#define L2_CACHE_BYTES  262144
#define L3_CACHE_BYTES  6291456


#ifdef __MACH__
#include <mach/mach_time.h>

double ns_conversion_factor;
double us_conversion_factor;
double ms_conversion_factor;

void timeinit() {
    mach_timebase_info_data_t timebase;
    mach_timebase_info(&timebase);

    ns_conversion_factor = (double)timebase.numer / (double)timebase.denom;
    us_conversion_factor = (double)timebase.numer / (double)timebase.denom / 1000;
    ms_conversion_factor = (double)timebase.numer / (double)timebase.denom / 1000000;
}

double nsticks() {
    return mach_absolute_time() * ns_conversion_factor;
}

double msticks() {
    return mach_absolute_time() * ms_conversion_factor;
}

#else

void timeinit() {
    /* do nothing */
}

double nsticks() {
    timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);

    return ((double)ts.tv_sec) / 1000000000 + ((double)ts.tv_nsec);
}

double msticks() {
    timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);

    return ((double)ts.tv_sec) / 1000 + ((double)ts.tv_nsec) * 1000000;
}

#endif


void *aligned_malloc(size_t size, size_t alignment) {
    void *pa, *ptr;

    pa = malloc((size+alignment-1)+sizeof(void *));
    if (!pa) return NULL;

    ptr=(void*)( ((intptr_t)pa+sizeof(void *)+alignment-1)&~(alignment-1) );
    *((void **)ptr-1)=pa;

    return ptr;
}

void aligned_free(void *ptr) {
    if (ptr) free(*((void **)ptr-1));
}

void pollute_cache(uint8_t volatile *arr, size_t length) {
    for (int i = 0; i < length; ++i) {
        arr[i] = (arr[i] > 0xFE) ? 0xAA : 0x55;
    }
}

void pollute_cache_standalone() {
    const size_t pollute_len = 2 * L3_CACHE_BYTES;
    uint8_t *arr             = aligned_malloc(pollute_len * sizeof(uint8_t), 64);

    for (int i = 0; i < pollute_len; ++i) {
        arr[i] = (arr[i] > 0xFE) ? 0xAA : 0x55;
    }

    aligned_free(arr);
}

/**
 * returns the time passed, in milliseconds
 */
double tim(const char *name, double baseline, void (*pre)(void), void (*func)(float *, size_t), float * restrict arr, size_t length) {
    struct timeval t1, t2;

    if (pre) pre();

    const double ms1 = msticks();
    func(arr, length);
    const double ms2 = msticks();

    const double ms = (ms2 - ms1);

    if (baseline == -2.0) return ms;

    /* first run, equal to baseline (itself) by definition */
    if (baseline == -1.0) baseline = ms;

    if (baseline != 0.0) {
        fprintf(stderr, "%7.0f%% (%10.5f ms) : %s\n", (ms / baseline) * 100, ms, name);
    }
    else {
        fprintf(stderr, "%7.3f ms : %s\n", ms, name);
    }

    return ms;
}

void func0(float * const restrict arr, size_t length) {
    memset(arr, 0x05, length);
}

#ifdef __MACH__

void funcB(float * const restrict arr, size_t length) {
    const float val = 5.0f;
    memset_pattern4(arr, &val,length);
}

#endif

void func1(float * const restrict arr, size_t length) {
    for (int i = 0; i < length; ++i) {
        arr[i] = 5.0f;
    }
}

void func2(float * const restrict arr, size_t length) {
    for(int i = 0; i < length; i += 4) {
        arr[i]   = 5.0f;
        arr[i+1] = 5.0f;
        arr[i+2] = 5.0f;
        arr[i+3] = 5.0f;
    }
}

void func3(float * const restrict arr, size_t length) {
    const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f);

    for (int i = 0; i < length; i += 4) {
        _mm_stream_ps(&arr[i], buf);
    }

    _mm_mfence();
}

void func4(float * const restrict arr, size_t length) {
    const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f);

    for (int i = 0; i < length; i += 16) {
        _mm_stream_ps(&arr[i + 0], buf);
        _mm_stream_ps(&arr[i + 4], buf);
        _mm_stream_ps(&arr[i + 8], buf);
        _mm_stream_ps(&arr[i + 12], buf);
    }

    _mm_mfence();
}

void func5(float * const restrict arr, size_t length) {
    const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f);

    for (int i = 0; i < length; i += 4) {
        _mm_store_ps(&arr[i], buf);
    }
}

void fstore_prefetch(float * const restrict arr, size_t length) {
    const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f);

    for (int i = 0; i < length; i += 16) {
        __builtin_prefetch(&arr[i + FLOATS_PER_LINE * 32], 1, 0);
        _mm_store_ps(&arr[i + 0], buf);
        _mm_store_ps(&arr[i + 4], buf);
        _mm_store_ps(&arr[i + 8], buf);
        _mm_store_ps(&arr[i + 12], buf);
    }
}

void func6(float * const restrict arr, size_t length) {
    const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f);

    for (int i = 0; i < length; i += 16) {
        _mm_store_ps(&arr[i + 0], buf);
        _mm_store_ps(&arr[i + 4], buf);
        _mm_store_ps(&arr[i + 8], buf);
        _mm_store_ps(&arr[i + 12], buf);
    }
}

#ifdef __AVX__

void func7(float * restrict arr, size_t length) {
    const __m256 buf = _mm256_setr_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f);

    for (int i = 0; i < length; i += 8) {
        _mm256_stream_ps(&arr[i], buf);
    }
}

void func8(float * restrict arr, size_t length) {
    const __m256 buf = _mm256_setr_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f);

    for (int i = 0; i < length; i += 32) {
        _mm256_stream_ps(&arr[i + 0], buf);
        _mm256_stream_ps(&arr[i + 8], buf);
        _mm256_stream_ps(&arr[i + 16], buf);
        _mm256_stream_ps(&arr[i + 24], buf);
    }
}

void func9(float * restrict arr, size_t length) {
    const __m256 buf = _mm256_setr_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f);

    for (int i = 0; i < length; i += 8) {
        _mm256_store_ps(&arr[i], buf);
    }
}

void funcA(float * restrict arr, size_t length) {
    const __m256 buf = _mm256_setr_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f);

    for (int i = 0; i < length; i += 32) {
        _mm256_store_ps(&arr[i + 0], buf);
        _mm256_store_ps(&arr[i + 8], buf);
        _mm256_store_ps(&arr[i + 16], buf);
        _mm256_store_ps(&arr[i + 24], buf);
    }
}

#endif

void bench(const char * restrict name, float * restrict arr, size_t length) {
    fprintf(stderr, "bench %s, array %zu bytes (%zu floats, %zu remainder, %p pointer)\n", name, length, length / sizeof(float), length % sizeof(float), arr);

    size_t nfloats = length / sizeof(float);

    fprintf(stderr, "warm up round...");
    func1(arr, nfloats);
    fprintf(stderr, "done\n");

    double baseline = tim("func1: NAIVE ", -2.0, NULL, func1, arr, nfloats);

    tim("MEMSET CHEAT ", baseline, NULL, func0, arr, nfloats);
#ifdef __MACH__
    tim("MEMSET PATTER", baseline, NULL, funcB, arr, nfloats);
#endif
    tim("NAIVE  NORMAL", -1.0, NULL, func1, arr, nfloats);
    tim("NAIVE  UNROLL", baseline, NULL, func2, arr, nfloats);
    tim("STREAM NORMAL", baseline, NULL, func3, arr, nfloats);
    tim("STREAM UNROLL", baseline, NULL, func4, arr, nfloats);
    tim("STORE  NORMAL", baseline, NULL, func5, arr, nfloats);
    tim("STORE  UNROLL", baseline, NULL, func6, arr, nfloats);
    tim("STORE  PREFET", baseline, NULL, fstore_prefetch, arr, nfloats);

    // for (int i = 0; i < 1; ++i) {
    //     tim("func0: MEMSET (cache polluted)", NULL, func0, arr, nfloats);
    //     tim("func1: NAIVE  (cache polluted)", pollute_cache_standalone, func1, arr, nfloats);
    //     tim("func2: UNROLL (cache polluted)", pollute_cache_standalone, func2, arr, nfloats);
    //     tim("func3: STREAM (cache polluted)", pollute_cache_standalone, func3, arr, nfloats);
    //     tim("func4: STRUN  (cache polluted)", pollute_cache_standalone, func4, arr, nfloats);
    //     tim("func5: STORE  (cache polluted)", pollute_cache_standalone, func5, arr, nfloats);
    //     tim("func6: STOUN  (cache polluted)", pollute_cache_standalone, func6, arr, nfloats);
    // }
}

int main() {
    timeinit();

    static const struct {
        const char *name;
        size_t bytes;
    } sizes[] = {
        { "L1-HALF", L1_CACHE_BYTES / 2 },
        { "L1-FULL", L1_CACHE_BYTES },
        { "L2-HALF", L2_CACHE_BYTES / 2 },
        { "L2-FULL", L2_CACHE_BYTES },
        { "L3-HALF", L3_CACHE_BYTES / 2 },
        { "L3-FULL", L3_CACHE_BYTES },
        { "L3-DOUB", L3_CACHE_BYTES * 2 },
        { "L3-HUGE", L3_CACHE_BYTES * 64 },
        { "L3-MASS", L3_CACHE_BYTES * 256 }
    };

    for (int i = 0; i < ARRAY_SIZE(sizes); ++i) {
        size_t bytes = sizes[i].bytes;

        /* align to cache line */
        float *arr = aligned_malloc(bytes, CACHE_LINE);

        bench(sizes[i].name, arr, bytes);

        aligned_free(arr);
    }

    return 0;
}

编辑：我进一步深入挖掘并编辑了gcc生成的程序集，使其与苹果使用的程序集大致相同（memset.s，标记{{1 ，即：紧密循环中的4个展开的LVeryLong指令）。令我惊讶的是，我获得与使用movntdq（_mm_store_ps）的函数相同的性能。这让我感到困惑，正如我原本预料的那样

与movaps一样快（可能已展开memset_pattern4）
与展开movntdq（_mm_stream_ps）

但不，它似乎与movntdq相同，想象一下，也许我做错了什么。在生成的二进制文件上运行objdump确认它正在使用_mm_store_ps，这让我更加惊讶，到底是怎么回事？

因为我在那里遇到了死胡同，所以我决定在调试器中逐步执行可执行文件并在movntdq设置断点。走进这个函数，我注意到它完全按照我的想法行事，一个四个展开的紧密循环memset_pattern4：

movntdq

那么，是什么让Apple＆＃39的酱汁比我的更加神奇，我想......

编辑2 ：我错了两次，Apple的魔法酱并不是那么神奇，我只是传递了一个比我传给我的函数小4倍的数组。感谢@PaulR注意！其次我正在编辑函数的程序集，但是gcc已经内联它了。所以我正在编辑一份从未使用过的副本。

结论：

我发现了其他一些事情：

Clang和gcc非常好，有了正确的内在函数，它们可以优化很多（当启用SLP矢量化时，在没有内在函数的情况下，clang甚至做得很好）。它们还将内联函数指针。
Clang会将一个带有常量的朴素循环替换为0x00007fff92a5f7d2 <+318>: jmp 0x7fff92a5f7e0 <memset_pattern4+332> 0x00007fff92a5f7d4 <+320>: nopw 0x0(%rax,%rax,1) 0x00007fff92a5f7da <+326>: nopw 0x0(%rax,%rax,1) 0x00007fff92a5f7e0 <+332>: movntdq %xmm0,(%rdi,%rcx,1) 0x00007fff92a5f7e5 <+337>: movntdq %xmm0,0x10(%rdi,%rcx,1) 0x00007fff92a5f7eb <+343>: movntdq %xmm0,0x20(%rdi,%rcx,1) 0x00007fff92a5f7f1 <+349>: movntdq %xmm0,0x30(%rdi,%rcx,1) 0x00007fff92a5f7f7 <+355>: add $0x40,%rcx => 0x00007fff92a5f7fb <+359>: jne 0x7fff92a5f7e0 <memset_pattern4+332> 0x00007fff92a5f7fd <+361>: sfence调用，从而清除了我的另一个令人困惑的结果。
非临时存储（即：stream）仅对大量写入有益
memset非常优化，它会根据要写入的数组的长度自动在常规存储和非临时存储（流）之间切换。我不确定在OSX以外的平台上有多少是真的
在编写基准测试时，请确保函数执行您认为的功能，并且编译器并不能超越您。第一个案例是我的问题，我没有提供正确的论点。

编辑：我最近偶然发现了intel optimization guide，如果对这些事情感兴趣的话，请先读一下这部分内容（也许是从3.7.6开始）。

Answer 1

我认为你有几个错误：

void func0(float * const restrict arr, size_t length) {
    memset(arr, 0x05, length);
}

同样在这里：

void funcB(float * const restrict arr, size_t length) {
    const float val = 5.0f;
    memset_pattern4(arr, &val,length);
}

这些应该是：

void func0(float * const restrict arr, size_t length) {
    memset(arr, 0x05, length * sizeof(float));
}

和

void funcB(float * const restrict arr, size_t length) {
    const float val = 5.0f;
    memset_pattern4(arr, &val, length * sizeof(float));
}

对于这两种情况，这将给出比它应该更加乐观的时间。

在我3岁的Core i7 MacBook Pro（8 GB RAM）上，固定代码给了我：

bench L3-HUGE, array 402653184 bytes (100663296 floats, 0 remainder, 0x108ed8040 pointer)
warm up round...done
     99% (  69.43037 ms) : MEMSET CHEAT 
    106% (  73.98113 ms) : MEMSET PATTER
    100% (  72.40429 ms) : NAIVE  NORMAL
    120% (  83.98352 ms) : NAIVE  UNROLL
    102% (  71.75789 ms) : STREAM NORMAL
    102% (  71.59420 ms) : STREAM UNROLL
    115% (  80.63817 ms) : STORE  NORMAL
    123% (  86.58758 ms) : STORE  UNROLL
    123% (  86.22740 ms) : STORE  PREFET
bench L3-MASS, array 1610612736 bytes (402653184 floats, 0 remainder, 0x108ed8040 pointer)
warm up round...done
     83% ( 274.71955 ms) : MEMSET CHEAT 
     83% ( 275.19793 ms) : MEMSET PATTER
    100% ( 272.21942 ms) : NAIVE  NORMAL
     94% ( 309.73151 ms) : NAIVE  UNROLL
     82% ( 271.38751 ms) : STREAM NORMAL
     82% ( 270.27244 ms) : STREAM UNROLL
     94% ( 308.49498 ms) : STORE  NORMAL
     94% ( 308.72266 ms) : STORE  UNROLL
     95% ( 311.64157 ms) : STORE  PREFET

击败或满足OS X memset（和memset_pattern4）

1 个答案: