Prefetching examples?

Time: 2011-09-07 01:37:45

Tags: gcc optimization assembly prefetch

Can anyone give an example, or a link to one, that uses __builtin_prefetch in GCC (or just the asm instruction prefetcht0 in general) to obtain a substantial performance advantage? In particular, I'd like the example to meet the following criteria:

  1. It is a simple, small, self-contained example.
  2. Removing the __builtin_prefetch instruction results in a performance degradation.
  3. Replacing the __builtin_prefetch instruction with the corresponding memory access results in a performance degradation.

That is, I want the shortest example where __builtin_prefetch performs an optimization that could not be achieved without it.
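For reference, the GCC manual gives the builtin the form __builtin_prefetch (addr, rw, locality), where rw is 0 for an expected read or 1 for an expected write, and locality runs from 0 (no temporal locality) to 3 (keep in all cache levels); both hints must be compile-time constants. A trivial, non-benchmark illustration of the call form, with an arbitrary array and index:

#include <stdio.h>

int main(void) {
    static int data[1024];
    /* Hint that data[512] will be read soon: rw = 0 (read),
       locality = 3 (keep the line in all cache levels). */
    __builtin_prefetch(&data[512], 0, 3);
    printf("%d\n", data[512]);
    return 0;
}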

5 answers:

Answer 0 (score: 61)

Here's an actual piece of code that I pulled out of a larger project. (Sorry, it's the shortest one I could find that shows a noticeable speedup from prefetching.) This code performs a very large data transpose.

This example uses SSE prefetch instructions, which may be the same as the ones that GCC emits.

To run this example, you will need to compile it for x64 and have more than 4GB of memory. You can run it with a smaller data size, but it will be too fast to time.

#include <iostream>
using std::cout;
using std::endl;

#include <emmintrin.h>
#include <malloc.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

#define ENABLE_PREFETCH


#define f_vector    __m128d
#define i_ptr       size_t
inline void swap_block(f_vector *A,f_vector *B,i_ptr L){
    //  To be super-optimized later.

    f_vector *stop = A + L;

    do{
        f_vector tmpA = *A;
        f_vector tmpB = *B;
        *A++ = tmpB;
        *B++ = tmpA;
    }while (A < stop);
}
void transpose_even(f_vector *T,i_ptr block,i_ptr x){
    //  Transposes T.
    //  T contains x columns and x rows.
    //  Each unit is of size (block * sizeof(f_vector)) bytes.

    //Conditions:
    //  - 0 < block
    //  - 1 < x

    i_ptr row_size = block * x;
    i_ptr iter_size = row_size + block;

    //  End of entire matrix.
    f_vector *stop_T = T + row_size * x;
    f_vector *end = stop_T - row_size;

    //  Iterate each row.
    f_vector *y_iter = T;
    do{
        //  Iterate each column.
        f_vector *ptr_x = y_iter + block;
        f_vector *ptr_y = y_iter + row_size;

        do{

#ifdef ENABLE_PREFETCH
            _mm_prefetch((char*)(ptr_y + row_size),_MM_HINT_T0);
#endif

            swap_block(ptr_x,ptr_y,block);

            ptr_x += block;
            ptr_y += row_size;
        }while (ptr_y < stop_T);

        y_iter += iter_size;
    }while (y_iter < end);
}
int main(){

    i_ptr dimension = 4096;
    i_ptr block = 16;

    i_ptr words = block * dimension * dimension;
    i_ptr bytes = words * sizeof(f_vector);

    cout << "bytes = " << bytes << endl;
//    system("pause");

    f_vector *T = (f_vector*)_mm_malloc(bytes,16);
    if (T == NULL){
        cout << "Memory Allocation Failure" << endl;
        system("pause");
        exit(1);
    }
    memset(T,0,bytes);

    //  Perform in-place data transpose
    cout << "Starting Data Transpose...   ";
    clock_t start = clock();
    transpose_even(T,block,dimension);
    clock_t end = clock();

    cout << "Done" << endl;
    cout << "Time: " << (double)(end - start) / CLOCKS_PER_SEC << " seconds" << endl;

    _mm_free(T);
    system("pause");
}

When I run it with ENABLE_PREFETCH enabled, this is the output:

bytes = 4294967296
Starting Data Transpose...   Done
Time: 0.725 seconds
Press any key to continue . . .

When I run it with ENABLE_PREFETCH disabled, this is the output:

bytes = 4294967296
Starting Data Transpose...   Done
Time: 0.822 seconds
Press any key to continue . . .

So that's a 13% speedup from prefetching.
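Since the question asks about __builtin_prefetch specifically: the _mm_prefetch call in the inner loop above maps onto GCC's builtin (locality hint 3 corresponds roughly to prefetcht0 / _MM_HINT_T0, and no char* cast is needed). A sketch of the drop-in equivalent for the #ifdef block, not benchmarked here:

#ifdef ENABLE_PREFETCH
            //  GCC/Clang builtin equivalent of _mm_prefetch(..., _MM_HINT_T0):
            //  rw = 0 (prefetch for read), locality = 3 (keep in all cache levels).
            __builtin_prefetch(ptr_y + row_size, 0, 3);
#endif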

Edit:

Here are some more results:

Operating System: Windows 7 Professional/Ultimate
Compiler: Visual Studio 2010 SP1
Compile Mode: x64 Release

Intel Core i7 860 @ 2.8 GHz, 8 GB DDR3 @ 1333 MHz
Prefetch   : 0.868
No Prefetch: 0.960

Intel Core i7 920 @ 3.5 GHz, 12 GB DDR3 @ 1333 MHz
Prefetch   : 0.725
No Prefetch: 0.822

Intel Core i7 2600K @ 4.6 GHz, 16 GB DDR3 @ 1333 MHz
Prefetch   : 0.718
No Prefetch: 0.796

2 x Intel Xeon X5482 @ 3.2 GHz, 64 GB DDR2 @ 800 MHz
Prefetch   : 2.273
No Prefetch: 2.666

Answer 1 (score: 32)

Binary search is a simple example that could benefit from explicit prefetching. The access pattern in a binary search looks pretty much random to the hardware prefetcher, so there is little chance that it will accurately predict what to fetch.

In this example, I prefetch the two possible 'middle' locations of the next loop iteration in the current iteration. One of the prefetches will probably never be used, but the other will (unless this is the final iteration).

 #include <time.h>
 #include <stdio.h>
 #include <stdlib.h>

 int binarySearch(int *array, int number_of_elements, int key) {
     int low = 0, high = number_of_elements-1, mid;
     while(low <= high) {
         mid = (low + high)/2;
 #ifdef DO_PREFETCH
         // low path
         __builtin_prefetch (&array[(mid + 1 + high)/2], 0, 1);
         // high path
         __builtin_prefetch (&array[(low + mid - 1)/2], 0, 1);
 #endif

         if(array[mid] < key)
             low = mid + 1;
         else if(array[mid] == key)
             return mid;
         else if(array[mid] > key)
             high = mid-1;
     }
     return -1;
 }
 int main() {
     int SIZE = 1024*1024*512;
     int *array =  malloc(SIZE*sizeof(int));
     for (int i=0;i<SIZE;i++){
       array[i] = i;
     }
     int NUM_LOOKUPS = 1024*1024*8;
     srand(time(NULL));
     int *lookups = malloc(NUM_LOOKUPS * sizeof(int));
     for (int i=0;i<NUM_LOOKUPS;i++){
       lookups[i] = rand() % SIZE;
     }
     for (int i=0;i<NUM_LOOKUPS;i++){
       int result = binarySearch(array, SIZE, lookups[i]);
     }
     free(array);
     free(lookups);
 }

When I compile and run this example with DO_PREFETCH enabled, I see a 20% reduction in runtime:

 $ gcc c-binarysearch.c -DDO_PREFETCH -o with-prefetch -std=c11 -O3
 $ gcc c-binarysearch.c -o no-prefetch -std=c11 -O3

 $ perf stat -e L1-dcache-load-misses,L1-dcache-loads ./with-prefetch 

  Performance counter stats for './with-prefetch':

    356,675,702      L1-dcache-load-misses     #   41.39% of all L1-dcache hits  
   861,807,382      L1-dcache-loads                                             

   8.787467487 seconds time elapsed

 $ perf stat -e L1-dcache-load-misses,L1-dcache-loads ./no-prefetch 

 Performance counter stats for './no-prefetch':

   382,423,177      L1-dcache-load-misses     #   97.36% of all L1-dcache hits  
   392,799,791      L1-dcache-loads                                             

  11.376439030 seconds time elapsed

Notice that we are doing twice as many L1 cache loads in the prefetch version. We're actually doing a lot more work but the memory access pattern is more friendly to the pipeline. This also shows the tradeoff. While this block of code runs faster in isolation, we have loaded a lot of junk into the caches and this may put more pressure on other parts of the application.
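I have not benchmarked this variation, but one way to limit that pollution would be to lower the third argument of __builtin_prefetch: locality 0 marks the data as having no temporal locality, so the prefetched lines are less likely to displace other useful data. A sketch of the same #ifdef block with that hint:

 #ifdef DO_PREFETCH
         // Same two speculative prefetches, but with locality hint 0 (no temporal
         // locality) instead of 1, trading some hit benefit for less cache pollution.
         __builtin_prefetch (&array[(mid + 1 + high)/2], 0, 0);
         __builtin_prefetch (&array[(low + mid - 1)/2], 0, 0);
 #endif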

Answer 2 (score: 2)

I learned a lot from the excellent answers provided by @JamesScriven and @Mystical. However, their examples give only a modest boost - the goal of this answer is to present an (admittedly somewhat artificial) example where prefetching has a bigger impact (about a factor of 4 on my machine).

There are three possible bottlenecks for modern architectures: CPU speed, memory bandwidth and memory latency. Prefetching is all about reducing the latency of memory accesses.

In a perfect scenario, where the latency corresponds to X calculation steps, we would have an oracle that tells us which memory we will access in X calculation steps; the prefetching of this data would be launched and it would arrive just in time X calculation steps later.

For a lot of algorithms we are (almost) in this perfect world. For a simple for-loop it is easy to predict which data will be needed X steps later. Out-of-order execution and other hardware tricks do a very good job here, concealing the latency almost completely.

That is the reason why there is such a modest improvement in @Mystical's example: the prefetcher is already pretty good - there just isn't much room for improvement. The task is also memory-bound, so there is probably not much bandwidth left - it could well be the limiting factor. I could see at best around an 8% improvement on my machine.

The crucial insight from the @JamesScriven example: neither we nor the CPU knows the next access address before the current data is fetched from memory - this dependency is quite important, otherwise out-of-order execution would result in a look-ahead and the hardware would be able to prefetch the data. However, because we can only speculate one step ahead, there is not that much potential. I was not able to get more than 40% on my machine.

So let's rig the competition and prepare the data in such a way that we know which address is accessed in X steps, but make it impossible for the hardware to find it out due to a dependency on not-yet-accessed data (see the whole program at the end of the answer):

//making random accesses to memory:
unsigned int next(unsigned int current){
   return (current*10001+328)%SIZE;
}

//the actual work is happening here
void operator()(){

    //set up the oracle - let see it in the future oracle_offset steps
    unsigned int prefetch_index=0;
    for(int i=0;i<oracle_offset;i++)
        prefetch_index=next(prefetch_index);

    unsigned int index=0;
    for(int i=0;i<STEP_CNT;i++){
        //use oracle and prefetch memory block used in a future iteration
        if(prefetch){
            __builtin_prefetch(mem.data()+prefetch_index,0,1);    
        }

        //actual work, the less the better
        result+=mem[index];

        //prepare next iteration
        prefetch_index=next(prefetch_index);  // update oracle
        index=next(mem[index]);               // dependency on `mem[index]` is VERY important to prevent hardware from predicting future accesses
    }
}

A few comments:

  1. The data is prepared in such a way that the oracle is always right.
  2. Maybe surprisingly: the less CPU-bound the task, the bigger the speed-up, because we are able to hide the latency almost completely; thus the speed-up is (CPU-time + original-latency-time) / CPU-time.
  3. Compiling and executing yields:

    >>> g++ -std=c++11 prefetch_demo.cpp -O3 -o prefetch_demo
    >>> ./prefetch_demo
    #preloops   time no prefetch    time prefetch   factor
    ...
    7   1.0711102260000001  0.230566831 4.6455521002498408
    8   1.0511602149999999  0.22651144600000001 4.6406494398521474
    9   1.049024333 0.22841439299999999 4.5926367389641687
    ....
    

    which gives a speed-up between 4 and 5.

    Listing of prefetch_demo.cpp:

    //prefetch_demo.cpp
    
    #include <vector>
    #include <iostream>
    #include <iomanip>
    #include <chrono>
    
    const int SIZE=1024*1024*1;
    const int STEP_CNT=1024*1024*10;
    
    unsigned int next(unsigned int current){
       return (current*10001+328)%SIZE;
    }
    
    
    template<bool prefetch>
    struct Worker{
       std::vector<int> mem;
    
       double result;
       int oracle_offset;
    
       void operator()(){
            unsigned int prefetch_index=0;
            for(int i=0;i<oracle_offset;i++)
                prefetch_index=next(prefetch_index);
    
            unsigned int index=0;
            for(int i=0;i<STEP_CNT;i++){
                //prefetch memory block used in a future iteration
                if(prefetch){
                    __builtin_prefetch(mem.data()+prefetch_index,0,1);    
                }
                //actual work:
                result+=mem[index];
    
                //prepare next iteration
                prefetch_index=next(prefetch_index);
                index=next(mem[index]);
            }
       }
    
       Worker(std::vector<int> &mem_):
           mem(mem_), result(0.0), oracle_offset(0)
       {}
    };
    
    template <typename Worker>
        double timeit(Worker &worker){
        auto begin = std::chrono::high_resolution_clock::now();
        worker();
        auto end = std::chrono::high_resolution_clock::now();
        return std::chrono::duration_cast<std::chrono::nanoseconds>(end-begin).count()/1e9;
    }
    
    
     int main() {
         //set up the data in special way!
         std::vector<int> keys(SIZE);
         for (int i=0;i<SIZE;i++){
           keys[i] = i;
         }
    
         Worker<false> without_prefetch(keys);
         Worker<true> with_prefetch(keys);
    
         std::cout<<"#preloops\ttime no prefetch\ttime prefetch\tfactor\n";
         std::cout<<std::setprecision(17);
    
         for(int i=0;i<20;i++){
             //let oracle see i steps in the future:
             without_prefetch.oracle_offset=i;
             with_prefetch.oracle_offset=i;
    
             //calculate:
             double time_with_prefetch=timeit(with_prefetch);
             double time_no_prefetch=timeit(without_prefetch);
    
             std::cout<<i<<"\t"
                      <<time_no_prefetch<<"\t"
                      <<time_with_prefetch<<"\t"
                      <<(time_no_prefetch/time_with_prefetch)<<"\n";
         }
    
     }
    

Answer 3 (score: 0)

From the documentation:

      for (i = 0; i < n; i++)
        {
          a[i] = a[i] + b[i];
          __builtin_prefetch (&a[i+j], 1, 1);
          __builtin_prefetch (&b[i+j], 0, 1);
          /* ... */
        }
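That snippet is only a fragment (a, b, n and the prefetch distance j are assumed to be defined elsewhere). A minimal self-contained version, with sizes and prefetch distance chosen arbitrarily just for illustration, might look like:

#include <stdio.h>

#define N 100000
#define J 8                       /* prefetch distance, chosen arbitrarily */

/* Padded so &a[i+J] and &b[i+J] stay within the arrays. */
static double a[N + J], b[N + J];

int main(void) {
    for (int i = 0; i < N; i++) {
        a[i] = a[i] + b[i];
        __builtin_prefetch (&a[i+J], 1, 1);   /* will be written on a later iteration */
        __builtin_prefetch (&b[i+J], 0, 1);   /* will only be read */
    }
    printf("%f\n", a[0]);
    return 0;
}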

Answer 4 (score: 0)

Prefetching data can be optimized to the cache line size, which for most modern 64-bit processors is 64 bytes, for example preloading a uint32_t[16] with one instruction.

For example, on ARMv8 I found by experiment that casting the memory pointer to a uint32_t 4x4 matrix vector (64 bytes in size) halved the required instructions, because before that it was only loading half the data and I had to increment by 8, even though my understanding was that it fetches a full cache line.

Original code example, prefetching a uint32_t[32]...

int addrindex = &B[0];
__builtin_prefetch(&V[addrindex]);
__builtin_prefetch(&V[addrindex + 8]);
__builtin_prefetch(&V[addrindex + 16]);
__builtin_prefetch(&V[addrindex + 24]);

After...

int addrindex = &B[0];
__builtin_prefetch((uint32x4x4_t *) &V[addrindex]);
__builtin_prefetch((uint32x4x4_t *) &V[addrindex + 16]);

For some reason, the int data type for the address index/offset gave better performance. Tested with GCC 8 on a Cortex-A53. On other architectures you may get the same performance improvement from using an equivalent 64-byte vector, if you find that it is not prefetching all the data, as in my case. In my application, which has a one-million-iteration loop, doing this improved performance by 5%. There were further requirements for the improvement.

The 128-megabyte "V" memory allocation had to be aligned to 64 bytes.

uint32_t *V __attribute__((__aligned__(64))) = (uint32_t *)(((uintptr_t)(__builtin_assume_aligned((unsigned char*)aligned_alloc(64,size), 64)) + 63) & ~ (uintptr_t)(63));

I also had to use C operators instead of NEON intrinsics, because they require regular data-type pointers (uint32_t * in my case); otherwise the new builtin prefetch method caused a performance regression.

My real-world example can be found in scrypt_core() and its internal function at https://github.com/rollmeister/veriumMiner/blob/main/algo/scrypt.c, which are all easy to read. The hard work is done by GCC 8. The overall performance improvement was 25%.
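To make the cache-line arithmetic above concrete, here is a minimal, architecture-neutral C sketch (not taken from the miner code; the 64-byte line size, buffer size and hint values are assumptions) that issues one prefetch per 64-byte chunk, i.e. per uint32_t[16]:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define WORDS_PER_LINE 16          /* 64-byte cache line / sizeof(uint32_t) */

int main(void) {
    size_t n = 1u << 20;                               /* arbitrary element count */
    /* 64-byte aligned allocation; the size is a multiple of the alignment. */
    uint32_t *v = aligned_alloc(64, n * sizeof(uint32_t));
    if (v == NULL) return 1;
    for (size_t i = 0; i < n; i++) v[i] = (uint32_t)i;

    uint64_t sum = 0;
    for (size_t i = 0; i < n; i += WORDS_PER_LINE) {
        /* One prefetch is enough to cover the whole next 64-byte line. */
        __builtin_prefetch(&v[i + WORDS_PER_LINE], 0, 1);
        for (size_t k = 0; k < WORDS_PER_LINE; k++)
            sum += v[i + k];
    }
    printf("%llu\n", (unsigned long long)sum);
    free(v);
    return 0;
}

With a sequential pattern like this the hardware prefetcher already does most of the work; the point is only to show the one-prefetch-per-line granularity described above.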