I'm using the _mm_stream_ps intrinsic and I'm having some trouble understanding its performance.
Here is the code snippet I'm working with... Stream version:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <omp.h>
#include <immintrin.h>

#define NUM_ELEMENTS 10000000L

static void copy_temporal(float* restrict x, float* restrict y)
{
    for(uint64_t i = 0; i < NUM_ELEMENTS/2; ++i){
        _mm_store_ps(y,_mm_load_ps(x));
        _mm_store_ps(y+4,_mm_load_ps(x+4));
        x+=8;
        y+=8;
    }
}

static void copy_nontemporal(float* restrict x, float* restrict y)
{
    for(uint64_t i = 0; i < NUM_ELEMENTS/2; ++i){
        _mm_stream_ps(y,_mm_load_ps(x));
        _mm_stream_ps(y+4,_mm_load_ps(x+4));
        x+=8;
        y+=8;
    }
}

int main(int argc, char** argv)
{
    uint64_t sizeX = sizeof(float) * 4 * NUM_ELEMENTS;

    float *x = (float*) _mm_malloc(sizeX,32);
    float *y = (float*) _mm_malloc(sizeX,32);

    //initialization
    for(uint64_t i = 0 ; i < 4 * NUM_ELEMENTS; ++i){
        x[i] = (float)rand()/RAND_MAX;
        y[i] = 0;
    }

    printf("%g MB allocated\n",(2 * sizeX)/1024.0/1024.0);

    double start = omp_get_wtime();
    copy_nontemporal(x, y);
    double time = omp_get_wtime() - start;
    printf("Bandwidth (non-temporal): %g GB/s\n",((3 * sizeX)/1024.0/1024.0/1024.0)/time);

    start = omp_get_wtime();
    copy_temporal(x, y);
    time = omp_get_wtime() - start;
    printf("Bandwidth: %g GB/s\n",((3 * sizeX)/1024.0/1024.0/1024.0)/time);

    _mm_free(x);
    _mm_free(y);

    return 0;
}
Performance results:
2.3 GHz Core i7 (I7-3615QM) (Laptop):
305.176 MB allocated
Bandwidth (non-temporal): 24.2242 GB/s
Bandwidth: 21.4136 GB/s
Xeon(R) CPU E5-2650 0 @ 2.00GHz (cluster (exclusive job)):
305.176 MB allocated
Bandwidth (non-temporal): 8.33133 GB/s
Bandwidth: 8.20684 GB/s
What puzzles me is that, on the Xeon CPU (but not on my laptop), I see better performance if I use unaligned loads and stores instead (i.e. storeu_ps / loadu_ps; see the sketch after the numbers below):
305.176 MB allocated
Bandwidth (non-temporal): 8.30105 GB/s
Bandwidth: 12.7056 GB/s
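For reference, a minimal sketch of what the unaligned variant might look like, assuming the only change is swapping the aligned intrinsics for their unaligned counterparts (this is not necessarily the exact code that produced the numbers above):

static void copy_temporal_unaligned(float* restrict x, float* restrict y)
{
    // same loop as copy_temporal, but with unaligned loads/stores
    for(uint64_t i = 0; i < NUM_ELEMENTS/2; ++i){
        _mm_storeu_ps(y,  _mm_loadu_ps(x));
        _mm_storeu_ps(y+4,_mm_loadu_ps(x+4));
        x+=8;
        y+=8;
    }
}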
Because of the redundant load of y, I would expect the streaming version to be faster than the non-streaming version. However, the measurements show that the streaming version is actually 2 times slower than the non-streaming one.
Do you have any explanation for this?
Compiler used: Intel 14.0.1; compiler flags: -O3 -restrict -xAVX; CPU used: Intel Xeon E5-2650.
Thank you.
Answer 0 (score: 3)
The stream variation creates pipelined burst writes directly to DRAM. The speed should roughly match the speed of your DRAM. A standard store writes to cache (but if the data is not already in cache, it first reads it into cache). If the data is already in cache, the standard store runs at the speed of cache writes. Generally, for writes much larger than the last-level cache size, the stream method is much faster; for small writes, standard stores are usually faster. Try running the test with a buffer a couple of GB in size; the stream method should then be faster.
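One related detail, independent of the buffer-size argument above: non-temporal stores go through write-combining buffers and are weakly ordered, so the usual guidance is to issue an _mm_sfence() after the streaming loop before the data is handed to other threads or used in ordering-sensitive code. A minimal sketch (the function name is made up for illustration):

static void copy_nontemporal_fenced(float* restrict x, float* restrict y)
{
    for(uint64_t i = 0; i < NUM_ELEMENTS/2; ++i){
        _mm_stream_ps(y,  _mm_load_ps(x));
        _mm_stream_ps(y+4,_mm_load_ps(x+4));
        x+=8;
        y+=8;
    }
    _mm_sfence();   // make the non-temporal stores globally visible before y is consumed
}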
Here is a benchmark to demonstrate:
#define __USE_MINGW_ANSI_STDIO 1
#include <stdlib.h>
#include <intrin.h>
#include <windows.h>
#include <stdio.h>
#include <stdint.h>

//-----------------------------------------------------------------------------
//
// queryPerformanceCounter - similar to QueryPerformanceCounter, but returns
//                           the count directly.

uint64_t queryPerformanceCounter (void)
{
    LARGE_INTEGER int64;
    QueryPerformanceCounter (&int64);
    return int64.QuadPart;
}

//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns the count directly.

uint64_t queryPerformanceFrequency (void)
{
    LARGE_INTEGER int64;
    QueryPerformanceFrequency (&int64);
    return int64.QuadPart;
}

//---------------------------------------------------------------------------

static void testNontemporal (float *x, float *y, uint64_t numberOfVectors)
{
    uint64_t i;

    for(i = 0; i < numberOfVectors / 2; ++i)
    {
        _mm_stream_ps(y,_mm_load_ps(x));
        _mm_stream_ps(y+4,_mm_load_ps(x+4));
        y+=8; x+=8;
    }
}

//---------------------------------------------------------------------------

static void testTemporal (float *x, float *y, uint64_t numberOfVectors)
{
    uint64_t i;

    for(i = 0; i < numberOfVectors / 2; ++i)
    {
        _mm_store_ps(y,_mm_load_ps(x));
        _mm_store_ps(y+4,_mm_load_ps(x+4));
        y+=8; x+=8;
    }
}

//---------------------------------------------------------------------------

static void runtests (int nonTemporal)
{
    uint64_t startCount, elapsed, index;
    float *x, *y;
    uint64_t numberOfBytes = 400 * 0x100000ull;
    uint64_t numberOfFloats = numberOfBytes / sizeof *x;
    uint64_t numberOfVectors = numberOfFloats / 4;
    double gbPerSecond;

    x = _mm_malloc (numberOfBytes, 32);
    y = _mm_malloc (numberOfBytes, 32);
    if (x == NULL || y == NULL) exit (1);

    // put valid floating point data into the source buffer
    // to avoid performance penalty
    for (index = 0; index < numberOfFloats; index++)
        x [index] = (float) index, y [index] = 0;

    startCount = queryPerformanceCounter ();
    if (nonTemporal)
        testNontemporal (x, y, numberOfVectors);
    else
        testTemporal (x, y, numberOfVectors);
    elapsed = queryPerformanceCounter () - startCount;
    gbPerSecond = (double) numberOfBytes / 0x40000000 * queryPerformanceFrequency () / elapsed;
    printf ("%.2f GB/s\n", gbPerSecond);

    _mm_free (x);
    _mm_free (y);
}

//---------------------------------------------------------------------------

int main (void)
{
    // raise our priority to increase measurement accuracy
    SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);

    printf ("using temporal stores\n");
    runtests (0);

    printf ("using non-temporal stores\n");
    runtests (1);
    return 0;
}

//---------------------------------------------------------------------------
Output from an Intel Core i7-2600K:
using temporal stores
5.57 GB/s
using non-temporal stores
8.35 GB/s
Answer 1 (score: 1)
AFAIK, non-temporal stores evict the target cache line from all caches. If the line is touched again before it would have been evicted naturally anyway, you lose pretty badly.
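A hypothetical pattern illustrating that point (the function name is made up; it reuses the headers from the code above): if the destination is read back shortly after being written with streaming stores, the reads have to go all the way to DRAM again, because the non-temporal stores bypassed the cache:

static float stream_then_reuse(const float *x, float *y, uint64_t numberOfVectors)
{
    for(uint64_t i = 0; i < numberOfVectors; ++i)
        _mm_stream_ps(y + 4*i, _mm_load_ps(x + 4*i));   // bypasses the cache hierarchy
    _mm_sfence();                                       // order the NT stores before the reads below

    float sum = 0.0f;
    for(uint64_t i = 0; i < 4*numberOfVectors; ++i)
        sum += y[i];    // y is not in cache: every line is fetched back from DRAM
    return sum;
}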
Answer 2 (score: 0)
As ScottD pointed out, the answer to the question lies in the generated assembly code. Apparently, the Intel compiler is smart enough to detect the access pattern and automatically generates non-temporal stores even for the temporal version.
Here is the compiler-generated assembly code for the temporal version:
..___tag_value___Z13copy_temporalPfS_.35: #
xor edx, edx #22.4
xor eax, eax #
..B2.2: # Preds ..B2.2 ..B2.1
vmovups xmm0, XMMWORD PTR [rax+rdi] #23.34
inc rdx #22.4
vmovntps XMMWORD PTR [rax+rsi], xmm0 #23.20
vmovups xmm1, XMMWORD PTR [16+rax+rdi] #24.36
vmovntps XMMWORD PTR [16+rax+rsi], xmm1 #24.20
vmovups xmm2, XMMWORD PTR [32+rax+rdi] #23.34
vmovntps XMMWORD PTR [32+rax+rsi], xmm2 #23.20
vmovups xmm3, XMMWORD PTR [48+rax+rdi] #24.36
vmovntps XMMWORD PTR [48+rax+rsi], xmm3 #24.20
add rax, 64 #22.4
cmp rdx, 5000000 #22.4
jb ..B2.2 # Prob 99% #22.4
The question that remains is the following:
Why does the unaligned temporal version perform better than the non-temporal version on the E5-2650 CPU (see above)? I have looked at the generated assembly code, and the compiler does indeed generate vmovups instructions (because the alignment cannot be proven).
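If one wanted to check whether provable alignment changes the picture, a possible experiment (a sketch only, not verified against this exact compiler version and not the code measured above) is to assert the 32-byte alignment explicitly, e.g. with the Intel compiler's __assume_aligned or GCC/Clang's __builtin_assume_aligned, and let the compiler vectorize a plain copy loop:

static void copy_temporal_hinted(float* restrict x, float* restrict y)
{
#ifdef __INTEL_COMPILER
    __assume_aligned(x, 32);
    __assume_aligned(y, 32);
#else
    x = __builtin_assume_aligned(x, 32);
    y = __builtin_assume_aligned(y, 32);
#endif
    for(uint64_t i = 0; i < 4 * NUM_ELEMENTS; ++i)
        y[i] = x[i];    // with the alignment known, the compiler can emit aligned moves
}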