Using gcc 4.4.5 (yes... I know it's old) on x86_64. I'm limited to SSE2 (or earlier) instructions for compatibility reasons.
I have what I think should be a textbook case for big gains from prefetching. I have an array ("A") of 32-bit elements, which are not (and cannot be) in sequential order. These 32-bit elements are indices into a larger data array ("D") of __m128i data. For each element of "A", I need to fetch the __m128i data at the appropriate location in "D", perform an operation on it, and store it back into the same location in "D". Actually each "entry" in D is SOME_CONST __m128i's big, so if the value in A is "1", the index into D is D[1 * SOME_CONST].
由于" A"中的连续元素我几乎从不指向" D"中的连续位置,我倾向于认为硬件预取器会挣扎或无法完成任何有用的事情。
However, I can predict which locations I will be accessing next very easily, simply by looking ahead in "A". Enough with the prose... here is some code. The operation I am performing on the data is to take the low 64 bits of the __m128i and clone them into the high 64 bits of the same. First the basic loop, no frills...
// SOME_CONST is either 3 or 4, but this "operation" only needs to happen for 3
for ( i=0; i<arraySize; ++i )
{
    register __m128i *dPtr = D + (A[i] * SOME_CONST);
    dPtr[0] = _mm_shuffle_epi32( dPtr[0], 0 | (1<<2) | (0<<4) | (1<<6) );
    dPtr[1] = _mm_shuffle_epi32( dPtr[1], 0 | (1<<2) | (0<<4) | (1<<6) );
    dPtr[2] = _mm_shuffle_epi32( dPtr[2], 0 | (1<<2) | (0<<4) | (1<<6) );
    // The immediate operand selects:
    // Bits 0-31   = bits 0-31
    // Bits 32-63  = bits 32-63
    // Bits 64-95  = bits 0-31
    // Bits 96-127 = bits 32-63
    // If anyone is more clever than me and knows of a better way to do this in SSE2,
    // bonus points. ;-)
}
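(The only SSE2 alternative I can think of for the same low-to-high duplication is punpcklqdq via _mm_unpacklo_epi64; I have not measured whether it is any faster than pshufd here, so treat this as a sketch rather than a claimed win:)

#include <emmintrin.h>

// _mm_unpacklo_epi64(x, x) interleaves the low 64-bit halves of its two operands;
// with identical operands that duplicates the low 64 bits into the high 64 bits --
// the same result as the pshufd immediate used above.
static inline __m128i duplicate_low64( __m128i x )
{
    return _mm_unpacklo_epi64( x, x );
}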
I have tried a number of different ways of sprinkling prefetch intrinsics in there, and none of them has produced any kind of speedup. For example, I tried unrolling the loop to a stride of 2 or even 4 elements, but it did not help...
// Assume the "A" array size is appropriately padded so that overruns don't
// result in SIGSEGV accessing uninitialized memory in D.
register __m128i *dPtr0, *dPtr1, *dPtr2, *dPtr3, *dPtr4, *dPtr5, *dPtr6, *dPtr7;
dPtr4 = D + (A[0] * SOME_CONST);
dPtr5 = D + (A[1] * SOME_CONST);
dPtr6 = D + (A[2] * SOME_CONST);
dPtr7 = D + (A[3] * SOME_CONST);
for ( i=0; i<arraySize; i+=4 )
{
dPtr0 = dPtr4;
dPtr1 = dPtr5;
dPtr2 = dPtr6;
dPtr3 = dPtr7;
dPtr4 = D + (A[i+4] * SOME_CONST);
_mm_prefetch( dPtr4, _MM_HINT_NTA );
_mm_prefetch( dPtr4+1, _MM_HINT_NTA ); // Is it needed? Tried both ways
dPtr5 = D + (A[i+5] * SOME_CONST);
_mm_prefetch( dPtr5, _MM_HINT_NTA );
_mm_prefetch( dPtr5+1, _MM_HINT_NTA ); // Is it needed? Tried both ways
dPtr6 = D + (A[i+6] * SOME_CONST);
_mm_prefetch( dPtr6, _MM_HINT_NTA );
_mm_prefetch( dPtr6+1, _MM_HINT_NTA ); // Is it needed? Tried both ways
dPtr7 = D + (A[i+7] * SOME_CONST);
_mm_prefetch( dPtr7, _MM_HINT_NTA );
_mm_prefetch( dPtr7+1, _MM_HINT_NTA ); // Is it needed? Tried both ways
dPtr0[0] = _mm_shuffle_epi32( dPtr0[0], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr0[1] = _mm_shuffle_epi32( dPtr0[1], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr0[2] = _mm_shuffle_epi32( dPtr0[2], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr1[0] = _mm_shuffle_epi32( dPtr1[0], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr1[1] = _mm_shuffle_epi32( dPtr1[1], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr1[2] = _mm_shuffle_epi32( dPtr1[2], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr2[0] = _mm_shuffle_epi32( dPtr2[0], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr2[1] = _mm_shuffle_epi32( dPtr2[1], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr2[2] = _mm_shuffle_epi32( dPtr2[2], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr3[0] = _mm_shuffle_epi32( dPtr3[0], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr3[1] = _mm_shuffle_epi32( dPtr3[1], 0 | (1<<2) | (0<<4) | (1<<6) );
dPtr3[2] = _mm_shuffle_epi32( dPtr3[2], 0 | (1<<2) | (0<<4) | (1<<6) );
}
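(One note on those "+1" prefetches, which is my own reasoning rather than something I have verified with performance counters: a prefetch pulls in one whole 64-byte cache line, and an entry of SOME_CONST __m128i is 48 or 64 bytes, so it fits in a single line when it starts on a 64-byte boundary and straddles two lines otherwise. Prefetching the first and the last byte of the entry covers both cases:)

/* Cover the whole SOME_CONST * 16-byte entry no matter how it straddles
   64-byte cache-line boundaries: touch its first and its last byte. */
_mm_prefetch( (const char *) dPtr4, _MM_HINT_NTA );
_mm_prefetch( (const char *) dPtr4 + (SOME_CONST * sizeof( __m128i )) - 1, _MM_HINT_NTA );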
That is the 4-element version, but I also tried it with only 2 in case that was too much data to shuffle around. I also tried _MM_HINT_NTA as well as _MM_HINT_T0. There was no appreciable difference either way.
I also tried a simpler variant, which just tries to put as much reasonable distance as possible between a prefetch and the use of the data it fetches:
#define PREFETCH_DISTANCE 10
// trying 5 overnight, will see results tomorrow...
for ( i=0; i<arraySize; ++i )
{
    register __m128i *dPtrFuture, *dPtr;
    dPtrFuture = D + (A[i + PREFETCH_DISTANCE] * SOME_CONST);
    _mm_prefetch( dPtrFuture, _MM_HINT_NTA );     // tried _MM_HINT_T0 too
    _mm_prefetch( dPtrFuture + 1, _MM_HINT_NTA ); // tried _MM_HINT_T0 too
    dPtr = D + (A[i] * SOME_CONST);
    dPtr[0] = _mm_shuffle_epi32( dPtr[0], 0 | (1<<2) | (0<<4) | (1<<6) );
    dPtr[1] = _mm_shuffle_epi32( dPtr[1], 0 | (1<<2) | (0<<4) | (1<<6) );
    dPtr[2] = _mm_shuffle_epi32( dPtr[2], 0 | (1<<2) | (0<<4) | (1<<6) );
}
Initially I expected this code to stall, but once it got PREFETCH_DISTANCE deep into the loop I was hoping it would see a good enough speed boost. Most of these variants cause no more than a 0.2-second difference in runtime over millions of iterations that add up to 4m:30s of total CPU time on this particular machine (which is idle apart from me). The differences seem indistinguishable from the noise in the data.
Am I right in my assumption that prefetching should help me here? If so, what am I doing wrong?
Thanks for any helpful and interesting thoughts.
EDIT:
I created a contrived example that truly randomizes the data in A. I played with buffer sizes from 64MB up to 6400MB. I found that I get an enormous gain from unrolling the loop and pre-computing the addresses of the next 4 elements while I am performing my operation on the current 4. But my runtime gets more than 10x worse if I try to prefetch any of the addresses I pre-computed. I am really scratching my head over this one. My standalone contrived code is:
#include <emmintrin.h> /* SSE2: __m128i, _mm_shuffle_epi32 */
#include <xmmintrin.h> /* SSE: _mm_prefetch, _MM_HINT_* */
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#define QUEUE_ELEMENTS 1048576
#define DATA_ELEMENT_SIZE 4 * sizeof( __m128i )
#define DATA_ELEMENTS QUEUE_ELEMENTS
#define QUEUE_ITERATIONS 100000
#define LOOP_UNROLL_4
#define LOOP_UNROLL_2
#ifdef LOOP_UNROLL_4
#define UNROLL_CONST 4
#else
#ifdef LOOP_UNROLL_2
#define UNROLL_CONST 2
#else
#define UNROLL_CONST 1
#endif
#endif
int main( void )
{
    unsigned long long randTemp;
    unsigned long i, outerLoop;
    unsigned long *workQueue;
    __m128i *data, *dataOrig;
    clock_t timeStamp;

    workQueue = malloc( QUEUE_ELEMENTS * sizeof( unsigned long ) );
    dataOrig = malloc( (DATA_ELEMENTS * DATA_ELEMENT_SIZE) + 2 );
    if ( (unsigned long long) dataOrig & 0xf )
    {
        data = (__m128i *) (((unsigned long long) dataOrig & ~0xf) + 0x10);
        // force 16-byte (128-bit) alignment
    } else data = dataOrig;

    // Not initializing data, because its contents isn't important.

    for ( i=0; i<QUEUE_ELEMENTS; ++i )
    {
        randTemp = (unsigned long long)rand() *
                   (unsigned long long) QUEUE_ELEMENTS / (unsigned long long) RAND_MAX;
        workQueue[i] = (unsigned long) randTemp;
    }

    printf( "Starting work...\n" );
    // Actual work happening below... start counting.
    timeStamp = clock();
    for ( outerLoop = 0; outerLoop < QUEUE_ITERATIONS; ++outerLoop )
    {
        register __m128i *dataPtr0, *dataPtr1, *dataPtr2, *dataPtr3;
        register __m128i *dataPtr4, *dataPtr5, *dataPtr6, *dataPtr7;

#ifdef LOOP_UNROLL_2
        dataPtr4 = data + (workQueue[0] * DATA_ELEMENT_SIZE);
        dataPtr5 = data + (workQueue[1] * DATA_ELEMENT_SIZE);
#endif
#ifdef LOOP_UNROLL_4
        dataPtr6 = data + (workQueue[2] * DATA_ELEMENT_SIZE);
        dataPtr7 = data + (workQueue[3] * DATA_ELEMENT_SIZE);
#endif

        for ( i=0; i<QUEUE_ELEMENTS; i+=UNROLL_CONST )
        {
#ifdef LOOP_UNROLL_2
            dataPtr0 = dataPtr4;
            dataPtr4 = data + (workQueue[i+4] * DATA_ELEMENT_SIZE);
            // _mm_prefetch( dataPtr4, _MM_HINT_T0 );
            dataPtr1 = dataPtr5;
            dataPtr5 = data + (workQueue[i+5] * DATA_ELEMENT_SIZE);
            // _mm_prefetch( dataPtr5, _MM_HINT_T0 );
#endif
#ifdef LOOP_UNROLL_4
            dataPtr2 = dataPtr6;
            dataPtr6 = data + (workQueue[i+6] * DATA_ELEMENT_SIZE);
            // _mm_prefetch( dataPtr6, _MM_HINT_T0 );
            dataPtr3 = dataPtr7;
            dataPtr7 = data + (workQueue[i+7] * DATA_ELEMENT_SIZE);
            // _mm_prefetch( dataPtr7, _MM_HINT_T0 );
#endif
#if !defined( LOOP_UNROLL_2 ) && !defined( LOOP_UNROLL_4 )
            dataPtr0 = data + (workQueue[i] * DATA_ELEMENT_SIZE);
#endif
            _mm_shuffle_epi32( dataPtr0[0], 0 | (1<<2) | (0<<4) | (1<<6) );
            _mm_shuffle_epi32( dataPtr0[1], 0 | (1<<2) | (0<<4) | (1<<6) );
            _mm_shuffle_epi32( dataPtr0[2], 0 | (1<<2) | (0<<4) | (1<<6) );
            // Per original code, no need to perform operation on dataPtrx[3]
#ifdef LOOP_UNROLL_2
            _mm_shuffle_epi32( dataPtr1[0], 0 | (1<<2) | (0<<4) | (1<<6) );
            _mm_shuffle_epi32( dataPtr1[1], 0 | (1<<2) | (0<<4) | (1<<6) );
            _mm_shuffle_epi32( dataPtr1[2], 0 | (1<<2) | (0<<4) | (1<<6) );
#endif
#ifdef LOOP_UNROLL_4
            _mm_shuffle_epi32( dataPtr2[0], 0 | (1<<2) | (0<<4) | (1<<6) );
            _mm_shuffle_epi32( dataPtr2[1], 0 | (1<<2) | (0<<4) | (1<<6) );
            _mm_shuffle_epi32( dataPtr2[2], 0 | (1<<2) | (0<<4) | (1<<6) );
            _mm_shuffle_epi32( dataPtr3[0], 0 | (1<<2) | (0<<4) | (1<<6) );
            _mm_shuffle_epi32( dataPtr3[1], 0 | (1<<2) | (0<<4) | (1<<6) );
            _mm_shuffle_epi32( dataPtr3[2], 0 | (1<<2) | (0<<4) | (1<<6) );
#endif
        }
        if ( (outerLoop % 1000) == 0 ) { putchar( '.' ); fflush( stdout ); }
    }
    timeStamp = clock() - timeStamp;
    printf( "\nRun was %lu seconds.\n", timeStamp / CLOCKS_PER_SEC );

    free( dataOrig );
    free( workQueue );
    return 0;
}
I even wrote an 8x unrolled loop, and it still scaled perfectly to a runtime of about 10 seconds. I am amazed it did not saturate there, since by that point I must surely be running out of registers, holding 16 unique pointers. So what can I learn from this? That my inner-loop code is so tight that it is heavily overshadowed by the overhead of the loop construct itself? Are there any bugs in this code that I am not seeing? All versions were built with gcc -O2.
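(One thing I still mean to double-check, and it is an assumption on my part rather than something I have confirmed in the generated assembly: the contrived benchmark throws the _mm_shuffle_epi32 results away, so at -O2 the compiler is in principle free to delete the shuffles and the loads that feed them, and with them much of the memory traffic I think I am measuring. Storing the result back, as the real code does, would keep the benchmark honest:)

dataPtr0[0] = _mm_shuffle_epi32( dataPtr0[0], 0 | (1<<2) | (0<<4) | (1<<6) ); // store back so the access cannot be optimized away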
Answer 0 (score: 1)
If your data resides in memory, don't expect much of a speedup; prefetching from memory has very little room to improve anything.
With a 150 ns cycle time, 64-byte cache lines and a 4 GB/s streaming transfer rate (my AMD system; Intel is faster), and using 48 bytes (3 x 128 bits) of each 64-byte cache line read, the system is fetching 320 MB of usable data per second. Prefetching brings that rate closer to the 4000 MB/s peak, so the total saving available to prefetching is 0.92 seconds for every 320 MB read. At 320 MB/s, 270 seconds (4m 30s) corresponds to roughly 86 GB worth of memory transfer time; the program is probably not spending more than a small fraction of that (<1%, i.e. well under 1 GB) actually reading memory. Completely eliminating memory I/O would therefore save only about 1% of the runtime.
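(A compressed version of the arithmetic above, taking the 150 ns per line, 48 usable bytes per 64-byte line, 4 GB/s streaming rate and 270 s runtime as given; a back-of-the-envelope sketch, not a measurement:)

#include <stdio.h>

int main( void )
{
    const double line_time_ns   = 150.0;   /* one demand-fetched cache line       */
    const double useful_bytes   = 48.0;    /* 3 x 128 bits used per 64-byte line  */
    const double streaming_MBps = 4000.0;  /* peak streaming transfer rate        */
    const double runtime_s      = 270.0;   /* 4m 30s total runtime                */

    double demand_MBps     = useful_bytes / line_time_ns * 1000.0;          /* = 320 MB/s */
    double saved_per_320MB = 320.0 / demand_MBps - 320.0 / streaming_MBps;  /* = 0.92 s   */
    double ceiling_GB      = demand_MBps * runtime_s / 1000.0;              /* ~ 86 GB    */

    printf( "usable demand-fetch rate: %.0f MB/s\n", demand_MBps );
    printf( "prefetch saving per 320 MB read: %.2f s\n", saved_per_320MB );
    printf( "traffic if the whole run were memory reads: %.0f GB\n", ceiling_GB );
    return 0;
}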
The downside of prefetching too much is that the prefetched data displaces useful things from the really fast but really small memory close to the CPU (the level 1 and level 2 caches, kilobytes rather than megabytes). That may account for the pessimal performance of some of the test runs.
The fact that unrolling the loop sped the program up while prefetching did not also suggests that the processing itself, rather than the data movement, is the main bottleneck.