此代码以四种方式转换矩阵。第一个是顺序写入,非顺序读取。第二个是相反的。接下来的两个是相同的,但缓存跳过写入。似乎发生的是顺序写入更快,并且跳过缓存更快。我不明白的是,如果跳过缓存,为什么顺序写入仍然更快?
QueryPerformanceCounter(&before);
for (i = 0; i < N; ++i)
for (j = 0; j < N; ++j)
tmp[i][j] = mul2[j][i];
QueryPerformanceCounter(&after);
printf("Transpose 1:\t%ld\n", after.QuadPart - before.QuadPart);
QueryPerformanceCounter(&before);
for (j = 0; j < N; ++j)
for (i = 0; i < N; ++i)
tmp[i][j] = mul2[j][i];
QueryPerformanceCounter(&after);
printf("Transpose 2:\t%ld\n", after.QuadPart - before.QuadPart);
QueryPerformanceCounter(&before);
for (i = 0; i < N; ++i)
for (j = 0; j < N; ++j)
_mm_stream_si32(&tmp[i][j], mul2[j][i]);
QueryPerformanceCounter(&after);
printf("Transpose 3:\t%ld\n", after.QuadPart - before.QuadPart);
QueryPerformanceCounter(&before);
for (j = 0; j < N; ++j)
for (i = 0; i < N; ++i)
_mm_stream_si32(&tmp[i][j], mul2[j][i]);
QueryPerformanceCounter(&after);
printf("Transpose 4:\t%ld\n", after.QuadPart - before.QuadPart);
编辑:输出
Transpose 1: 47603
Transpose 2: 92449
Transpose 3: 38340
Transpose 4: 69597
答案 0 :(得分:4)
CPU具有写入组合缓冲区,用于将高速缓存行上的写入组合在一个突发中。在这种情况下(为顺序写入跳过缓存),此写入组合缓冲区用作作为单行缓存,这使得结果非常类似于不跳过缓存。
确切地说,如果跳过缓存,写入仍然会以突发的形式发生在内存中。
请在此处查看write-combining logic行为。
答案 1 :(得分:0)
您可以尝试矩阵的非线性内存布局,以提高缓存利用率。使用4x4 32位浮动磁贴,可以进行转置,只需对每个缓存线进行单一访问。另外,使用_MM_TRANSPOSE4_PS可以轻松完成奖励磁贴转置。
转置一个非常大的矩阵仍然是非常耗费内存的操作。它仍将受到很大的带宽限制,但至少缓存字负载接近最佳。我不知道性能是否仍然可以优化。我的测试显示,几年前的笔记本电脑设法在大约300毫秒内转换16k * 16k(1G内存)。
我也尝试使用_mm_stream_sd,但实际上由于某些原因导致性能下降。我不理解非时间内存写入足以实际猜测为什么性能会因_mm_stream_ps而下降。可能的原因当然是高速缓存行已经在L1高速缓存中准备好进行写操作。
但实际上非线性矩阵的重要部分可能完全避免转置,并且简单地以瓦片友好顺序运行乘法。但我只转换了我用来提高算法中缓存管理知识的代码。
我还没有尝试测试预取是否会提高内存带宽使用率。当前代码每个周期运行大约0.5条指令(良好的缓存友好代码在此CPU上每个周期运行大约2个ins),这为预取指令留下了大量的空闲周期,从而允许甚至非常复杂的计算来优化运行时的预取时序。
来自我的转置基准测试的示例代码如下。
#define MATSIZE 16384
#define align(val, a) (val + (a - val % a))
#define tilewidth 4
typedef int matrix[align(MATSIZE,tilewidth)*MATSIZE] __attribute__((aligned(64)));
float &index(matrix m, unsigned i, unsigned j)
{
/* tiled address calculation */
/* single cache line is used for 4x4 sub matrices (64 bytes = 4*4*sizeof(int) */
/* tiles are arranged linearly from top to bottom */
/*
* eg: 16x16 matrix tile positions:
* t1 t5 t9 t13
* t2 t6 t10 t14
* t3 t7 t11 t15
* t4 t8 t12 t16
*/
const unsigned tilestride = tilewidth * MATSIZE;
const unsigned comp0 = i % tilewidth; /* i inside tile is least significant part */
const unsigned comp1 = j * tilewidth; /* next part is j multiplied by tile width */
const unsigned comp2 = i / tilewidth * tilestride;
const unsigned add = comp0 + comp1 + comp2;
return m[add];
}
/* Get start of tile reference */
float &tile(matrix m, unsigned i, unsigned j)
{
const unsigned tilestride = tilewidth * MATSIZE;
const unsigned comp1 = j * tilewidth; /* next part is j multiplied by tile width */
const unsigned comp2 = i / tilewidth * tilestride;
return m[comp1 + comp2];
}
template<bool diagonal>
static void doswap(matrix mat, unsigned i, unsigned j)
{
/* special path to swap whole tile at once */
union {
float *fs;
__m128 *mm;
} src, dst;
src.fs = &tile(mat, i, j);
dst.fs = &tile(mat, j, i);
if (!diagonal) {
__m128 srcrow0 = src.mm[0];
__m128 srcrow1 = src.mm[1];
__m128 srcrow2 = src.mm[2];
__m128 srcrow3 = src.mm[3];
__m128 dstrow0 = dst.mm[0];
__m128 dstrow1 = dst.mm[1];
__m128 dstrow2 = dst.mm[2];
__m128 dstrow3 = dst.mm[3];
_MM_TRANSPOSE4_PS(srcrow0, srcrow1, srcrow2, srcrow3);
_MM_TRANSPOSE4_PS(dstrow0, dstrow1, dstrow2, dstrow3);
#if STREAMWRITE == 1
_mm_stream_ps(src.fs + 0, dstrow0);
_mm_stream_ps(src.fs + 4, dstrow1);
_mm_stream_ps(src.fs + 8, dstrow2);
_mm_stream_ps(src.fs + 12, dstrow3);
_mm_stream_ps(dst.fs + 0, srcrow0);
_mm_stream_ps(dst.fs + 4, srcrow1);
_mm_stream_ps(dst.fs + 8, srcrow2);
_mm_stream_ps(dst.fs + 12, srcrow3);
#else
src.mm[0] = dstrow0;
src.mm[1] = dstrow1;
src.mm[2] = dstrow2;
src.mm[3] = dstrow3;
dst.mm[0] = srcrow0;
dst.mm[1] = srcrow1;
dst.mm[2] = srcrow2;
dst.mm[3] = srcrow3;
#endif
} else {
__m128 srcrow0 = src.mm[0];
__m128 srcrow1 = src.mm[1];
__m128 srcrow2 = src.mm[2];
__m128 srcrow3 = src.mm[3];
_MM_TRANSPOSE4_PS(srcrow0, srcrow1, srcrow2, srcrow3);
#if STREAMWRITE == 1
_mm_stream_ps(src.fs + 0, srcrow0);
_mm_stream_ps(src.fs + 4, srcrow1);
_mm_stream_ps(src.fs + 8, srcrow2);
_mm_stream_ps(src.fs + 12, srcrow3);
#else
src.mm[0] = srcrow0;
src.mm[1] = srcrow1;
src.mm[2] = srcrow2;
src.mm[3] = srcrow3;
#endif
}
}
}
static void transpose(matrix mat)
{
const unsigned xstep = 256;
const unsigned ystep = 256;
const unsigned istep = 4;
const unsigned jstep = 4;
unsigned x1, y1, i, j;
/* need to increment x check for y limit to allow unrolled inner loop
* access entries close to diagonal axis
*/
for (x1 = 0; x1 < MATSIZE - xstep + 1 && MATSIZE > xstep && xstep; x1 += xstep)
for (y1 = 0; y1 < std::min(MATSIZE - ystep + 1, x1 + 1); y1 += ystep)
for ( i = x1 ; i < x1 + xstep; i += istep ) {
for ( j = y1 ; j < std::min(y1 + ystep, i); j+= jstep )
{
doswap<false>(mat, i, j);
}
if (i == j && j < (y1 + ystep))
doswap<true>(mat, i, j);
}
for ( i = 0 ; i < x1; i += istep ) {
for ( j = y1 ; j < std::min(MATSIZE - jstep + 1, i); j+= jstep )
{
doswap<false>(mat, i, j);
}
if (i == j)
doswap<true>(mat, i, j);
}
for ( i = x1 ; i < MATSIZE - istep + 1; i += istep ) {
for ( j = y1 ; j < std::min(MATSIZE - jstep + 1, i); j+= jstep )
{
doswap<false>(mat, i, j);
}
if (i == j)
doswap<true>(mat, i, j);
}
x1 = MATSIZE - MATSIZE % istep;
y1 = MATSIZE - MATSIZE % jstep;
for ( i = x1 ; i < MATSIZE; i++ )
for ( j = 0 ; j < std::min((unsigned)MATSIZE, i); j++ )
std::swap(index(mat, i, j+0), index(mat, j+0, i));
for ( i = 0; i < x1; i++ )
for ( j = y1 ; j < std::min((unsigned)MATSIZE, i) ; j++ )
std::swap(index(mat, i, j+0), index(mat, j+0, i));
}