I'm trying to implement a memory-copy function using SSE instructions:
#include <emmintrin.h> // SSE2 intrinsics: _mm_load_si128 / _mm_store_si128; pulls in _mm_prefetch

typedef unsigned char byte;
typedef unsigned int uint;

__forceinline static void SIMD_Copy(void* __restrict destination, void* __restrict source, const uint count)
{
    //assert(count > 16)
#ifdef _M_IX86
    const uint register_count = 8;
    const uint step = register_count * 16;
    const uint loop = count / (step);
#else
    const uint register_count = 16;
    const uint step = register_count * 16;
    const uint loop = count / (step);
#endif
    //assert(loop);
    byte* from = static_cast<byte*>(source);
    byte* to = static_cast<byte*>(destination);
    uint debug_test = 0;
    register uint counter = 0;
    do
    {
        debug_test += step;
        from += step;
        to += step;
        _mm_prefetch((const char*)(from), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 16), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 32), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 48), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 64), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 80), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 96), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 112), _MM_HINT_T0);
#ifdef _M_AMD64
        _mm_prefetch((const char*)(from + 128), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 144), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 160), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 176), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 192), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 208), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 224), _MM_HINT_T0);
        _mm_prefetch((const char*)(from + 240), _MM_HINT_T0);
#endif
        _mm_store_si128((__m128i*)(to), _mm_load_si128((const __m128i*)(from)));
        _mm_store_si128((__m128i*)(to + 16), _mm_load_si128((const __m128i*)(from + 16)));
        _mm_store_si128((__m128i*)(to + 32), _mm_load_si128((const __m128i*)(from + 32)));
        _mm_store_si128((__m128i*)(to + 48), _mm_load_si128((const __m128i*)(from + 48)));
        _mm_store_si128((__m128i*)(to + 64), _mm_load_si128((const __m128i*)(from + 64)));
        _mm_store_si128((__m128i*)(to + 80), _mm_load_si128((const __m128i*)(from + 80)));
        _mm_store_si128((__m128i*)(to + 96), _mm_load_si128((const __m128i*)(from + 96)));
        _mm_store_si128((__m128i*)(to + 112), _mm_load_si128((const __m128i*)(from + 112)));
#ifdef _M_AMD64
        _mm_store_si128((__m128i*)(to + 128), _mm_load_si128((const __m128i*)(from + 128)));
        _mm_store_si128((__m128i*)(to + 144), _mm_load_si128((const __m128i*)(from + 144)));
        _mm_store_si128((__m128i*)(to + 160), _mm_load_si128((const __m128i*)(from + 160)));
        _mm_store_si128((__m128i*)(to + 176), _mm_load_si128((const __m128i*)(from + 176)));
        _mm_store_si128((__m128i*)(to + 192), _mm_load_si128((const __m128i*)(from + 192)));
        _mm_store_si128((__m128i*)(to + 208), _mm_load_si128((const __m128i*)(from + 208)));
        _mm_store_si128((__m128i*)(to + 224), _mm_load_si128((const __m128i*)(from + 224)));
        _mm_store_si128((__m128i*)(to + 240), _mm_load_si128((const __m128i*)(from + 240)));
#endif
        counter++;
    }
    while(counter < loop);
}
Here's how I call it:
byte* arr1 = (byte*)_aligned_malloc(100 * 256, 16);
byte* arr2 = (byte*)_aligned_malloc(100 * 256, 16);
SIMD_Copy(arr2, arr1, 100 * 256);
_aligned_free(arr1);
_aligned_free(arr2);
The arrays are a multiple of 256 in size, because on x64 the function copies at least 256 bytes per iteration, so I kept things simple. As soon as execution reaches the first _aligned_free call, I get:
Unhandled exception at 0x77775C0C (ntdll.dll) in MyProgram.exe: 0xC0000374: A heap has been corrupted (parameters: 0x777A6478).
When I click Continue, it is followed by:
Exception thrown at 0x776DEE01 (ntdll.dll) in MyProgram.exe: 0xC0000005: Access violation reading location 0x00000000.
Adding:
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF | _CRTDBG_CHECK_ALWAYS_DF | _CRTDBG_CHECK_CRT_DF | _CRTDBG_DELAY_FREE_MEM_DF | _CRTDBG_CHECK_EVERY_16_DF);
at the start of main doesn't seem to help. Is there any other way to find out what's going on?
Answer (score: 3)
from += step;
to += step;

happen before the pointers are used, so every copy starts step bytes into the arrays, and the last iteration reads and writes step bytes past the end.
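One way to fix it is to advance the pointers only after the current block has been copied; a minimal sketch of the corrected loop structure, abbreviated to a single 16-byte copy per iteration:

do
{
    _mm_store_si128((__m128i*)(to), _mm_load_si128((const __m128i*)(from)));
    // ... remaining prefetches and copies for this step-byte block ...
    from += step; // advance only after the block has been copied
    to += step;
    counter++;
}
while(counter < loop);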
One performance problem I see right away: you only need to prefetch once per cache line (64B), not once per 16B. Intel IvB has a performance flaw with prefetch: the instructions can't retire quickly, so you can actually bottleneck on the prefetch instructions themselves. On other CPUs you're just wasting instruction throughput.
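A sketch of what that looks like here, assuming the usual 64-byte cache lines: two prefetches cover the 128-byte x86 block, four cover the 256-byte x64 block:

// one prefetch per 64-byte cache line instead of one per 16-byte vector
_mm_prefetch((const char*)(from), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 64), _MM_HINT_T0);
#ifdef _M_AMD64
_mm_prefetch((const char*)(from + 128), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 192), _MM_HINT_T0);
#endif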
Also, what's the point of using more store(load()) pairs on AMD64? Are you expecting the compiler to order the loads ahead of the stores? If so, you should store the load results in variables. I'd expect this code to compile to a sequence of movdqa xmm0, [src] / movdqa [dest], xmm0, reusing the same register. (That's fine thanks to register renaming, but it doesn't separate the loads from the stores.)
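If the goal really is to get the loads ordered ahead of the stores, a sketch of that, shown for the first four vectors:

// load into named variables first, then store: the loads can now be
// emitted ahead of the stores instead of reusing a single register
const __m128i v0 = _mm_load_si128((const __m128i*)(from));
const __m128i v1 = _mm_load_si128((const __m128i*)(from + 16));
const __m128i v2 = _mm_load_si128((const __m128i*)(from + 32));
const __m128i v3 = _mm_load_si128((const __m128i*)(from + 48));
_mm_store_si128((__m128i*)(to), v0);
_mm_store_si128((__m128i*)(to + 16), v1);
_mm_store_si128((__m128i*)(to + 32), v2);
_mm_store_si128((__m128i*)(to + 48), v3);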
Also, rep movsd performs very well for large 16B-aligned copies. A somewhat hidden advantage of rep movsd is that it lets the front-end run ahead, with many future instructions queued up for the out-of-order core. A copy loop keeps the front-end busy with itself. It also doesn't need perfect branch prediction to avoid a mispredict at the end of the copy loop.
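For reference, on MSVC a rep movsd copy can be expressed with the __movsd intrinsic from <intrin.h>; a minimal sketch, assuming count is a multiple of 4 (the REP_Copy name is mine, not from the original code):

#include <intrin.h>

static void REP_Copy(void* destination, const void* source, const unsigned int count)
{
    // emits rep movsd: copies count / 4 doublewords
    __movsd(static_cast<unsigned long*>(destination),
            static_cast<const unsigned long*>(source),
            count / 4);
}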