我正在查看GCC-4.8为x86_64生成的代码,并想知道是否有更好(更快)的方法来计算最少三个值。
以下是Python的集合模块的摘录,该模块计算m
,rightindex+1
和leftindex
的最小值:
ssize_t m = n;
if (m > rightindex + 1)
m = rightindex + 1;
if (m > leftindex)
m = leftindex;
GCC使用CMOV生成序列相关的代码:
leaq 1(%rbp), %rdx
cmpq %rsi, %rdx
cmovg %rsi, %rdx
cmpq %rbx, %rdx
cmovg %rbx, %rdx
是否有更快的代码可以通过删除数据依赖性来利用处理器无序并行执行?我想知道是否存在用于计算多个值的最小值而不使用条件或谓词指令的已知技巧。我也想知道是否有一些饱和算术内在函数可以帮助解决这种情况。
编辑:
答案 0 :(得分:9)
以下是建议方法的基准测试结果。处理器是Intel Core-i7 2600K,运行频率为4GHz。每个循环的单位是纳秒(越小越好)。测量包括伪随机测试数据生成和函数调用开销,以及测试中的实际min3代码。
gcc cmov conditional classical
sequence branches branchless
pseudo-random data 2.24 6.31 2.39
fixed data 2.24 2.99 2.39
functions.s:评估的3个解决方案:
//----------------------------------------------------------------------------
.text
.intel_syntax noprefix
//-----------------------------------------------------------------------------
// uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_a
min3_a:
mov rax, rcx
cmp rax, rdx
cmovg rax, rdx
cmp rax, r8
cmovg rax, r8
retq
//-----------------------------------------------------------------------------
// uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_b
min3_b:
mov rax, rcx
cmp rax, rdx
jle skip1
mov rax, rdx
skip1:
cmp rax, r8
jle skip2
mov rax, r8
skip2:
retq
//-----------------------------------------------------------------------------
// uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_c
min3_c:
sub rdx, rcx
sbb rax, rax
and rax, rdx
add rcx, rax
sub r8, rcx
sbb rax, rax
and rax, r8
add rax, rcx
retq
//-----------------------------------------------------------------------------
min3test.c,主程序(mingw64 / windows):
#define __USE_MINGW_ANSI_STDIO 1
#include <windows.h>
#include <stdio.h>
#include <stdint.h>
uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8);
uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8);
uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8);
//-----------------------------------------------------------------------------
//
// queryPerformanceCounter - similar to QueryPerformanceCounter, but returns
// count directly.
uint64_t queryPerformanceCounter (void)
{
LARGE_INTEGER int64;
QueryPerformanceCounter (&int64);
return int64.QuadPart;
}
//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count direcly.
uint64_t queryPerformanceFrequency (void)
{
LARGE_INTEGER int64;
QueryPerformanceFrequency (&int64);
return int64.QuadPart;
}
//----------------------------------------------------------------------------
//
// lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation
//
static uint64_t lfsr64gpr (uint64_t data, uint64_t mask)
{
uint64_t carryOut = data >> 63;
uint64_t maskOrZ = -carryOut;
return (data << 1) ^ (maskOrZ & mask);
}
//---------------------------------------------------------------------------
uint64_t min3 (uint64_t a, uint64_t b, uint64_t c)
{
uint64_t smallest;
smallest = a;
if (smallest > b) smallest = b;
if (smallest > c) smallest = c;
return smallest;
}
//---------------------------------------------------------------------------
static void runtests (uint64_t pattern, uint64_t mask)
{
uint64_t startCount, elapsed, index, loops = 800000000;
double ns;
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_a (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns\n", ns);
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_b (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns\n", ns);
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_c (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns\n", ns);
}
//---------------------------------------------------------------------------
int main (void)
{
uint64_t index, pattern, mask;
uint64_t count_a = 0, count_b = 0, count_c = 0;
mask = 0xBEFFFFFFFFFFFFFF;
pattern = 1;
// sanity check the asm functions
for (index = 0; index < 1000000; index++)
{
uint64_t expected, result_a, result_b, result_c;
uint64_t pattern1 = (pattern >> 0) & 0xFFFFF;
uint64_t pattern2 = (pattern >> 20) & 0xFFFFF;
uint64_t pattern3 = (pattern >> 40) & 0xFFFFF;
pattern = lfsr64gpr (pattern, mask);
expected = min3 (pattern1, pattern2, pattern3);
result_a = min3_a (pattern1, pattern2, pattern3);
result_b = min3_b (pattern1, pattern2, pattern3);
result_c = min3_c (pattern1, pattern2, pattern3);
if (result_a != expected) printf ("min3_a fail, %llu %llu %llu %llu %llu\n", expected, result_a, pattern1, pattern2, pattern3);
if (result_b != expected) printf ("min3_b fail, %llu %llu %llu %llu %llu\n", expected, result_b, pattern1, pattern2, pattern3);
if (result_c != expected) printf ("min3_c fail, %llu %llu %llu %llu %llu\n", expected, result_c, pattern1, pattern2, pattern3);
if (expected == pattern1) count_a++;
if (result_b == pattern2) count_b++;
if (result_c == pattern3) count_c++;
}
printf ("pseudo-random distribution: %llu, %llu, %llu\n", count_a, count_b, count_c);
// raise our priority to increase measurement accuracy
SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);
printf ("using pseudo-random data\n");
runtests (1, mask);
printf ("using fixed data\n");
runtests (0, mask);
return 0;
}
//---------------------------------------------------------------------------
构建命令行:
gcc -Wall -Wextra -O3 -omin3test.exe min3test.c functions.s
答案 1 :(得分:6)
至少有两个无符号数具有经典解:
; eax = min(eax, ebx), ecx - scratch register.
.min2:
sub ebx, eax
sbb ecx, ecx
and ecx, ebx
add eax, ecx
这种方法可能比使用cmov的解决方案更快,但是为了更高的速度,指令必须由其他并行执行的指令分开。
可以对三个数字实施此方法:
; eax = min(eax, ebx, edx), ecx - scratch register.
.min3:
sub ebx, eax
sbb ecx, ecx
and ecx, ebx
add eax, ecx
sub edx, eax
sbb ecx, ecx
and ecx, edx
add eax, ecx
另一种尝试是使用条件跳转来测试变体。对于现代处理器,它可能更快,特别是如果跳跃是高度可预测的:
.min3:
cmp eax, ebx
jle @f
mov eax, ebx
@@:
cmp eax, edx
jle @f
mov eax, edx
@@:
答案 2 :(得分:1)
函数min(x,y,z)
是连续的,但其衍生物不是。该衍生物在其定义的任何地方都有规范1。没有办法将其表达为算术函数。
饱和算术有其自身的不连续性,因此在这种情况下不能使用先前的推理。但是,饱和点与输入无关。这反过来意味着你需要扩展输入,此时我相信结果代码不会更快。
这当然不能完全证明不存在更快的代码,但你可能需要对其进行详尽的搜索。