我正在 Raspberry Pi 2 上使用 gcc 内联汇编运行以下两个大数(bignum)乘法例程(我认为)。这两个例程各自都应该是教科书式(schoolbook)乘法的内层循环。
第一个例程基本上是
/*
 * Simple inner loop: for each word, result = low(arg * m + result + carry).
 *   %0 = pointer to the multiplicand words (post-incremented)
 *   %1 = number of words to process
 *   %2 = the single multiplier word
 *   %3 = pointer to the result words (read, then written and post-incremented)
 *   r4 = running carry word
 * NOTE(review): r4 is assumed to be zeroed before the loop is entered; that
 * setup is not part of this excerpt -- confirm.
 */
loop:
ldr %r6, [%3] /* load the current result word */
ldr %r5, [%0], #4 /* load the next multiplicand word, advance %0 */
umaal %r6, %r4, %r5, %2 /* r4:r6 = r5 * %2 + r6 + r4; r4 carries into the next word */
str %r6, [%3], #4 /* store the low product word (r6, not the multiplicand r5) */
subs %1, #1
bne loop
第二个例程是软件流水线实现。下面是内联汇编代码,因此带有一些引号和换行符。
/*
 * Software-pipelined inner loop:
 *   for each word, result = low(arg * m + result + carry).
 *   %0  = pointer to the multiplicand words (read, post-incremented)
 *   %1  = number of words still to process
 *   %2  = the single multiplier word
 *   %3  = pointer to the result words (write cursor, post-incremented)
 *   r11 = read cursor into the result words (starts as a copy of %3)
 *   r4  = running carry word (high half left behind by each umaal)
 * NOTE(review): the operand constraints and clobber list are outside this
 * excerpt. %0, %1 and %3 are modified and r4-r11 plus the flags are
 * overwritten here, so confirm they are declared accordingly.
 */
"mov r4, #0\n" /* carry = 0 */
"mov r11, %3\n" /* separate read cursor so loads can run ahead of the stores */
"cmp %1, #4\n"
"ble 3f\n" /* 4 words or fewer: skip the pipeline, use the cleanup loop only */
/* Prologue: load words k and k+1, multiply word k */
"ldr r5, [r11], #4\n"
"ldr r6, [%0], #4\n" /* load k */
"ldr r7, [r11], #4\n"
"ldr r8, [%0], #4\n" /* load k+1 */
"umaal r5, r4, r6, %2\n" /* mult k */
/*
 * Pipelined loop, three words per pass. Invariant at label 2: r5 holds the
 * finished (multiplied, not yet stored) word k, and r7/r8 hold word k+1
 * loaded but not yet multiplied.
 */
"2:\n"
"ldr r9, [r11], #4\n"
"ldr r10, [%0], #4\n" /* load k+2 */
"umaal r7, r4, r8, %2\n" /* mult k+1 */
"str r5, [%3], #4\n" /* store k */
"ldr r5, [r11], #4\n"
"ldr r6, [%0], #4\n" /* load k+3 */
"umaal r9, r4, r10, %2\n" /* mult k+2 */
"str r7, [%3], #4\n" /* store k+1 */
"ldr r7, [r11], #4\n"
"ldr r8, [%0], #4\n" /* load k+4 */
"umaal r5, r4, r6, %2\n" /* mult k+3 */
"str r9, [%3], #4\n" /* store k+2 */
"sub %1, #3\n" /* three words were stored this pass */
"cmp %1, #4\n"
"bgt 2b\n" /* another pass loads three more words, so it needs count > 4 */
/* Epilogue: drain the two words still in flight */
"sub %1, #2\n" /* account for the two words finished below */
"umaal r7, r4, r8, %2\n" /* mult k+4 */
"str r5, [%3], #4\n" /* store k+3 */
"str r7, [%3], #4\n" /* store k+4 */
/* Cleanup loop: one word per pass (0 to 2 words remain after the pipeline) */
"3:\n"
"cmp %1, #0\n"
"beq 5f\n"
"4:\n"
"ldr r5, [r11], #4\n"
"ldr r6, [%0], #4\n"
"umaal r5, r4, r6, %2\n"
"str r5, [%3], #4\n"
"subs %1, #1\n"
"bne 4b\n"
"5:\n"
"str r4, [%3]\n" /* write the final carry into the next result word */
第二个例程明显比第一个慢,大约慢 20%,我不明白为什么:软件流水线本应让处理器流水线隐藏加载和乘法的延迟。是否存在我没有注意到的流水线冒险(hazard)?