在Cortex-A7上,软件流水线循环比朴素循环更慢

时间:2016-08-11 05:19:56

标签: optimization arm inline-assembly

我正在Raspberry Pi 2上使用gcc内联汇编实现以下两个大数(bignum)乘法例程(我认为)。它们各自都应该是竖式乘法(schoolbook multiplication)的内层循环。

第一个例程基本上是

@ Naive inner loop: one multiply-accumulate step per result word.
@ Operand roles, taken from the original comments: %3 points at the result
@ words, %0 at "the other arg", %2 is the multiplier word, %1 is the
@ remaining word count. r4 carries the high word between iterations and is
@ expected to be initialised by the surrounding code (not shown here).
1:
ldr r6, [%3]            @ r6 = current result word
ldr r5, [%0], #4        @ r5 = next word of the other argument; advance %0
umaal r6, r4, r5, %2    @ r4:r6 = r5 * %2 + r6 + r4 (multiply with carry)
str r6, [%3], #4        @ store the low product word (r6, not the input r5); advance %3
subs %1, %1, #1         @ one word done; Z is set when the count reaches zero
bne 1b

第二个例程是软件流水线实现。下面是内联汇编代码,因此带有一些引号和换行符。

/*
 * Software-pipelined variant of the multiply-accumulate loop.
 * The operand list is not visible here; from usage, %0 is presumably the
 * source pointer, %1 the remaining word count, %2 the multiplier word and
 * %3 the result pointer -- TODO confirm against the asm operand constraints.
 * r4 is the running high word (carry) of umaal; r11 is a read pointer into
 * the result array that runs ahead of the write pointer %3.
 * r4-r11 are used as fixed scratch registers.
 */
"mov     r4, #0\n"
"mov     r11, %3\n"
"cmp     %1, #4\n"
"ble     3f\n"

/* Prologue: load words k and k+1, start the multiply for word k */
"ldr     r5, [r11], #4\n"
"ldr     r6, [%0], #4\n" /* load k */
"ldr     r7, [r11], #4\n"
"ldr     r8, [%0], #4\n" /* load k+1 */
"umaal   r5, r4, r6, %2\n" /* multiply k */
/* Pipelined loop: each pass loads three words, multiplies three, stores three */
"2:\n"
"ldr     r9, [r11], #4\n" 
"ldr     r10, [%0], #4\n" /* load k+2 */
"umaal   r7, r4, r8, %2\n" /* multiply k+1 */
"str     r5, [%3], #4\n"  /* store k */
"ldr     r5, [r11], #4\n"
"ldr     r6, [%0], #4\n" /* load k+3 */
"umaal   r9, r4, r10, %2\n" /* multiply k+2 */
"str     r7, [%3], #4\n" /* store k+1 */
"ldr     r7, [r11], #4\n"
"ldr     r8, [%0], #4\n" /* load k+4 */
"umaal   r5, r4, r6, %2\n" /* multiply k+3 */
"str     r9, [%3], #4\n" /* store k+2 */
"sub     %1, #3\n"
"cmp     %1, #4\n"
"bgt     2b\n"
/* Epilogue: drain the two words still in flight (k+3 and k+4) */
"sub     %1, #2\n" /* account for the two words finished below */
"umaal   r7, r4, r8, %2\n" /* multiply k+4 */
"str     r5, [%3], #4\n" /* store k+3 */
"str     r7, [%3], #4\n" /* store k+4 */


/* Cleanup loop: handle the remaining words one at a time */
"3:\n"
"cmp     %1, 0\n"
"beq     5f\n"
"4:\n"
"ldr     r5, [r11], #4\n"
"ldr     r6, [%0], #4\n"
"umaal   r5, r4, r6, %2\n"
"str     r5, [%3], #4\n"
"subs    %1, #1\n"
"bne     4b\n"

/* Done: write the final high word (carry) after the last result word */
"5:\n"
"str     r4, [%3]\n"

第二个例程明显比第一个慢,大约慢20%,我不明白为什么:软件流水线应该能让处理器流水线隐藏加载和乘法的延迟。是否存在我没有注意到的流水线冒险(hazard)?

0 个答案:

没有答案