与C相比,ARM汇编代码花费更多时间

时间:2016-02-16 06:39:51

标签: assembly timing neon armv7

我已经为下面的C函数编写了ARMv7汇编代码。但是与C代码相比,我的汇编代码反而需要更多时间。请问有人能告诉我原因吗?

/*
 * Return the largest absolute sample value found in a frame.
 *
 * inp_frame  - pointer to the first 16-bit sample of the frame.
 * frame_size - number of samples to examine.
 *
 * Each sample is promoted to int before abs() is applied, so a sample of
 * -32768 yields 32768.  When frame_size is zero or negative no sample is
 * read and the initial value -1000 is returned.
 */
int get_maximum_sample_value (short int *inp_frame, int frame_size) {
    const short int *cursor = inp_frame; // Next sample to inspect.
    int peak = -1000;                    // Largest magnitude seen so far.

    // Count down the remaining samples; the body is skipped for sizes <= 0.
    for (int left = frame_size; left > 0; --left) {
        const int magnitude = abs(*cursor);

        peak = (peak < magnitude) ? magnitude : peak;
        ++cursor;
    }

    return peak;
}

ASM:

.cfi_startproc

// NEON version of get_maximum_sample_value(short int *inp_frame, int frame_size).
//
// In:   r0 = inp_frame (pointer to 16-bit samples), r1 = frame_size
// Out:  r0 = largest abs(sample), or -1000 when frame_size <= 0
//
// Groups of 4 samples are handled with NEON; the remaining frame_size % 4
// samples are handled by a scalar loop so every sample is examined.
// Only r2, r3 and q0-q2 are used as scratch; these are caller-saved,
// so nothing has to be pushed on the stack.

ldr r2,LC_P1000 // r2 = running maximum, LC_P1000 = -1000
cmp   r1, #0
ble   LP_VD_END // no samples (or negative count): return -1000

vdup.s32 q2,r2 // four parallel running maxima, all starting at -1000
lsrs r3,r1,#2 // r3 = number of complete groups of 4 samples
beq  LP_VD_TAIL_CHECK // fewer than 4 samples: scalar loop only

LP_VD1:

vldm r0,{d0} // load 4 x int16
add r0,#8 // advance past the 4 samples just loaded
vmovl.s16 q1,d0 // widen to 4 x int32 so abs(-32768) is representable

vabs.s32 q1,q1
subs r3,  r3, #1
vmax.s32 q2,q1,q2
bne LP_VD1
vmax.s32 d4,d5,d4 // fold the 4 lanes of q2 down to 2 lanes in d4

vmov r2,s8
vmov r3,s9
cmp r2, r3
it  lt
movlt   r2, r3 // r2 = maximum over the vectorised part

LP_VD_TAIL_CHECK:
ands r1, r1, #3 // r1 = leftover sample count (frame_size % 4)
beq LP_VD_END

LP_VD_TAIL:
ldrsh r3, [r0], #2 // sign-extended sample, then advance the pointer
cmp r3, #0
it  lt
rsblt   r3, r3, #0 // r3 = abs(sample)
cmp r2, r3
it  lt
movlt   r2, r3
subs r1, r1, #1
bne LP_VD_TAIL

LP_VD_END:
mov r0, r2 // the result is returned in r0
bx lr
.cfi_endproc

1 个答案:

答案 0 :(得分:3)

在没有看到编译器输出、也不知道编译器是否做了自动向量化等优化的情况下,很难说清手写汇编为什么比C更慢。但是很容易说出这段汇编代码为什么(非常)慢:

  • NEON SIMD指令具有长延迟和高吞吐量。由于只使用了1个maximum_value向量,您把原本可以并行的问题串行化了。每条向量操作都依赖于前一条指令的结果,迫使它们在执行前等待完整的约4个周期的延迟。在SIMD执行流水线为顺序执行的内核上,这个问题更加严重(除了最新的“大”Cortex-A内核A9、A15、A57、A72以及Apple的一些内核之外,其余内核都属于这种情况)。
  • 如果输入数组很大且不在缓存中,代码的速度就会受限于等待内存操作完成。一些ARM处理器带有硬件L2预取器,但即使在这些处理器上,用软件预取内存也能使循环提速数倍。

用NEON内在函数编写的快速实现可能如下所示:

int16_t* buf = inp_frame;

// These variables hold the absolute values during the loop.
// Must use 32-bit values because abs(INT16_MIN) doesn't fit in 16-bit signed int.
int32x4_t max0 = vmovq_n_s32(INT16_MIN);
int32x4_t max1 = vmovq_n_s32(INT16_MIN);
int32x4_t max2 = vmovq_n_s32(INT16_MIN);
int32x4_t max3 = vmovq_n_s32(INT16_MIN);
int32x4_t max4 = vmovq_n_s32(INT16_MIN);
int32x4_t max5 = vmovq_n_s32(INT16_MIN);
int32x4_t max6 = vmovq_n_s32(INT16_MIN);
int32x4_t max7 = vmovq_n_s32(INT16_MIN);

// Process 32 values = 64 bytes per iteration.
for(int i = frame_size / 32; i != 0; i--)
{
    // Prefetch data 8 64-byte cache lines ahead (or 16, optimal distance depends on hw).
    __prefetch(8 * 64 + ((int8_t*)buf)); // whatever intrinsic your compiler has

    int16x8_t val0 = vld1q_s16(buf);
    int16x8_t val1 = vld1q_s16(buf + 8);
    int16x8_t val2 = vld1q_s16(buf + 16);
    int16x8_t val3 = vld1q_s16(buf + 24);
    buf += 32;

    // Widen the values before taking abs.
    int32x4_t vall0 = vmovl_s16(vget_low_s16(val0));
    int32x4_t vall1 = vmovl_s16(vget_high_s16(val0));
    int32x4_t vall2 = vmovl_s16(vget_low_s16(val1));
    int32x4_t vall3 = vmovl_s16(vget_high_s16(val1));
    int32x4_t vall4 = vmovl_s16(vget_low_s16(val2));
    int32x4_t vall5 = vmovl_s16(vget_high_s16(val2));
    int32x4_t vall6 = vmovl_s16(vget_low_s16(val3));
    int32x4_t vall7 = vmovl_s16(vget_high_s16(val3));

    int32x4_t abs_vall0 = vabsq_s32(vall0);
    int32x4_t abs_vall1 = vabsq_s32(vall1);
    int32x4_t abs_vall2 = vabsq_s32(vall2);
    int32x4_t abs_vall3 = vabsq_s32(vall3);
    int32x4_t abs_vall4 = vabsq_s32(vall4);
    int32x4_t abs_vall5 = vabsq_s32(vall5);
    int32x4_t abs_vall6 = vabsq_s32(vall6);
    int32x4_t abs_vall7 = vabsq_s32(vall7);

    max0 = vmaxq_s32(max0, abs_vall0);
    max1 = vmaxq_s32(max1, abs_vall1);
    max2 = vmaxq_s32(max2, abs_vall2);
    max3 = vmaxq_s32(max3, abs_vall3);
    max4 = vmaxq_s32(max4, abs_vall4);
    max5 = vmaxq_s32(max5, abs_vall5);
    max6 = vmaxq_s32(max6, abs_vall6);
    max7 = vmaxq_s32(max7, abs_vall7);
}

// Reduce the maximum value to a single one.
int32x4_t max01 = vmaxq_s32(max0, max1);
int32x4_t max23 = vmaxq_s32(max2, max3);
int32x4_t max45 = vmaxq_s32(max4, max5);
int32x4_t max67 = vmaxq_s32(max6, max7);

int32x4_t max0123 = vmaxq_s32(max01, max23);
int32x4_t max4567 = vmaxq_s32(max45, max67);
int32x4_t qmax = vmaxq_s32(max0123, max4567);

// Horizontal max inside q-register.
int32x2_t dmax = vmax_s32(vget_low_s32(qmax), vget_high_s32(qmax));
int32_t max_value = vget_lane_s32(vpmax_s32(dmax, dmax), 0);

// TODO process any remaining items here

这种交错产生了大量的指令级并行性,使内核能够在每个周期都执行指令,而不是因数据依赖而停顿。即使是最快的Cortex-A72(每个时钟周期可以执行2条这类延迟为3个周期的NEON ALU指令),8路交错/展开也足以让它保持满负荷运转。请注意,这段代码用到了全部16个可用的架构q寄存器,因此您可能需要检查编译器是否把其中某些寄存器溢出到了堆栈上(并非所有编译器都能很好地处理这种情况)。