我为下面的 C 函数编写了 ARMv7 汇编代码,但与 C 代码相比,我的汇编代码运行时间反而更长。有人能告诉我原因吗?
/*
 * Scan the first frame_size samples of inp_frame and return the largest
 * absolute sample value seen.
 *
 * The running maximum starts at -1000, so that value is returned unchanged
 * when frame_size is zero or negative (no sample is read in that case).
 * Samples are promoted to int before abs(), so a sample of -32768 yields
 * 32768.
 */
int get_maximum_sample_value (short int *inp_frame, int frame_size) {
    const short int *cursor = inp_frame; /* Next sample to inspect. */
    int peak = -1000;                    /* Largest magnitude so far. */
    int remaining = frame_size;          /* Samples still to visit. */

    while (remaining > 0) {
        int magnitude = abs(*cursor++);

        peak = (magnitude > peak) ? magnitude : peak;
        --remaining;
    }

    return peak;
}
ASM:
// NEON implementation of the maximum-absolute-sample search.
//
// In:  r0 = pointer to signed 16-bit samples, r1 = number of samples.
// Out: r0 = largest absolute sample value, or the value loaded from
//      LC_P1000 (stated as -1000; the literal is defined outside this
//      fragment) when r1 <= 0.
//
// Whole groups of four samples go through the vector loop; the 0..3
// leftover samples are handled by a scalar tail loop. Every exit path now
// writes r0, where previously a frame of fewer than four samples returned
// with r0 still holding the input pointer.
.cfi_startproc
push {r4}
ldr r4,LC_P1000 // LC_P1000 = -1000
vdup.s32 q2,r4 // q2 = four lanes of running maximum
cmp r1, #0
it le
movle r1, #0 // treat a negative count as an empty frame
lsrs r4,r1,#2 // r4 = number of whole 4-sample groups
beq LP_VD_REDUCE // no whole group: still reduce q2 so r3 gets the initial value
LP_VD1:
vldm r0,{d0} // load four 16-bit samples
add r0,#8
vmovl.s16 q1,d0 // widen to 32 bit so abs(-32768) is representable
vabs.s32 q1,q1
subs r4, r4, #1
vmax.s32 q2,q1,q2
bne LP_VD1
LP_VD_REDUCE:
vmax.s32 d4,d5,d4 // fold the four lanes into the two lanes of d4
vmov r3,s8
vmov r2,s9
cmp r3, r2
it lt
movlt r3, r2 // r3 = maximum of the vector-processed samples
ands r1, r1, #3 // r1 = leftover samples (frame_size % 4)
beq LP_VD_END
LP_VD_TAIL:
ldrsh r2,[r0],#2 // next sample, sign-extended; advance the pointer
cmp r2, #0
it lt
rsblt r2, r2, #0 // r2 = abs(sample)
cmp r3, r2
it lt
movlt r3, r2
subs r1, r1, #1
bne LP_VD_TAIL
LP_VD_END:
mov r0, r3 // return value
pop {r4}
bx lr
.cfi_endproc
答案 0 :(得分:3)
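在没有看到编译器输出、也不知道编译器是否做了自动向量化等情况下,很难说清手写汇编为什么比 C 更慢。但很容易说出这段汇编代码为什么(非常)慢:循环每次迭代只处理 4 个样本,没有任何预取,而且循环体中的每条 NEON 指令都依赖上一条指令的结果(vldm → vmovl → vabs → vmax),每次迭代的 vmax 又依赖上一次迭代写入的 q2,因此整个循环是一条串行依赖链,内核大部分时间都在等待前一条指令完成。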
用NEON内在函数编写的快速实现可能如下所示:
int16_t* buf = inp_frame;
// These variables hold the absolute values during the loop.
// Must use 32-bit values because abs(INT16_MIN) doesn't fit in 16-bit signed int.
int32x4_t max0 = vmovq_n_s32(INT16_MIN);
int32x4_t max1 = vmovq_n_s32(INT16_MIN);
int32x4_t max2 = vmovq_n_s32(INT16_MIN);
int32x4_t max3 = vmovq_n_s32(INT16_MIN);
int32x4_t max4 = vmovq_n_s32(INT16_MIN);
int32x4_t max5 = vmovq_n_s32(INT16_MIN);
int32x4_t max6 = vmovq_n_s32(INT16_MIN);
int32x4_t max7 = vmovq_n_s32(INT16_MIN);
// Process 32 values = 64 bytes per iteration.
for(int i = frame_size / 32; i != 0; i--)
{
// Prefetch data 8 64-byte cache lines ahead (or 16, optimal distance depends on hw).
__prefetch(8 * 64 + ((int8_t*)buf)); // whatever intrinsic your compiler has
int16x8_t val0 = vld1q_s16(buf);
int16x8_t val1 = vld1q_s16(buf + 8);
int16x8_t val2 = vld1q_s16(buf + 16);
int16x8_t val3 = vld1q_s16(buf + 24);
buf += 32;
// Widen the values before taking abs.
int32x4_t vall0 = vmovl_s16(vget_low_s16(val0));
int32x4_t vall1 = vmovl_s16(vget_high_s16(val0));
int32x4_t vall2 = vmovl_s16(vget_low_s16(val1));
int32x4_t vall3 = vmovl_s16(vget_high_s16(val1));
int32x4_t vall4 = vmovl_s16(vget_low_s16(val2));
int32x4_t vall5 = vmovl_s16(vget_high_s16(val2));
int32x4_t vall6 = vmovl_s16(vget_low_s16(val3));
int32x4_t vall7 = vmovl_s16(vget_high_s16(val3));
int32x4_t abs_vall0 = vabsq_s32(vall0);
int32x4_t abs_vall1 = vabsq_s32(vall1);
int32x4_t abs_vall2 = vabsq_s32(vall2);
int32x4_t abs_vall3 = vabsq_s32(vall3);
int32x4_t abs_vall4 = vabsq_s32(vall4);
int32x4_t abs_vall5 = vabsq_s32(vall5);
int32x4_t abs_vall6 = vabsq_s32(vall6);
int32x4_t abs_vall7 = vabsq_s32(vall7);
max0 = vmaxq_s32(max0, abs_vall0);
max1 = vmaxq_s32(max1, abs_vall1);
max2 = vmaxq_s32(max2, abs_vall2);
max3 = vmaxq_s32(max3, abs_vall3);
max4 = vmaxq_s32(max4, abs_vall4);
max5 = vmaxq_s32(max5, abs_vall5);
max6 = vmaxq_s32(max6, abs_vall6);
max7 = vmaxq_s32(max7, abs_vall7);
}
// Reduce the maximum value to a single one.
int32x4_t max01 = vmaxq_s32(max0, max1);
int32x4_t max23 = vmaxq_s32(max2, max3);
int32x4_t max45 = vmaxq_s32(max4, max5);
int32x4_t max67 = vmaxq_s32(max6, max7);
int32x4_t max0123 = vmaxq_s32(max01, max23);
int32x4_t max4567 = vmaxq_s32(max45, max67);
int32x4_t qmax = vmaxq_s32(max0123, max4567);
// Horizontal max inside q-register.
int32x2_t dmax = vmax_s32(vget_low_s32(qmax), vget_high_s32(qmax));
int32_t max_value = vget_lane_s32(vpmax_s32(dmax, dmax), 0);
// TODO process any remaining items here
这种交错产生了大量的指令级并行性,使内核每个周期都能执行指令,而不会因数据依赖而停顿。8 路交错/展开足以让最快的 Cortex-A72 保持满负荷运转——它每个时钟周期可以执行 2 条这类延迟为 3 个周期的 NEON ALU 指令。请注意,这段代码用到了全部 16 个架构 q 寄存器,因此您可能需要检查编译器是否把其中某些寄存器溢出到了栈上(并非所有编译器都能很好地处理这种情况)。