我正在尝试使用ARM内在函数实现算法。
在算法的某一步中,需要对有符号整数进行右移,但负值需要向零舍入(即结果的绝对值更小,而不是向下取整)。例如,右移 1 位时,-8 应得到 -4,而 -1 应得到 0。
换句话说,我想要一些将负值舍入为零而不是向下舍入的东西:
/*
 * Shift `number` right by `shift` bits, rounding toward zero
 * (e.g. -8 >> 1 == -4, -1 >> 1 == 0).
 *
 * `shift` must be smaller than the width of int.
 *
 * The magnitude is computed in unsigned arithmetic so that INT_MIN, whose
 * absolute value is not representable as an int, is handled without
 * undefined behaviour.
 */
int rightshift(int number, unsigned int shift)
{
    if (number >= 0)
        return number >> shift;

    /* 0u - (unsigned)number is |number| modulo 2^N; well defined for INT_MIN. */
    unsigned int magnitude = (0u - (unsigned int)number) >> shift;
    if (magnitude == 0u)
        return 0;

    /* Negate as -(m - 1) - 1 so the intermediate value always fits in int. */
    return -(int)(magnitude - 1u) - 1;
}
我找不到合适的 SIMD 内在函数来做这件事。有没有办法用一次函数调用完成这个操作,或者有什么技巧可以使用?
答案 0 :(得分:1)
我认为没有哪一条指令能在移位的同时实现向零舍入。
但是,你可以通过几条移位和掩码指令相当简单地实现它。我们需要做的是:如果原数为负,并且存在"进位"(即被移出的低位中有任何一位为 1),就给结果加 1。
我可以用以下纯C代码证明这一点:
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
/*
 * Arithmetic right shift of `number` by `shift` bits, rounding toward zero.
 *
 * For a negative input, (2^shift - 1) is added first, so the subsequent
 * arithmetic shift (which rounds toward negative infinity) ends up rounding
 * toward zero instead.  Non-negative inputs are shifted unchanged.
 *
 * `shift` must be smaller than the width of int.  Relies on `>>` of a
 * negative int being an arithmetic shift (implementation-defined in C).
 */
int16_t rightshift(int number, unsigned int shift)
{
    static const size_t bits = sizeof number * CHAR_BIT;

    /* All ones when number is negative, zero otherwise. */
    const int sign_mask = number >> (bits - 1);

    /* Built in unsigned so that shift == bits - 1 does not overflow a signed int. */
    const unsigned int low_bits = (1u << shift) - 1u;

    number += (int)(low_bits & (unsigned int)sign_mask);
    return number >> shift;
}
#include <stdio.h>
/* Print a table of rightshift() results: one row per value in [-16, 16],
 * one column per shift amount 0..3. */
int main() {
    int value = -16;
    while (value <= 16) {
        printf(" %3d: ", value);
        for (int amount = 0; amount != 4; ++amount) {
            printf("%4d", rightshift(value, amount));
        }
        puts("");
        ++value;
    }
}
这会编译成相当不错的无分支(branch-free)汇编代码,看起来很适合内联(特别是当shift
是编译时常量时):
@ Compiler output for the scalar rightshift() helper: right shift that
@ rounds toward zero, with no branches.
@ Register use as read by the code below: r0 = number (also the result),
@ r1 = shift amount, r3 = scratch.
@ NOTE(review): no 16-bit sign extension is applied before returning, so this
@ listing looks like it came from an int-returning build - confirm against the C.
rightshift:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ r3 = (1 << shift) - 1: mask of the bits that the shift will discard.
movs r3, #1
lsls r3, r3, r1
subs r3, r3, #1
@ "r0, asr #31" is all ones for a negative number and zero otherwise, so the
@ mask survives only for negative inputs.
and r3, r3, r0, asr #31
@ Bias negative inputs upward so the arithmetic shift rounds toward zero.
add r0, r0, r3
asrs r0, r0, r1
bx lr
为了面向 NEON,我又写了一个函数,让它处理多个数据元素:
/*
 * Apply rightshift() with the same `shift` to each of the `count` elements
 * of `src`, writing the results to the corresponding positions in `dest`.
 */
void do_shift(int16_t *restrict dest, const int16_t *restrict src,
size_t count, unsigned int shift)
{
    const int16_t *const end = src + count;
    while (src != end) {
        *dest++ = rightshift(*src++, shift);
    }
}
它的测试程序:
#include <stdio.h>
/* Run do_shift() over a fixed set of inputs for every shift amount 0..15
 * and print one row per input value, one column per shift amount. */
int main() {
    /* Inputs clustered around powers of two, zero and the int16_t limits. */
    static const int16_t src[] = {
        -32768, -32767, -32766, -32765, -32764,
        -16384, -16383, -16382, -16381, -16380,
        -8193, -8192, -8191, -8190, -8189,
        -16, -15, -14, -13, -12, -10, -9,
        -8, -7, -6, -5, -4, -3, -2, -1, 0,
        1, 2, 3, 4, 5, 6, 7, 8,
        9, 10, 11, 12, 13, 14, 15, 16,
        1023, 1024, 32767,
    };
    static const size_t count = sizeof src / sizeof src[0];

    /* shifted[s][k] holds src[k] shifted right by s bits. */
    int16_t shifted[16][count];
    unsigned int amount = 0;
    do {
        do_shift(shifted[amount], src, count, amount);
    } while (++amount < 16);

    for (size_t row = 0; row != count; ++row) {
        printf("%7d: ", src[row]);
        for (int col = 0; col != 16; ++col) {
            printf("%7d", shifted[col][row]);
        }
        puts("");
    }
}
我用gcc -O3 -march=armv7 -mfpu=neon
编译了这个。我承认我不熟悉 NEON 指令,但结果可能很有启发性:
@ Compiler output for do_shift(): applies the round-toward-zero right shift
@ to an array of 16-bit values.
@ Register use on entry, as read by the code below:
@   r0 = dest, r1 = src, r2 = count, r3 = shift
@ Overall shape:
@   1. scalar prologue handling a few leading elements,
@   2. .L4/.L6: NEON loop handling 8 halfwords per iteration,
@   3. .L3: fully unrolled scalar tail for whatever is left.
@ Every scalar step is the same sequence:
@   r6 = (r5 & (x asr 31)) + x; sign-extend r6 from 16 bits; r6 = r6 asr shift.
do_shift:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ Nothing to do when count == 0.
cmp r2, #0
beq .L21
push {r4, r5, r6, r7, r8, lr}
@ r4 = (-((src >> 1) & 3)) & 7: how many leading elements the scalar
@ prologue handles before the vector loop takes over.
ubfx r4, r1, #1, #2
negs r4, r4
movs r5, #1
and r4, r4, #7
@ r5 = (1 << shift) - 1, sign-extended from 16 bits: the rounding bias that
@ is added to negative elements.
lsls r5, r5, r3
adds r7, r4, #7
subs r6, r2, #1
subs r5, r5, #1
@ If count - 1 < r4 + 7 (unsigned compare), skip the vector path and process
@ everything in the scalar code at .L3 (via .L8).
cmp r6, r7
sxth r5, r5
bcc .L8
@ r4 == 0: no leading elements, go straight to the vector setup.
cmp r4, #0
beq .L9
@ Prologue element 0. The cmp result is consumed by the beq after the store;
@ the instructions in between do not update the flags.
ldrsh r7, [r1]
cmp r4, #1
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0] @ movhi
beq .L9
@ Prologue element 1.
ldrsh r7, [r1, #2]
cmp r4, #2
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #2] @ movhi
beq .L9
@ Prologue element 2.
ldrsh r7, [r1, #4]
cmp r4, #3
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #4] @ movhi
beq .L9
@ Prologue element 3.
ldrsh r7, [r1, #6]
cmp r4, #4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #6] @ movhi
beq .L9
@ Prologue element 4.
ldrsh r7, [r1, #8]
cmp r4, #5
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #8] @ movhi
beq .L9
@ Prologue element 5, plus element 6 (conditionally executed) when r4 == 7.
@ r8 records how many elements the prologue handled: 7 if r4 == 7, else 6.
ldrsh r7, [r1, #10]
cmp r4, #7
ite eq
moveq r8, r4
movne r8, #6
and r6, r5, r7, asr #31
add r6, r6, r7
it eq
ldrsheq r7, [r1, #12]
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #10] @ movhi
itttt eq
andeq r6, r5, r7, asr #31
addeq r6, r6, r7
sxtheq r6, r6
asreq r6, r6, r3
it eq
strheq r6, [r0, #12] @ movhi
@ Vector loop setup:
@   q10 = -shift in every 32-bit lane (vshl by a negative count shifts right),
@   q12 = rounding bias r5 in every 16-bit lane,
@   lr  = count - r4 (elements remaining), ip = lr / 8 (loop iterations),
@   r6  = src + 2*r4 (read pointer), r4 = dest + 2*r4 (write pointer),
@   r7  = iteration counter.
.L4:
vdup.32 q10, r3
sub lr, r2, r4
lsls r4, r4, #1
movs r7, #0
vneg.s32 q10, q10
adds r6, r1, r4
lsr ip, lr, #3
add r4, r4, r0
vdup.16 q12, r5
@ Vector loop body: 8 halfwords (16 bytes) per iteration.
.L6:
adds r7, r7, #1
adds r6, r6, #16
@ Load 8 source halfwords into q9 (d18:d19).
vldr d18, [r6, #-16]
vldr d19, [r6, #-8]
@ Flags for the bcc at the bottom of the loop (r7 < ip, unsigned).
cmp r7, ip
@ q8 = per-lane sign mask (all ones for negative lanes), then the bias for
@ negative lanes only, then value + bias.
vshr.s16 q8, q9, #15
vand q8, q8, q12
vadd.i16 q8, q8, q9
@ Widen both halves to 32-bit lanes, shift by the per-lane count in q10,
@ then narrow back to 16 bits into q11 (d22:d23).
vmovl.s16 q9, d16
vmovl.s16 q8, d17
vshl.s32 q9, q9, q10
vshl.s32 q8, q8, q10
vmovn.i32 d22, q9
vmovn.i32 d23, q8
@ Store 8 results and advance the write pointer.
vst1.16 {q11}, [r4]
add r4, r4, #16
bcc .L6
@ r6 = lr rounded down to a multiple of 8 = elements done by the vector loop.
@ If that covers everything (lr == r6) we are finished; otherwise
@ r4 = r8 + r6 is the index of the first element still to be processed.
bic r6, lr, #7
cmp lr, r6
add r4, r8, r6
beq .L1
@ Scalar tail, fully unrolled. r4 = index of the first unprocessed element.
@ After each element the code leaves via .L1 once count <= next index (bls).
@ r7 and ip alternate as the "next index" register.
.L3:
@ Tail element r4.
ldrsh ip, [r1, r4, lsl #1]
adds r7, r4, #1
cmp r2, r7
and r6, r5, ip, asr #31
add r6, r6, ip
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r4, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 1.
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #2
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 2.
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #3
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 3.
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #4
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 4.
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #5
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 5.
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #6
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 6.
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #7
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 7.
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #8
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 8.
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #9
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 9.
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #10
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 10.
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #11
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 11.
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #12
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
@ Tail element r4 + 12. r4 itself is advanced by 13 here, so the store below
@ uses ip and the final step uses the updated r4.
ldrsh r7, [r1, ip, lsl #1]
adds r4, r4, #13
cmp r2, r4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
@ Last unrolled tail element (original tail index + 13); no further bounds
@ check follows. r1, r2 and r3 are reused as scratch because they are no
@ longer needed.
ldrsh r1, [r1, r4, lsl #1]
and r2, r5, r1, asr #31
add r2, r2, r1
sxth r2, r2
asr r3, r2, r3
strh r3, [r0, r4, lsl #1] @ movhi
@ Common exit: restore saved registers and return (pc loaded from saved lr).
.L1:
pop {r4, r5, r6, r7, r8, pc}
@ Prologue finished after exactly r4 elements: record that in r8 and enter
@ the vector setup.
.L9:
mov r8, r4
b .L4
@ Early return for count == 0; nothing was pushed on this path.
.L21:
bx lr
@ Scalar-only path: start the tail code at index 0.
.L8:
movs r4, #0
b .L3
有很多循环展开使代码更长,但模式应该清晰。