我用 ARM NEON 内在函数(intrinsics)测试了对数组中所有元素求和的操作。针对同一功能,我分别写了一个非 NEON 版本和一个 NEON 版本。结果使用内在函数并没有带来任何性能提升:两者耗时大致相同,NEON 版本有时甚至还稍慢一些。我的目标设备是 iPhone 4S,使用 Xcode 5.0.2 中的 LLVM/Clang 并以 -Os 标志编译。我的问题是:在这里应该如何使用 NEON 指令,才能获得性能优势?
测试结果:
2014-04-18 20:51:45.587 ARMAssembly[9007:907] 1 performed in 185191
2014-04-18 20:51:45.596 ARMAssembly[9007:907] 2 performed in 180158
这是我使用的代码。
/*
 * Sum the first `size` elements of `arr` using 128-bit NEON adds.
 *
 * Four 32-bit lanes are accumulated in parallel with wrap-around (modulo 2^32)
 * arithmetic and then folded into one scalar. Only whole groups of four
 * elements go through the vector loop; the remaining 0-3 elements are added
 * with scalar code, so nothing beyond arr[size - 1] is ever read.
 *
 * arr:  pointer to at least `size` ints (not dereferenced when size <= 0).
 * size: number of elements to sum; values <= 0 yield 0.
 * Returns the 32-bit wrapped sum converted to int.
 */
int doStuffSIMD(int* arr, int size)
{
    if (size <= 0)
    {
        return 0;
    }

    /* The lane arithmetic is unsigned, so view the data as uint32_t. */
    const uint32_t *data = (const uint32_t *)arr;
    /* Largest multiple of 4 not exceeding size. */
    const int vectorized = size - (size % 4);

    uint32x4_t vec128 = vdupq_n_u32(0);
    for (int k = 0; k < vectorized; k += 4)
    {
        vec128 = vaddq_u32(vec128, vld1q_u32(data + k));
    }

    /* Horizontal reduction: 4 lanes -> 2 lanes -> scalar. */
    uint32x2_t folded = vadd_u32(vget_low_u32(vec128), vget_high_u32(vec128));
    uint32_t result = vget_lane_u32(folded, 0) + vget_lane_u32(folded, 1);

    /* Scalar tail for the elements that did not fill a whole vector. */
    for (int k = vectorized; k < size; ++k)
    {
        result += data[k];
    }
    return (int)result;
}
/*
 * Scalar reference implementation: sum the first `size` elements of `arr`.
 *
 * The running total is kept in an unsigned variable so that an overflowing
 * sum wraps modulo 2^32 instead of triggering signed-overflow undefined
 * behaviour. The final conversion back to int is implementation-defined for
 * values above INT_MAX (two's-complement wrap on the usual targets).
 *
 * arr:  pointer to at least `size` ints.
 * size: number of elements to sum; values <= 0 yield 0.
 * Returns the 32-bit wrapped sum converted to int.
 */
int doStuffNorm(int* arr, int size)
{
    unsigned int total = 0;
    for (int idx = 0; idx < size; ++idx)
    {
        total += (unsigned int)arr[idx];
    }
    return (int)total;
}
/*
 * Benchmark driver and application entry point.
 *
 * Fills a 4096 * 4096 element int array with 0, 1, 2, ..., times one call to
 * doStuffSIMD and one call to doStuffNorm with clock(), logs both durations
 * (in clock() ticks) and both results, then hands control to UIKit.
 * If the buffer cannot be allocated the benchmark is skipped and the
 * application is still started.
 */
int main(int argc, char * argv[])
{
    /* NSLog creates autoreleased objects, so keep everything inside a pool. */
    @autoreleasepool {
        const int size = 4096 * 4096;
        int *arr = malloc(sizeof *arr * (size_t)size);
        if (arr == NULL)
        {
            NSLog(@"Benchmark skipped: could not allocate %d ints", size);
        }
        else
        {
            for (int k = 0; k < size; ++k)
            {
                arr[k] = k;
            }

            clock_t start = clock();
            int i = doStuffSIMD(arr, size);
            clock_t diff1 = clock() - start;

            start = clock();
            int j = doStuffNorm(arr, size);
            clock_t diff2 = clock() - start;

            free(arr);

            /* clock_t's underlying type is platform-specific; cast to match %lu. */
            NSLog(@"1 performed in %lu", (unsigned long)diff1);
            NSLog(@"2 performed in %lu", (unsigned long)diff2);
            NSLog(@"RESULT : %i / %i", i, j);
        }
        return UIApplicationMain(argc, argv, nil, NSStringFromClass([AppDelegate class]));
    }
}
doStuffSIMD 的汇编代码:
@ Compiler-generated ARM/NEON code for doStuffSIMD(arr, size).
@ Per the DEBUG_VALUE notes below: r0 = arr (also the loop pointer i),
@ r1 = size, q8 = the vector accumulator vec128. The scalar result ends up in r0.
_doStuffSIMD:
.cfi_startproc
Lfunc_begin0:
.loc 1 17 0 @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:17:0
@ BB#0:
@DEBUG_VALUE: doStuffSIMD:arr <- R0+0
@DEBUG_VALUE: doStuffSIMD:size <- R1+0
@DEBUG_VALUE: i <- R0+0
@ Zero the 128-bit accumulator q8 (vec128 = 0).
vmov.i32 q8, #0x0
Ltmp0:
@DEBUG_VALUE: doStuffSIMD:vec128 <- Q8+0
.loc 1 19 0 prologue_end @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:19:0
@ If size < 1, skip the loop and go straight to the reduction.
cmp r1, #1
blt LBB0_3
@ BB#1:
Ltmp1:
@DEBUG_VALUE: doStuffSIMD:arr <- R0+0
@DEBUG_VALUE: doStuffSIMD:size <- R1+0
@DEBUG_VALUE: i <- R0+0
@DEBUG_VALUE: doStuffSIMD:vec128 <- Q8+0
@ r1 = r0 + (r1 << 2): end address (arr + size elements of 4 bytes), the loop bound.
add.w r1, r0, r1, lsl #2
Ltmp2:
LBB0_2: @ %.lr.ph
@ =>This Inner Loop Header: Depth=1
.loc 1 21 0 @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:21:0
@ Load four 32-bit words from [r0] into d18/d19 (the two halves of q9);
@ the "!" writes the advanced address back to r0.
vld1.32 {d18, d19}, [r0]!
@ q8 += q9, lane by lane.
vadd.i32 q8, q8, q9
Ltmp3:
@DEBUG_VALUE: doStuffSIMD:vec128 <- Q8+0
.loc 1 19 0 @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:19:0
@ Keep looping while r0 is still below the end address (unsigned compare).
cmp r0, r1
blo LBB0_2
Ltmp4:
LBB0_3:
@ Fold the high half of q8 (d17) into the low half (d16): 4 lanes -> 2 lanes.
vadd.i32 d16, d16, d17
Ltmp5:
@DEBUG_VALUE: __a <- D16+0
@DEBUG_VALUE: doStuffSIMD:a <- D16+0
@DEBUG_VALUE: __a <- D16+0
.loc 1 31 0 @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:31:0
@ Move the two remaining lanes into core registers r0 (lane 1) and r1 (lane 0).
vmov.32 r0, d16[1]
Ltmp6:
.loc 1 30 0 @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:30:0
vmov.32 r1, d16[0]
Ltmp7:
@DEBUG_VALUE: doStuffSIMD:result <- R1+0
.loc 1 31 0 @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:31:0
@ r0 = lane 1 + lane 0: the final scalar sum.
add r0, r1
Ltmp8:
@DEBUG_VALUE: doStuffSIMD:result <- R0+0
.loc 1 33 0 @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:33:0
@ Return to the caller with the result in r0.
bx lr
Ltmp9:
Lfunc_end0:
更新:
根据 BitBank 在评论中的建议,我加入了内存预取并展开了循环;现在在 iPhone 4S 上,NEON 版本比普通版本快了将近 3 倍。我发现在这台设备上,提前 128 字节进行预取的效果最好。
优化后的结果:
2014-04-19 14:14:56.507 ARMAssembly[11492:907] 1 performed in 70096
2014-04-19 14:14:56.513 ARMAssembly[11492:907] 2 performed in 205114
优化循环如下所示:
/*
 * Sum the first `size` elements of `arr` with NEON, using a 4x unrolled loop
 * and software prefetch.
 *
 * The main loop consumes 16 elements (four 128-bit loads) per iteration and
 * prefetches 32 elements (128 bytes) ahead of the current position. Leftover
 * elements are handled by a 4-element vector loop and then a scalar loop, so
 * no load ever touches memory beyond arr[size - 1]. All additions wrap
 * modulo 2^32.
 *
 * arr:  pointer to at least `size` unsigned ints (not dereferenced when size <= 0).
 * size: number of elements to sum; values <= 0 yield 0.
 * Returns the 32-bit wrapped sum converted to int.
 */
int doStuffSIMD(unsigned int* arr, int size)
{
    if (size <= 0)
    {
        return 0;
    }

    const uint32_t *cur = (const uint32_t *)arr;
    const uint32_t *const end = cur + size;
    uint32x4_t vec128 = vdupq_n_u32(0);

    /* Unrolled body: only entered while a full 16-element block remains. */
    for (; end - cur >= 16; cur += 16)
    {
        /* Prefetch is only a hint; near the end of the array the address lies
           past the buffer, which the original code relied on being harmless. */
        __builtin_prefetch(cur + 32);
        vec128 = vaddq_u32(vec128, vld1q_u32(cur));
        vec128 = vaddq_u32(vec128, vld1q_u32(cur + 4));
        vec128 = vaddq_u32(vec128, vld1q_u32(cur + 8));
        vec128 = vaddq_u32(vec128, vld1q_u32(cur + 12));
    }

    /* Up to three remaining whole vectors. */
    for (; end - cur >= 4; cur += 4)
    {
        vec128 = vaddq_u32(vec128, vld1q_u32(cur));
    }

    /* Horizontal reduction: 4 lanes -> 2 lanes -> scalar. */
    uint32x2_t folded = vadd_u32(vget_low_u32(vec128), vget_high_u32(vec128));
    uint32_t result = vget_lane_u32(folded, 0) + vget_lane_u32(folded, 1);

    /* Scalar tail: the last 0-3 elements. */
    for (; cur < end; ++cur)
    {
        result += *cur;
    }
    return (int)result;
}