Optimizing vector addition with ARM NEON intrinsics

Time: 2014-04-18 16:22:13

Tags: xcode assembly arm simd neon

I am testing summing all the elements of a vector with ARM NEON intrinsics. I have a non-NEON version and a NEON version of the same function, but I am not seeing any performance gain: the intrinsics version is roughly the same speed, and sometimes even a bit slower. I am building for an iPhone 4S with the -Os flag, LLVM/Clang, Xcode 5.0.2. My question is: how can NEON instructions be used here to actually get a performance benefit?

Test results:

2014-04-18 20:51:45.587 ARMAssembly[9007:907] 1 performed in 185191

2014-04-18 20:51:45.596 ARMAssembly[9007:907] 2 performed in 180158

Here is the code I am using:

#import <UIKit/UIKit.h>
#import "AppDelegate.h"
#include <arm_neon.h>
#include <stdlib.h>
#include <time.h>

int doStuffSIMD(int* arr, int size)
{
    // One 128-bit register holding four partial sums.
    uint32x4_t vec128 = vdupq_n_u32(0);
    for (int* i = &arr[0]; i < &arr[0] + size; i += 4)
    {
        // Load four 32-bit values and add them to the partial sums.
        // The cast silences the int* / uint32_t* pointer-type warning.
        uint32x4_t temp128 = vld1q_u32((const uint32_t *)i);
        vec128 = vaddq_u32(vec128, temp128);
    }

    // Fold the four partial sums down to a single scalar.
    uint32x2_t a = vget_low_u32(vec128);
    uint32x2_t b = vget_high_u32(vec128);

    a = vadd_u32(a, b);
    uint32_t result;
    result = vget_lane_u32(a, 0);
    result += vget_lane_u32(a, 1);

    return (int)result;
}
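
As an aside on the reduction at the end of doStuffSIMD: the two lane reads can also be done with a single pairwise add. A minimal sketch follows (the helper name horizontal_sum_u32 is mine, not from the post); it could be called as return horizontal_sum_u32(vec128);.

// Sketch (not from the original post): an equivalent tail reduction that
// uses VPADD (pairwise add) instead of two separate lane extractions.
static inline int horizontal_sum_u32(uint32x4_t v)
{
    uint32x2_t s = vadd_u32(vget_low_u32(v), vget_high_u32(v)); // {v0+v2, v1+v3}
    s = vpadd_u32(s, s);           // both lanes now hold v0+v1+v2+v3
    return (int)vget_lane_u32(s, 0);
}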

int doStuffNorm(int* arr, int size)
{
    int acc = 0;
    for (int i = 0; i < size; i += 1)
    {
        acc += arr[i];
    }
    return acc;
}


int main(int argc, char * argv[])
{

    const int size = 4096 * 4096;
    int *arr = malloc(sizeof(int) * size);

    for (int i = 0; i < size; ++i)
    {
        arr[i] = i;
    }
    clock_t now;
    clock_t now2;

    now = clock();
    int i = doStuffSIMD(arr, size);
    now2 = clock();
    clock_t diff1 = now2 - now;

    now = clock();
    int j = doStuffNorm(arr, size);
    now2 = clock();
    clock_t diff2 = now2 - now;

    free(arr);
    NSLog(@"1 performed in %lu", (diff1));
    NSLog(@"2 performed in %lu", (diff2));

    NSLog(@"RESULT : %i / %i", i, j);


    @autoreleasepool {
        return UIApplicationMain(argc, argv, nil, NSStringFromClass([AppDelegate class]));
    }
}
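
As an aside on the measurement itself: clock() reports CPU time in CLOCKS_PER_SEC ticks, so the logged numbers are raw ticks. For wall-clock micro-benchmarks on iOS, mach_absolute_time() is another option; below is a small sketch (the elapsed_ns helper and its names are mine, not part of the original post) of how the same two calls could be timed with it.

// Sketch (assumption, not the poster's code): wall-clock timing with
// mach_absolute_time(); the timebase converts ticks to nanoseconds.
#include <mach/mach_time.h>

static uint64_t elapsed_ns(uint64_t start, uint64_t end)
{
    mach_timebase_info_data_t tb;
    mach_timebase_info(&tb);                  // ticks -> ns conversion factors
    return (end - start) * tb.numer / tb.denom;
}

// Usage inside main():
// uint64_t t0 = mach_absolute_time();
// int sumSIMD = doStuffSIMD(arr, size);
// uint64_t t1 = mach_absolute_time();
// NSLog(@"SIMD version: %llu ns", elapsed_ns(t0, t1));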

Assembly generated for doStuffSIMD:

_doStuffSIMD:
    .cfi_startproc
Lfunc_begin0:
    .loc    1 17 0                  @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:17:0
@ BB#0:
    @DEBUG_VALUE: doStuffSIMD:arr <- R0+0
    @DEBUG_VALUE: doStuffSIMD:size <- R1+0
    @DEBUG_VALUE: i <- R0+0
    vmov.i32    q8, #0x0
Ltmp0:
    @DEBUG_VALUE: doStuffSIMD:vec128 <- Q8+0
    .loc    1 19 0 prologue_end     @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:19:0
    cmp r1, #1
    blt LBB0_3
@ BB#1:
Ltmp1:
    @DEBUG_VALUE: doStuffSIMD:arr <- R0+0
    @DEBUG_VALUE: doStuffSIMD:size <- R1+0
    @DEBUG_VALUE: i <- R0+0
    @DEBUG_VALUE: doStuffSIMD:vec128 <- Q8+0
    add.w   r1, r0, r1, lsl #2
Ltmp2:
LBB0_2:                                 @ %.lr.ph
                                        @ =>This Inner Loop Header: Depth=1
    .loc    1 21 0                  @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:21:0
    vld1.32 {d18, d19}, [r0]!
    vadd.i32    q8, q8, q9
Ltmp3:
    @DEBUG_VALUE: doStuffSIMD:vec128 <- Q8+0
    .loc    1 19 0                  @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:19:0
    cmp r0, r1
    blo LBB0_2
Ltmp4:
LBB0_3:
    vadd.i32    d16, d16, d17
Ltmp5:
    @DEBUG_VALUE: __a <- D16+0
    @DEBUG_VALUE: doStuffSIMD:a <- D16+0
    @DEBUG_VALUE: __a <- D16+0
    .loc    1 31 0                  @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:31:0
    vmov.32 r0, d16[1]
Ltmp6:
    .loc    1 30 0                  @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:30:0
    vmov.32 r1, d16[0]
Ltmp7:
    @DEBUG_VALUE: doStuffSIMD:result <- R1+0
    .loc    1 31 0                  @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:31:0
    add r0, r1
Ltmp8:
    @DEBUG_VALUE: doStuffSIMD:result <- R0+0
    .loc    1 33 0                  @ /Users/karikuvaja/Documents/ARMAssembly/ARMAssembly/main.m:33:0
    bx  lr
Ltmp9:
Lfunc_end0:

Update:

Following BitBank's suggestion in the comments, the NEON code now runs almost 3x faster than the plain version on the iPhone 4S: memory is prefetched and the loop is unrolled. I found that prefetching 128 bytes ahead works best on this particular device.

Results after optimization:

2014-04-19 14:14:56.507 ARMAssembly[11492:907] 1 performed in 70096

2014-04-19 14:14:56.513 ARMAssembly[11492:907] 2 performed in 205114

The optimized loop looks like this:

int doStuffSIMD(unsigned int* arr, int size)
{
    uint32x4_t vec128 = vdupq_n_u32(0);
    uint32x4_t temp128 = vdupq_n_u32(0);

    // Unrolled 4x: each iteration consumes 16 ints (64 bytes) and
    // prefetches 32 ints (128 bytes) ahead of the current position.
    for (unsigned int* i = &arr[0]; i < &arr[0] + size; i += 16)
    {
        __builtin_prefetch(i + 32);

        temp128 = vld1q_u32(i);
        vec128 = vaddq_u32(vec128, temp128);

        temp128 = vld1q_u32(i + 4);
        vec128 = vaddq_u32(vec128, temp128);

        temp128 = vld1q_u32(i + 8);
        vec128 = vaddq_u32(vec128, temp128);

        temp128 = vld1q_u32(i + 12);
        vec128 = vaddq_u32(vec128, temp128);
    }

    // Fold the four partial sums down to a single scalar.
    uint32x2_t a = vget_low_u32(vec128);
    uint32x2_t b = vget_high_u32(vec128);

    a = vadd_u32(a, b);
    uint32_t result;
    result = vget_lane_u32(a, 0);
    result += vget_lane_u32(a, 1);

    return (int)result;
}
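
A further possible refinement, sketched below as an assumption rather than something from the post: every vaddq_u32 in the unrolled loop writes to the same vec128 register, so the adds form a serial dependency chain. Splitting the sum across independent accumulators and combining them after the loop lets consecutive adds execute without waiting on each other; whether it helps in practice depends on how memory-bound the loop already is. The function name doStuffSIMD4 is mine.

// Sketch with four independent accumulators (assumption, not the poster's
// code). Assumes size is a multiple of 16, as in the benchmark above.
int doStuffSIMD4(unsigned int* arr, int size)
{
    uint32x4_t acc0 = vdupq_n_u32(0);
    uint32x4_t acc1 = vdupq_n_u32(0);
    uint32x4_t acc2 = vdupq_n_u32(0);
    uint32x4_t acc3 = vdupq_n_u32(0);

    for (unsigned int* i = &arr[0]; i < &arr[0] + size; i += 16)
    {
        __builtin_prefetch(i + 32);           // stay ~128 bytes ahead, as above
        acc0 = vaddq_u32(acc0, vld1q_u32(i));
        acc1 = vaddq_u32(acc1, vld1q_u32(i + 4));
        acc2 = vaddq_u32(acc2, vld1q_u32(i + 8));
        acc3 = vaddq_u32(acc3, vld1q_u32(i + 12));
    }

    // Combine the four accumulators, then reduce to a scalar.
    uint32x4_t acc = vaddq_u32(vaddq_u32(acc0, acc1), vaddq_u32(acc2, acc3));
    uint32x2_t s = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
    s = vpadd_u32(s, s);
    return (int)vget_lane_u32(s, 0);
}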

0 Answers