xiaomi5s

时间:2017-10-13 16:47:32

标签: arm neon

考虑以下代码,第一个代码段:

// Benchmark kernel, post-increment addressing variant.
//
// For every image n in [0, N) this walks OH rows, and for each row runs an
// AArch64 NEON loop of (OW >> 2) iterations. One iteration loads 4 floats
// from outptr, 8 floats from r0 and 8 floats from r1, accumulates
//     v0 += v1 * v21 + v2 * v22 + v3 * v23 + v4 * v24
// and stores the 4 results back to outptr.
//
// Parameters:
//   src    - input data; image n starts at src + IW * IH * n.
//   dst    - accumulator buffer, read before it is written; outptr restarts
//            at dst for every n.
//   IH, IW - input height / width in floats, used only for pointer offsets.
//   OH     - number of rows processed per image.
//   OW     - output width; only OW >> 2 groups of 4 floats are processed per
//            row, the OW & 3 remainder columns are not touched.
//   N      - number of images.
//
// outptr, r0 and r1 are not reset between rows: they are read-write asm
// operands and each row continues from where the previous row left them.
void run_new(const float* src, float* dst,
        size_t IH, size_t IW, size_t OH, size_t OW,
        size_t N) {
    rep(n, N) {
        const float* src_ptr = src + IW * IH * n;
        float* outptr = dst;

        // Two input streams that start one input row (IW floats) apart.
        const float* r0 = src_ptr;
        const float* r1 = src_ptr + IW;

        // All four lanes hold 3.0f, so v21..v24 below all end up as 3.0f.
        float32x4_t k0123 = vdupq_n_f32(3.f);
        rep(h, OH) {
            size_t width = OW >> 2;

            // The asm loop checks its counter only after the first pass and
            // leaves on equality, so a trip count of zero would never match
            // (x3 is already 1 at the first compare). Skip rows that do not
            // contain a complete group of 4 outputs.
            if (width != 0) {
                asm volatile(
                        // Broadcast each lane of k0123 into its own register.
                        "dup v21.4s, %4.s[0] \n"
                        "dup v22.4s, %4.s[1] \n"
                        "dup v23.4s, %4.s[2] \n"
                        "dup v24.4s, %4.s[3] \n"
                        "mov x3, xzr \n"  // x3 = iteration counter
                        "0:           \n"
                        "ldr q0, [%1] \n"  // load 4 accumulator floats from outptr
                        "ld1 {v1.4s, v2.4s}, [%2], #32 \n"  // 8 floats from r0, r0 += 32 bytes

                        "add x3, x3, #0x1 \n"
                        "cmp %0, x3 \n"  // flags are consumed by the bne at the end

                        "ld1 {v3.4s, v4.4s}, [%3], #32 \n"  // 8 floats from r1, r1 += 32 bytes
                        "fmla v0.4s, v1.4s, v21.4s \n"  // src[i] * k[i]
                        "fmla v0.4s, v2.4s, v22.4s \n"

                        "fmla v0.4s, v3.4s, v23.4s \n"
                        "fmla v0.4s, v4.4s, v24.4s \n"

                        "str q0, [%1], #16 \n"  // store 4 floats, outptr += 16 bytes
                        "bne 0b \n"
                        : "+r"(width), "+r"(outptr), "+r"(r0), "+r"(r1)
                        : "w"(k0123)
                              : "cc", "memory", "x3", "v0", "v1", "v2", "v3", "v4", "v21", "v22", "v23", "v24");
            }
        }

    }
}

第二个代码段:

 // Benchmark kernel, base + offset addressing variant.
 //
 // Performs the same arithmetic as the post-increment variant: for every image
 // n in [0, N) and every one of OH rows it runs an AArch64 NEON loop of
 // (OW >> 2) iterations. One iteration loads 4 floats from outptr, 8 floats
 // from r0 + x4 and 8 floats from r1 + x4, accumulates
 //     v0 += v1 * v21 + v2 * v22 + v3 * v23 + v4 * v24
 // and stores the 4 results back to outptr. The difference is that the input
 // addresses are formed in x19 from a base pointer plus the running byte
 // offset x4, and r0 / r1 are advanced by x4 only once, after the loop.
 //
 // Parameters:
 //   src    - input data; image n starts at src + IW * IH * n.
 //   dst    - accumulator buffer, read before it is written; outptr restarts
 //            at dst for every n.
 //   IH, IW - input height / width in floats, used only for pointer offsets.
 //   OH     - number of rows processed per image.
 //   OW     - output width; only OW >> 2 groups of 4 floats are processed per
 //            row, the OW & 3 remainder columns are not touched.
 //   N      - number of images.
 //
 // outptr, r0 and r1 are not reset between rows: they are read-write asm
 // operands and each row continues from where the previous row left them.
 void run_origin(const float* src, float* dst,
        size_t IH, size_t IW, size_t OH, size_t OW,
        size_t N) {

    rep(n, N) {
        const float* src_ptr = src + IW * IH * n;
        float* outptr = dst;

        // Two input streams that start one input row (IW floats) apart.
        const float* r0 = src_ptr;
        const float* r1 = src_ptr + IW;

        // All four lanes hold 3.0f, so v21..v24 below all end up as 3.0f.
        float32x4_t k0123 = vdupq_n_f32(3.f);
        rep(h, OH) {
            size_t width = OW >> 2;

            // The asm loop checks its counter only after the first pass and
            // leaves on equality, so a trip count of zero would never match
            // (x3 is already 1 at the first compare). Skip rows that do not
            // contain a complete group of 4 outputs.
            if (width != 0) {
                asm volatile(
                        // Broadcast each lane of k0123 into its own register.
                        "dup v21.4s, %4.s[0] \n"
                        "dup v22.4s, %4.s[1] \n"
                        "dup v23.4s, %4.s[2] \n"
                        "dup v24.4s, %4.s[3] \n"
                        "mov x3, xzr \n"  // x3 = iteration counter
                        "mov x4, xzr \n"  // x4 = byte offset into both input streams
                        "0:           \n"
                        "add x19, %2, x4 \n"  // x19 = r0 + offset
                        "ldr q0, [%1] \n"  // load 4 accumulator floats from outptr
                        "ld1 {v1.4s, v2.4s}, [x19]\n"  // 8 floats from r0 + offset

                        "add x3, x3, #0x1 \n"
                        "cmp %0, x3 \n"  // flags are consumed by the bne below

                        "add x19, %3, x4 \n"  // x19 = r1 + offset
                        "ld1 {v3.4s, v4.4s}, [x19]\n"  // 8 floats from r1 + offset
                        "fmla v0.4s, v1.4s, v21.4s \n"  // src[i] * k[i]
                        "fmla v0.4s, v2.4s, v22.4s \n"

                        "fmla v0.4s, v3.4s, v23.4s \n"
                        "fmla v0.4s, v4.4s, v24.4s \n"

                        "add x4, x4, #0x20 \n"  // advance the offset by 32 bytes
                        "str q0, [%1], #16 \n"  // store 4 floats, outptr += 16 bytes
                        "bne 0b \n"
                        // Commit the accumulated offset to both base pointers.
                        "add %2, %2, x4 \n"
                        "add %3, %3, x4 \n"
                        : "+r"(width), "+r"(outptr), "+r"(r0), "+r"(r1)
                        : "w"(k0123)
                              : "cc", "memory", "x3", "x4", "x19", "v0", "v1", "v2", "v3", "v4", "v21", "v22", "v23", "v24");
            }

        }

    }
}

Test performance of arm neon assembly

中的所有代码

我在 xiaomi5s、xiaomi6 和 redmi 上测试了这两段代码的性能,详细结果如下:

  

N:12 IH:224 IW:224 OH:112 OW:112

  1. perf来源:325.35058 mflops ---新:4275.63483 mflops ---加速:13.14162 xiaomi5s
  2. perf来源:3082.00078 mflops ---新:3063.45047 mflops ---加速:0.99398 xiaomi6
  3. perf来源:1761.05058 mflops ---新:1814.37185 mflops ---加速:1.03028 redmi
  4. xiaomi5s中的以下测试。

      

    N:12 IH:48-256 IW:224

    1. N:12 IH:48 IW:224 OH:24 OW:112 perf来源:3721.16633 mflops ---新:4935.31729 mflops ---加速:1.32628
    2. N:12 IH:80 IW:224 OH:40 OW:112 perf来源:1185.58378 mflops ---新:3852.38266 mflops ---加速:3.24936
    3. N:12 IH:112 IW:224 OH:56 OW:112 perf来源:1021.83468 mflops ---新:3503.70672 mflops ---加速:3.42884
    4. N:12 IH:144 IW:224 OH:72 OW:112 perf来源:797.61461 mflops ---新:4167.12780 mflops ---加速:5.22449
    5. N:12 IH:176 IW:224 OH:88 OW:112 perf来源:465.55073 mflops ---新:4084.54206 mflops ---加速:8.77357
    6. N:12 IH:208 IW:224 OH:104 OW:112 perf来源:373.99237 mflops ---新:4255.78687 mflops ---加速:11.37934
    7. N:12 IH:240 IW:224 OH:120 OW:112 perf来源:341.57406 mflops ---新:4290.58840 mflops ---加速:12.56122
    8.   

      N:12 IH:224 IW:48-256

      1. N:12 IH:224 IW:48 OH:112 OW:24 perf来源:3660.35916 mflops ---新:4729.61877 mflops ---加速:1.29212
      2. N:12 IH:224 IW:80 OH:112 OW:40 perf来源:2918.48755 mflops ---新:4748.17285 mflops ---加速:1.62693
      3. N:12 IH:224 IW:112 OH:112 OW:56 perf来源:951.03852 mflops ---新:4051.84318 mflops ---加速:4.26044
      4. N:12 IH:224 IW:144 OH:112 OW:72 perf来源:1186.74405 mflops ---新:4160.18572 mflops ---加速:3.50555
      5. N:12 IH:224 IW:176 OH:112 OW:88 perf来源:533.47286 mflops ---新:4199.36622 mflops ---加速:7.87175
      6. N:12 IH:224 IW:208 OH:112 OW:104 perf来源:447.30682 mflops ---新:4092.22256 mflops ---加速:9.14858
      7. N:12 IH:224 IW:240 OH:112 OW:120 perf来源:442.58206 mflops ---新:4200.13672 mflops ---加速:9.49007
      8.   

        IC:2-12 IH:224 IW:224

        1. N:2 IH:224 IW:224 OH:112 OW:112 perf来源:3794.45684 mflops ---新:5236.48508 mflops ---加速:1.38004
        2. N:3 IH:224 IW:224 OH:112 OW:112 perf来源:3790.20521 mflops ---新:5150.30622 mflops ---加速:1.35885
        3. N:4 IH:224 IW:224 OH:112 OW:112 perf来源:2117.55521 mflops ---新:4329.34274 mflops ---加速:2.04450
        4. N:5 IH:224 IW:224 OH:112 OW:112 perf来源:1290.43541 mflops ---新:3915.65607 mflops ---加速:3.03437
        5. N:6 IH:224 IW:224 OH:112 OW:112 perf来源:1038.86926 mflops ---新:3747.69392 mflops ---加速:3.60747
        6. N:7 IH:224 IW:224 OH:112 OW:112 perf来源:845.26878 mflops ---新:4025.81237 mflops ---加速:4.76276
        7. N:8 IH:224 IW:224 OH:112 OW:112 perf来源:658.23150 mflops ---新:3971.62335 mflops ---加速:6.03378
        8. N:9 IH:224 IW:224 OH:112 OW:112 perf来源:527.99489 mflops ---新:4163.94501 mflops ---加速:7.88634
        9. N:10 IH:224 IW:224 OH:112 OW:112 perf来源:416.75353 mflops ---新:4119.03296 mflops ---加速:9.88362
        10. N:11 IH:224 IW:224 OH:112 OW:112 perf来源:378.38875 mflops ---新:4203.33717 mflops ---加速:11.10852
        11. N:12 IH:224 IW:224 OH:112 OW:112 perf来源:350.36924 mflops ---新:4202.19842 mflops ---加速:11.99363
        12. 我对 xiaomi5s 上的性能测试结果感到困惑:为什么 run_origin(即上面的第二个代码段)在 xiaomi5s 上的性能如此糟糕?

          我猜这可能是因为:如果 NEON 指令需要等待通用寄存器的计算结果(例如 ld1 {v3.4s, v4.4s}, [x19] 要等待由 add x19, %3, x4 计算出的 x19),NEON 流水线就会被打断,但我不太确定。

          补充细节:

          1. xiaomi5s cpu:Qualcomm Snapdragon 821
          2. xiaomi6 cpu:Qualcomm Snapdragon 835
          3. redmi cpu:联发科技Helio X20
          4. 编译选项(clang版本:5.0.0):clang++ -std=c++11 -Ofast

            1. 我将 ldr q0, [%2] 更改为 ld1 v0.4s, [%2],但结果相同,run_origin 的性能可能会快一点,约为1%-3%。
            2.   

              N:12 IH:224 IW:224 OH:112 OW:112

              perf来源:342.96631 mflops --- asm:4288.51646 mflops ---加速:12.50419

              1. 我将 fmla v0.4s, v1.4s, v21.4s 更改为 smlsl2 v0.2d, v1.4s, v21.4s,但结果相同。
                  

                N:12 IH:224 IW:224 OH:112 OW:112

              2. 性能起源:348.03699 mflops --- asm:4245.18804 mflops ---加速:12.19752

                1. 我将 fmla v0.4s, v1.4s, v21.4s 更改为 fadd v0.4s, v1.4s, v21.4s,原始代码(run_origin)变得更快。
                2.   

                  N:12 IH:224 IW:224 OH:112 OW:112

                  perf来源:743.95433 mflops --- asm:4756.65769 mflops ---加速:6.39375

2 个答案:

答案 0 :(得分:0)

一个大胆的猜测是:瓶颈出现在内存/缓存子系统中的可能性,与出现在 CPU 核心中的可能性一样大。也许第一种情况会阻止自动预加载(或者 xiaomi5 缺少此功能,或将其禁用了)?

尝试添加pld(或更确切地说prfm)指令可能会很有趣,尽管我从未发现它们至少对Cortex-A9有很多帮助。

检查fmla是否是瓶颈的简单方法是注释掉部分或全部数据处理指令(当然,输出会出错!)

答案 1 :(得分:-1)

我对 NEON64 还不像对 NEON32 那样熟悉,但你的代码中有几件事是我不会做的:

  • 为什么使用VFP指令" ldr"?在VFP和NEON之间切换可能会花费大量周期,特别是如果这些指令是存储器访问的指令。两者共享寄存器并不意味着它们是相同的单元。将其更改为LD1 ...... 4s

  • 你想要32位还是64位?选择x3或w3,然后坚持下去。

  • 您确定要使用 fmla 这种融合乘加指令吗?也许需要,也许不需要,但请注意融合乘法的开销会更大......

欢呼声