与C代码

时间:2016-10-15 10:06:48

标签: c simd neon

我有一个简单的C代码,可以减去数字'没有。来自两个不同指针的值并写回第三个指针。我使用neon intrinsics尝试使用相同的代码来提高性能,但是我无法看到代码执行时间的任何减少。我正在使用ARM Cortex-A9处理器。

以下是我的C代码:

int code_c(uint8_t *in1, uint8_t *in2, uint8_t *out, uint32_t num)
{
  uint32_t i;

  for(i = 0; i < (num); i++) {

    out[i] = in1[i] - in2[i];
  }
  return 0;
}

相应的霓虹灯内在代码如下:

#include <arm_neon.h>

int code_neon(uint8_t * __restrict in1, uint8_t * __restrict    in2, uint8_t * __restrict y, uint32_t num)
{
  uint32_t   i;
  uint8x8_t s1, s2;
  uint8x8_t out;

  num = num/8;

  for (i = num; i != 0; i--) {

    s1 = vld1_u8(in1);
    s2 = vld1_u8(in2);

    out  = vsub_u8(s1, s2);
    vst1_u8(y, out);

    in1+=8; in2+=8;y+=8;

    __builtin_prefetch(in1+8);
    __builtin_prefetch(in2+8);
   }
   return 0;
}

这里出了什么问题?

为Neon生成的汇编代码:

00000000 <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)>:
0:  e92d4008    push    {r3, lr}
4:  e52de004    push    {lr}        ; (str lr, [sp, #-4]!)
  8:    ebfffffe    bl  0 <__gnu_mcount_nc>
        8: R_ARM_CALL   __gnu_mcount_nc
  c:    e1b031a3    lsrs    r3, r3, #3
10: 0a00000d    beq 4c <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x4c>
14: e280e008    add lr, r0, #8
18: e281c008    add ip, r1, #8
1c: f460070f    vld1.8  {d16}, [r0]
20: e2533001    subs    r3, r3, #1
24: e1a0000e    mov r0, lr
28: e28ee008    add lr, lr, #8
2c: f461170f    vld1.8  {d17}, [r1]
30: e1a0100c    mov r1, ip
34: e28cc008    add ip, ip, #8
38: f5def000    pld [lr]
3c: f34008a1    vsub.i8 d16, d16, d17
40: f5dcf000    pld [ip]
44: f442070d    vst1.8  {d16}, [r2]!
48: 1afffff3    bne 1c <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1c>
4c: e3a00000    mov r0, #0
50: e8bd8008    pop {r3, pc}

C:

的汇编代码
00000000 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)>:
0:  e92d43f8    push    {r3, r4, r5, r6, r7, r8, r9, lr}
4:  e52de004    push    {lr}        ; (str lr, [sp, #-4]!)
8:  ebfffffe    bl  0 <__gnu_mcount_nc>
        8: R_ARM_CALL   __gnu_mcount_nc
c:  e3530000    cmp r3, #0
10: 0a0000f1    beq 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
14: e282c010    add ip, r2, #16
18: e280e010    add lr, r0, #16
1c: e152000e    cmp r2, lr
20: 3150000c    cmpcc   r0, ip
24: e2814010    add r4, r1, #16
28: 23a0e001    movcs   lr, #1
2c: 33a0e000    movcc   lr, #0
30: e1520004    cmp r2, r4
34: 3151000c    cmpcc   r1, ip
38: 23a0c001    movcs   ip, #1
3c: 33a0c000    movcc   ip, #0
40: e00cc00e    and ip, ip, lr
44: e3530013    cmp r3, #19
48: 93a0c000    movls   ip, #0
4c: 820cc001    andhi   ip, ip, #1
50: e35c0000    cmp ip, #0
54: 0a0000e2    beq 3e4 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3e4>
58: e200c007    and ip, r0, #7
5c: e26cc000    rsb ip, ip, #0
60: e20cc00f    and ip, ip, #15
64: e15c0003    cmp ip, r3
68: 21a0c003    movcs   ip, r3
6c: e35c0000    cmp ip, #0
70: 0a000059    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
74: e5d0e000    ldrb    lr, [r0]
78: e35c0001    cmp ip, #1
7c: e5d14000    ldrb    r4, [r1]
80: e064e00e    rsb lr, r4, lr
84: e5c2e000    strb    lr, [r2]
88: 0a000053    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
8c: e5d0e001    ldrb    lr, [r0, #1]
90: e35c0002    cmp ip, #2
94: e5d14001    ldrb    r4, [r1, #1]
98: e064e00e    rsb lr, r4, lr
9c: e5c2e001    strb    lr, [r2, #1]
a0: 0a00004d    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
a4: e5d0e002    ldrb    lr, [r0, #2]
a8: e35c0003    cmp ip, #3
ac: e5d14002    ldrb    r4, [r1, #2]
b0: e064e00e    rsb lr, r4, lr
b4: e5c2e002    strb    lr, [r2, #2]
b8: 0a000047    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
bc: e5d0e003    ldrb    lr, [r0, #3]
c0: e35c0004    cmp ip, #4
c4: e5d14003    ldrb    r4, [r1, #3]
c8: e064e00e    rsb lr, r4, lr
cc: e5c2e003    strb    lr, [r2, #3]
d0: 0a000041    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
d4: e5d0e004    ldrb    lr, [r0, #4]
d8: e35c0005    cmp ip, #5
dc: e5d14004    ldrb    r4, [r1, #4]
e0: e064e00e    rsb lr, r4, lr
e4: e5c2e004    strb    lr, [r2, #4]
e8: 0a00003b    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
ec: e5d0e005    ldrb    lr, [r0, #5]
f0: e35c0006    cmp ip, #6
f4: e5d14005    ldrb    r4, [r1, #5]
f8: e064e00e    rsb lr, r4, lr
fc: e5c2e005    strb    lr, [r2, #5]
100:    0a000035    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
104:    e5d0e006    ldrb    lr, [r0, #6]
108:    e35c0007    cmp ip, #7
10c:    e5d14006    ldrb    r4, [r1, #6]
110:    e064e00e    rsb lr, r4, lr
114:    e5c2e006    strb    lr, [r2, #6]
118:    0a0000be    beq 418 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x418>
11c:    e5d0e007    ldrb    lr, [r0, #7]
120:    e35c0008    cmp ip, #8
124:    e5d14007    ldrb    r4, [r1, #7]
128:    e064e00e    rsb lr, r4, lr
12c:    e5c2e007    strb    lr, [r2, #7]
130:    0a000029    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
134:    e5d0e008    ldrb    lr, [r0, #8]
138:    e35c0009    cmp ip, #9
13c:    e5d14008    ldrb    r4, [r1, #8]
140:    e064e00e    rsb lr, r4, lr
144:    e5c2e008    strb    lr, [r2, #8]
148:    0a000023    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
14c:    e5d0e009    ldrb    lr, [r0, #9]
150:    e35c000a    cmp ip, #10
154:    e5d14009    ldrb    r4, [r1, #9]
158:    e064e00e    rsb lr, r4, lr
15c:    e5c2e009    strb    lr, [r2, #9]
160:    0a00001d    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
164:    e5d0e00a    ldrb    lr, [r0, #10]
168:    e35c000b    cmp ip, #11
16c:    e5d1400a    ldrb    r4, [r1, #10]
170:    e064e00e    rsb lr, r4, lr
174:    e5c2e00a    strb    lr, [r2, #10]
178:    0a000017    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
17c:    e5d0e00b    ldrb    lr, [r0, #11]
180:    e35c000c    cmp ip, #12
184:    e5d1400b    ldrb    r4, [r1, #11]
188:    e064e00e    rsb lr, r4, lr
18c:    e5c2e00b    strb    lr, [r2, #11]
190:    0a000011    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
194:    e5d0e00c    ldrb    lr, [r0, #12]
198:    e35c000d    cmp ip, #13
19c:    e5d1400c    ldrb    r4, [r1, #12]
1a0:    e064e00e    rsb lr, r4, lr
1a4:    e5c2e00c    strb    lr, [r2, #12]
1a8:    0a00000b    beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
1ac:    e5d0e00d    ldrb    lr, [r0, #13]
1b0:    e35c000f    cmp ip, #15
1b4:    e5d1400d    ldrb    r4, [r1, #13]
1b8:    e064e00e    rsb lr, r4, lr
1bc:    e5c2e00d    strb    lr, [r2, #13]
1c0:    1a000092    bne 410 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x410>
1c4:    e5d0400e    ldrb    r4, [r0, #14]
1c8:    e1a0e00c    mov lr, ip
1cc:    e5d1500e    ldrb    r5, [r1, #14]
1d0:    e0654004    rsb r4, r5, r4
1d4:    e5c2400e    strb    r4, [r2, #14]
1d8:    ea000000    b   1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
1dc:    e1a0e00c    mov lr, ip
1e0:    e06c6003    rsb r6, ip, r3
1e4:    e2435001    sub r5, r3, #1
1e8:    e2464010    sub r4, r6, #16
1ec:    e06c5005    rsb r5, ip, r5
1f0:    e1a04224    lsr r4, r4, #4
1f4:    e355000e    cmp r5, #14
1f8:    e2844001    add r4, r4, #1
1fc:    e1a05204    lsl r5, r4, #4
200:    9a000010    bls 248 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x248>
204:    e080900c    add r9, r0, ip
208:    e081800c    add r8, r1, ip
20c:    e3a07000    mov r7, #0
210:    e082c00c    add ip, r2, ip
214:    f4690adf    vld1.64 {d16-d17}, [r9 :64]
218:    e2877001    add r7, r7, #1
21c:    e1570004    cmp r7, r4
220:    e2899010    add r9, r9, #16
224:    f4682a0f    vld1.8  {d18-d19}, [r8]
228:    e2888010    add r8, r8, #16
22c:    f34008e2    vsub.i8 q8, q8, q9
230:    f44c0a0f    vst1.8  {d16-d17}, [ip]
234:    e28cc010    add ip, ip, #16
238:    3afffff5    bcc 214 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x214>
23c:    e1560005    cmp r6, r5
240:    e08ee005    add lr, lr, r5
244:    0a000064    beq 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
248:    e7d0c00e    ldrb    ip, [r0, lr]
24c:    e28e4001    add r4, lr, #1
250:    e7d1500e    ldrb    r5, [r1, lr]
254:    e1530004    cmp r3, r4
258:    e065c00c    rsb ip, r5, ip
25c:    e7c2c00e    strb    ip, [r2, lr]
260:    9a00005d    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
264:    e7d05004    ldrb    r5, [r0, r4]
268:    e28ec002    add ip, lr, #2
26c:    e7d16004    ldrb    r6, [r1, r4]
270:    e153000c    cmp r3, ip
274:    e0665005    rsb r5, r6, r5
278:    e7c25004    strb    r5, [r2, r4]
27c:    9a000056    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
280:    e7d0500c    ldrb    r5, [r0, ip]
284:    e28e4003    add r4, lr, #3
288:    e7d1600c    ldrb    r6, [r1, ip]
28c:    e1530004    cmp r3, r4
290:    e0665005    rsb r5, r6, r5
294:    e7c2500c    strb    r5, [r2, ip]
298:    9a00004f    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
29c:    e7d05004    ldrb    r5, [r0, r4]
2a0:    e28ec004    add ip, lr, #4
2a4:    e7d16004    ldrb    r6, [r1, r4]
2a8:    e153000c    cmp r3, ip
2ac:    e0665005    rsb r5, r6, r5
2b0:    e7c25004    strb    r5, [r2, r4]
2b4:    9a000048    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2b8:    e7d0500c    ldrb    r5, [r0, ip]
2bc:    e28e4005    add r4, lr, #5
2c0:    e7d1600c    ldrb    r6, [r1, ip]
2c4:    e1530004    cmp r3, r4
2c8:    e0665005    rsb r5, r6, r5
2cc:    e7c2500c    strb    r5, [r2, ip]
2d0:    9a000041    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2d4:    e7d05004    ldrb    r5, [r0, r4]
2d8:    e28ec006    add ip, lr, #6
2dc:    e7d16004    ldrb    r6, [r1, r4]
2e0:    e153000c    cmp r3, ip
2e4:    e0665005    rsb r5, r6, r5
2e8:    e7c25004    strb    r5, [r2, r4]
2ec:    9a00003a    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2f0:    e7d0500c    ldrb    r5, [r0, ip]
2f4:    e28e4007    add r4, lr, #7
2f8:    e7d1600c    ldrb    r6, [r1, ip]
2fc:    e1530004    cmp r3, r4
300:    e0665005    rsb r5, r6, r5
304:    e7c2500c    strb    r5, [r2, ip]
 308:   9a000033    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
 30c:   e7d05004    ldrb    r5, [r0, r4]
 310:   e28ec008    add ip, lr, #8
 314:   e7d16004    ldrb    r6, [r1, r4]
 318:   e153000c    cmp r3, ip
 31c:   e0665005    rsb r5, r6, r5
 320:   e7c25004    strb    r5, [r2, r4]
 324:   9a00002c    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
 328:   e7d0500c    ldrb    r5, [r0, ip]
 32c:   e28e4009    add r4, lr, #9
 330:   e7d1600c    ldrb    r6, [r1, ip]
 334:   e1530004    cmp r3, r4
 338:   e0665005    rsb r5, r6, r5
 33c:   e7c2500c    strb    r5, [r2, ip]
 340:   9a000025    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
 344:   e7d05004    ldrb    r5, [r0, r4]
 348:   e28ec00a    add ip, lr, #10
 34c:   e7d16004    ldrb    r6, [r1, r4]
 350:   e153000c    cmp r3, ip
 354:   e0665005    rsb r5, r6, r5
 358:   e7c25004    strb    r5, [r2, r4]
 35c:   9a00001e    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
 360:   e7d0500c    ldrb    r5, [r0, ip]
 364:   e28e400b    add r4, lr, #11
 368:   e7d1600c    ldrb    r6, [r1, ip]
 36c:   e1530004    cmp r3, r4
 370:   e0665005    rsb r5, r6, r5
 374:   e7c2500c    strb    r5, [r2, ip]
 378:   9a000017    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
 37c:   e7d05004    ldrb    r5, [r0, r4]
 380:   e28ec00c    add ip, lr, #12
 384:   e7d16004    ldrb    r6, [r1, r4]
 388:   e153000c    cmp r3, ip
 38c:   e0665005    rsb r5, r6, r5
 390:   e7c25004    strb    r5, [r2, r4]
 394:   9a000010    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
 398:   e7d0500c    ldrb    r5, [r0, ip]
 39c:   e28e400d    add r4, lr, #13
 3a0:   e7d1600c    ldrb    r6, [r1, ip]
 3a4:   e1530004    cmp r3, r4
 3a8:   e0665005    rsb r5, r6, r5
 3ac:   e7c2500c    strb    r5, [r2, ip]
 3b0:   9a000009    bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
 3b4:   e7d05004    ldrb    r5, [r0, r4]
 3b8:   e28ec00e    add ip, lr, #14
 3bc:   e7d1e004    ldrb    lr, [r1, r4]
 3c0:   e153000c    cmp r3, ip
 3c4:   e06e3005    rsb r3, lr, r5
 3c8:   e7c23004    strb    r3, [r2, r4]
 3cc:   87d0300c    ldrbhi  r3, [r0, ip]
 3d0:   87d1100c    ldrbhi  r1, [r1, ip]
 3d4:   80613003    rsbhi   r3, r1, r3
 3d8:   87c2300c    strbhi  r3, [r2, ip]
 3dc:   e3a00000    mov r0, #0
 3e0:   e8bd83f8    pop {r3, r4, r5, r6, r7, r8, r9, pc}
 3e4:   e2411001    sub r1, r1, #1
 3e8:   e0803003    add r3, r0, r3
 3ec:   e2422001    sub r2, r2, #1
 3f0:   e4d0c001    ldrb    ip, [r0], #1
 3f4:   e5f1e001    ldrb    lr, [r1, #1]!
 3f8:   e1500003    cmp r0, r3
 3fc:   e06ec00c    rsb ip, lr, ip
 400:   e5e2c001    strb    ip, [r2, #1]!
 404:   1afffff9    bne 3f0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3f0>
 408:   e3a00000    mov r0, #0
 40c:   e8bd83f8    pop {r3, r4, r5, r6, r7, r8, r9, pc}
 410:   e3a0e00e    mov lr, #14
 414:   eaffff71    b   1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
 418:   e3a0e007    mov lr, #7
 41c:   eaffff6f    b   1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>

2 个答案:

答案 0 :(得分:2)

编译器写了这个(埋在很多设置代码中以处理边缘情况):

214:    f4690adf    vld1.64 {d16-d17}, [r9 :64]
218:    e2877001    add r7, r7, #1
21c:    e1570004    cmp r7, r4
220:    e2899010    add r9, r9, #16
224:    f4682a0f    vld1.8  {d18-d19}, [r8]
228:    e2888010    add r8, r8, #16
22c:    f34008e2    vsub.i8 q8, q8, q9
230:    f44c0a0f    vst1.8  {d16-d17}, [ip]
234:    e28cc010    add ip, ip, #16
238:    3afffff5    bcc 214 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x214>

我的NEON非常生疏,我不打算在这里解码每一行(我错了;希望有人会提供更完整的答案),但这是在16个字节加载时间进入两个128位寄存器,并行地减去所有16个寄存器,然后将它们全部写回目标。所以这就是你正在寻找的矢量化。您的NEON代码可能比编译器稍微快一些的事实至少部分是因为您没有检查n不能被8整除的边缘情况。编译器会这样做。

在大多数情况下,内在函数不会有所帮助。如果您试图击败编译器,您将需要自己处理整个管道,并且内在函数不够强大。您需要能够选择寄存器,决定何时读取和写入内存,并且非常谨慎地管理数据布局,甚至开始打败编译器(因为它已经完成了所有这些)。

为什么编译器的并行行为通常比手写更好,即使你写的内容基本相同?那么,你如何管理故障?装配说明不是串联运行的;其中许多并行运行。通常,当您执行昂贵的指令时,您可能无法在几个时钟周期内读取结果。如果您尝试,处理器必须停止并等待。为了避免这个问题,你经常以非常奇怪的顺序编写汇编,例如&#34;开始计算,加载下一个数据,写入计算结果。&#34;用内在函数很难实现。

您在@ yeoman的答案中的一些评论:

  1. 执行时间是否取决于生成的汇编指令数?
  2. 绝对不是。执行时间取决于执行的汇编指令的数量以及这些指令的执行顺序和执行顺序。非常,经常(几乎总是)更快的代码是装配中更长的代码。 (当然不承诺反过来......)最着名的例子是循环展开。连续3次切割和粘贴操作将比计数为3的循环更快。仅在那里避免分支就会很大。因此,当编译器事先知道迭代次数时,编译器会自动展开小循环。

    1. 与C上的单身相比,Neon应该有8个并行操作。
    2. 应该有8个并行操作,编译器会生成它。但是你的代码并没有;它一次做一个。

      使用NEON确实让它变得更快;编译器已经使用了NEON。

      对于略有不同的问题(讨论iOS中的Accelerate框架),但仍然解决相同的基本问题,请参阅Introduction to Fast Bezier

      同样重申@ yeoman的观点:如果一些非常简单和机械的改变可以使你的C代码变得更快,编译器就会为你做这件事(而且确实如此)。

答案 1 :(得分:1)

我猜你的C编译器,更具体地说是它的ARM后端,知道你正在编译的ARM架构的一些事情。你的例子是非常简单和普通的,可能有一个优化:)

手动优化代码最适用于奇怪且不常见的情况,因此编译器只是放弃并使用实际的非展开循环对线性代码进行简单的1:1转换,当然可以通过以下方式大大改进手动优化:)