我有一个简单的C代码,可以减去数字'没有。来自两个不同指针的值并写回第三个指针。我使用neon intrinsics尝试使用相同的代码来提高性能,但是我无法看到代码执行时间的任何减少。我正在使用ARM Cortex-A9处理器。
以下是我的C代码:
int code_c(uint8_t *in1, uint8_t *in2, uint8_t *out, uint32_t num)
{
uint32_t i;
for(i = 0; i < (num); i++) {
out[i] = in1[i] - in2[i];
}
return 0;
}
相应的霓虹灯内在代码如下:
#include <arm_neon.h>
int code_neon(uint8_t * __restrict in1, uint8_t * __restrict in2, uint8_t * __restrict y, uint32_t num)
{
uint32_t i;
uint8x8_t s1, s2;
uint8x8_t out;
num = num/8;
for (i = num; i != 0; i--) {
s1 = vld1_u8(in1);
s2 = vld1_u8(in2);
out = vsub_u8(s1, s2);
vst1_u8(y, out);
in1+=8; in2+=8;y+=8;
__builtin_prefetch(in1+8);
__builtin_prefetch(in2+8);
}
return 0;
}
这里出了什么问题?
为Neon生成的汇编代码:
00000000 <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)>:
0: e92d4008 push {r3, lr}
4: e52de004 push {lr} ; (str lr, [sp, #-4]!)
8: ebfffffe bl 0 <__gnu_mcount_nc>
8: R_ARM_CALL __gnu_mcount_nc
c: e1b031a3 lsrs r3, r3, #3
10: 0a00000d beq 4c <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x4c>
14: e280e008 add lr, r0, #8
18: e281c008 add ip, r1, #8
1c: f460070f vld1.8 {d16}, [r0]
20: e2533001 subs r3, r3, #1
24: e1a0000e mov r0, lr
28: e28ee008 add lr, lr, #8
2c: f461170f vld1.8 {d17}, [r1]
30: e1a0100c mov r1, ip
34: e28cc008 add ip, ip, #8
38: f5def000 pld [lr]
3c: f34008a1 vsub.i8 d16, d16, d17
40: f5dcf000 pld [ip]
44: f442070d vst1.8 {d16}, [r2]!
48: 1afffff3 bne 1c <code_neon(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1c>
4c: e3a00000 mov r0, #0
50: e8bd8008 pop {r3, pc}
C:
的汇编代码00000000 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)>:
0: e92d43f8 push {r3, r4, r5, r6, r7, r8, r9, lr}
4: e52de004 push {lr} ; (str lr, [sp, #-4]!)
8: ebfffffe bl 0 <__gnu_mcount_nc>
8: R_ARM_CALL __gnu_mcount_nc
c: e3530000 cmp r3, #0
10: 0a0000f1 beq 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
14: e282c010 add ip, r2, #16
18: e280e010 add lr, r0, #16
1c: e152000e cmp r2, lr
20: 3150000c cmpcc r0, ip
24: e2814010 add r4, r1, #16
28: 23a0e001 movcs lr, #1
2c: 33a0e000 movcc lr, #0
30: e1520004 cmp r2, r4
34: 3151000c cmpcc r1, ip
38: 23a0c001 movcs ip, #1
3c: 33a0c000 movcc ip, #0
40: e00cc00e and ip, ip, lr
44: e3530013 cmp r3, #19
48: 93a0c000 movls ip, #0
4c: 820cc001 andhi ip, ip, #1
50: e35c0000 cmp ip, #0
54: 0a0000e2 beq 3e4 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3e4>
58: e200c007 and ip, r0, #7
5c: e26cc000 rsb ip, ip, #0
60: e20cc00f and ip, ip, #15
64: e15c0003 cmp ip, r3
68: 21a0c003 movcs ip, r3
6c: e35c0000 cmp ip, #0
70: 0a000059 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
74: e5d0e000 ldrb lr, [r0]
78: e35c0001 cmp ip, #1
7c: e5d14000 ldrb r4, [r1]
80: e064e00e rsb lr, r4, lr
84: e5c2e000 strb lr, [r2]
88: 0a000053 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
8c: e5d0e001 ldrb lr, [r0, #1]
90: e35c0002 cmp ip, #2
94: e5d14001 ldrb r4, [r1, #1]
98: e064e00e rsb lr, r4, lr
9c: e5c2e001 strb lr, [r2, #1]
a0: 0a00004d beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
a4: e5d0e002 ldrb lr, [r0, #2]
a8: e35c0003 cmp ip, #3
ac: e5d14002 ldrb r4, [r1, #2]
b0: e064e00e rsb lr, r4, lr
b4: e5c2e002 strb lr, [r2, #2]
b8: 0a000047 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
bc: e5d0e003 ldrb lr, [r0, #3]
c0: e35c0004 cmp ip, #4
c4: e5d14003 ldrb r4, [r1, #3]
c8: e064e00e rsb lr, r4, lr
cc: e5c2e003 strb lr, [r2, #3]
d0: 0a000041 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
d4: e5d0e004 ldrb lr, [r0, #4]
d8: e35c0005 cmp ip, #5
dc: e5d14004 ldrb r4, [r1, #4]
e0: e064e00e rsb lr, r4, lr
e4: e5c2e004 strb lr, [r2, #4]
e8: 0a00003b beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
ec: e5d0e005 ldrb lr, [r0, #5]
f0: e35c0006 cmp ip, #6
f4: e5d14005 ldrb r4, [r1, #5]
f8: e064e00e rsb lr, r4, lr
fc: e5c2e005 strb lr, [r2, #5]
100: 0a000035 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
104: e5d0e006 ldrb lr, [r0, #6]
108: e35c0007 cmp ip, #7
10c: e5d14006 ldrb r4, [r1, #6]
110: e064e00e rsb lr, r4, lr
114: e5c2e006 strb lr, [r2, #6]
118: 0a0000be beq 418 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x418>
11c: e5d0e007 ldrb lr, [r0, #7]
120: e35c0008 cmp ip, #8
124: e5d14007 ldrb r4, [r1, #7]
128: e064e00e rsb lr, r4, lr
12c: e5c2e007 strb lr, [r2, #7]
130: 0a000029 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
134: e5d0e008 ldrb lr, [r0, #8]
138: e35c0009 cmp ip, #9
13c: e5d14008 ldrb r4, [r1, #8]
140: e064e00e rsb lr, r4, lr
144: e5c2e008 strb lr, [r2, #8]
148: 0a000023 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
14c: e5d0e009 ldrb lr, [r0, #9]
150: e35c000a cmp ip, #10
154: e5d14009 ldrb r4, [r1, #9]
158: e064e00e rsb lr, r4, lr
15c: e5c2e009 strb lr, [r2, #9]
160: 0a00001d beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
164: e5d0e00a ldrb lr, [r0, #10]
168: e35c000b cmp ip, #11
16c: e5d1400a ldrb r4, [r1, #10]
170: e064e00e rsb lr, r4, lr
174: e5c2e00a strb lr, [r2, #10]
178: 0a000017 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
17c: e5d0e00b ldrb lr, [r0, #11]
180: e35c000c cmp ip, #12
184: e5d1400b ldrb r4, [r1, #11]
188: e064e00e rsb lr, r4, lr
18c: e5c2e00b strb lr, [r2, #11]
190: 0a000011 beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
194: e5d0e00c ldrb lr, [r0, #12]
198: e35c000d cmp ip, #13
19c: e5d1400c ldrb r4, [r1, #12]
1a0: e064e00e rsb lr, r4, lr
1a4: e5c2e00c strb lr, [r2, #12]
1a8: 0a00000b beq 1dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1dc>
1ac: e5d0e00d ldrb lr, [r0, #13]
1b0: e35c000f cmp ip, #15
1b4: e5d1400d ldrb r4, [r1, #13]
1b8: e064e00e rsb lr, r4, lr
1bc: e5c2e00d strb lr, [r2, #13]
1c0: 1a000092 bne 410 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x410>
1c4: e5d0400e ldrb r4, [r0, #14]
1c8: e1a0e00c mov lr, ip
1cc: e5d1500e ldrb r5, [r1, #14]
1d0: e0654004 rsb r4, r5, r4
1d4: e5c2400e strb r4, [r2, #14]
1d8: ea000000 b 1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
1dc: e1a0e00c mov lr, ip
1e0: e06c6003 rsb r6, ip, r3
1e4: e2435001 sub r5, r3, #1
1e8: e2464010 sub r4, r6, #16
1ec: e06c5005 rsb r5, ip, r5
1f0: e1a04224 lsr r4, r4, #4
1f4: e355000e cmp r5, #14
1f8: e2844001 add r4, r4, #1
1fc: e1a05204 lsl r5, r4, #4
200: 9a000010 bls 248 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x248>
204: e080900c add r9, r0, ip
208: e081800c add r8, r1, ip
20c: e3a07000 mov r7, #0
210: e082c00c add ip, r2, ip
214: f4690adf vld1.64 {d16-d17}, [r9 :64]
218: e2877001 add r7, r7, #1
21c: e1570004 cmp r7, r4
220: e2899010 add r9, r9, #16
224: f4682a0f vld1.8 {d18-d19}, [r8]
228: e2888010 add r8, r8, #16
22c: f34008e2 vsub.i8 q8, q8, q9
230: f44c0a0f vst1.8 {d16-d17}, [ip]
234: e28cc010 add ip, ip, #16
238: 3afffff5 bcc 214 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x214>
23c: e1560005 cmp r6, r5
240: e08ee005 add lr, lr, r5
244: 0a000064 beq 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
248: e7d0c00e ldrb ip, [r0, lr]
24c: e28e4001 add r4, lr, #1
250: e7d1500e ldrb r5, [r1, lr]
254: e1530004 cmp r3, r4
258: e065c00c rsb ip, r5, ip
25c: e7c2c00e strb ip, [r2, lr]
260: 9a00005d bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
264: e7d05004 ldrb r5, [r0, r4]
268: e28ec002 add ip, lr, #2
26c: e7d16004 ldrb r6, [r1, r4]
270: e153000c cmp r3, ip
274: e0665005 rsb r5, r6, r5
278: e7c25004 strb r5, [r2, r4]
27c: 9a000056 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
280: e7d0500c ldrb r5, [r0, ip]
284: e28e4003 add r4, lr, #3
288: e7d1600c ldrb r6, [r1, ip]
28c: e1530004 cmp r3, r4
290: e0665005 rsb r5, r6, r5
294: e7c2500c strb r5, [r2, ip]
298: 9a00004f bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
29c: e7d05004 ldrb r5, [r0, r4]
2a0: e28ec004 add ip, lr, #4
2a4: e7d16004 ldrb r6, [r1, r4]
2a8: e153000c cmp r3, ip
2ac: e0665005 rsb r5, r6, r5
2b0: e7c25004 strb r5, [r2, r4]
2b4: 9a000048 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2b8: e7d0500c ldrb r5, [r0, ip]
2bc: e28e4005 add r4, lr, #5
2c0: e7d1600c ldrb r6, [r1, ip]
2c4: e1530004 cmp r3, r4
2c8: e0665005 rsb r5, r6, r5
2cc: e7c2500c strb r5, [r2, ip]
2d0: 9a000041 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2d4: e7d05004 ldrb r5, [r0, r4]
2d8: e28ec006 add ip, lr, #6
2dc: e7d16004 ldrb r6, [r1, r4]
2e0: e153000c cmp r3, ip
2e4: e0665005 rsb r5, r6, r5
2e8: e7c25004 strb r5, [r2, r4]
2ec: 9a00003a bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
2f0: e7d0500c ldrb r5, [r0, ip]
2f4: e28e4007 add r4, lr, #7
2f8: e7d1600c ldrb r6, [r1, ip]
2fc: e1530004 cmp r3, r4
300: e0665005 rsb r5, r6, r5
304: e7c2500c strb r5, [r2, ip]
308: 9a000033 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
30c: e7d05004 ldrb r5, [r0, r4]
310: e28ec008 add ip, lr, #8
314: e7d16004 ldrb r6, [r1, r4]
318: e153000c cmp r3, ip
31c: e0665005 rsb r5, r6, r5
320: e7c25004 strb r5, [r2, r4]
324: 9a00002c bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
328: e7d0500c ldrb r5, [r0, ip]
32c: e28e4009 add r4, lr, #9
330: e7d1600c ldrb r6, [r1, ip]
334: e1530004 cmp r3, r4
338: e0665005 rsb r5, r6, r5
33c: e7c2500c strb r5, [r2, ip]
340: 9a000025 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
344: e7d05004 ldrb r5, [r0, r4]
348: e28ec00a add ip, lr, #10
34c: e7d16004 ldrb r6, [r1, r4]
350: e153000c cmp r3, ip
354: e0665005 rsb r5, r6, r5
358: e7c25004 strb r5, [r2, r4]
35c: 9a00001e bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
360: e7d0500c ldrb r5, [r0, ip]
364: e28e400b add r4, lr, #11
368: e7d1600c ldrb r6, [r1, ip]
36c: e1530004 cmp r3, r4
370: e0665005 rsb r5, r6, r5
374: e7c2500c strb r5, [r2, ip]
378: 9a000017 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
37c: e7d05004 ldrb r5, [r0, r4]
380: e28ec00c add ip, lr, #12
384: e7d16004 ldrb r6, [r1, r4]
388: e153000c cmp r3, ip
38c: e0665005 rsb r5, r6, r5
390: e7c25004 strb r5, [r2, r4]
394: 9a000010 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
398: e7d0500c ldrb r5, [r0, ip]
39c: e28e400d add r4, lr, #13
3a0: e7d1600c ldrb r6, [r1, ip]
3a4: e1530004 cmp r3, r4
3a8: e0665005 rsb r5, r6, r5
3ac: e7c2500c strb r5, [r2, ip]
3b0: 9a000009 bls 3dc <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3dc>
3b4: e7d05004 ldrb r5, [r0, r4]
3b8: e28ec00e add ip, lr, #14
3bc: e7d1e004 ldrb lr, [r1, r4]
3c0: e153000c cmp r3, ip
3c4: e06e3005 rsb r3, lr, r5
3c8: e7c23004 strb r3, [r2, r4]
3cc: 87d0300c ldrbhi r3, [r0, ip]
3d0: 87d1100c ldrbhi r1, [r1, ip]
3d4: 80613003 rsbhi r3, r1, r3
3d8: 87c2300c strbhi r3, [r2, ip]
3dc: e3a00000 mov r0, #0
3e0: e8bd83f8 pop {r3, r4, r5, r6, r7, r8, r9, pc}
3e4: e2411001 sub r1, r1, #1
3e8: e0803003 add r3, r0, r3
3ec: e2422001 sub r2, r2, #1
3f0: e4d0c001 ldrb ip, [r0], #1
3f4: e5f1e001 ldrb lr, [r1, #1]!
3f8: e1500003 cmp r0, r3
3fc: e06ec00c rsb ip, lr, ip
400: e5e2c001 strb ip, [r2, #1]!
404: 1afffff9 bne 3f0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x3f0>
408: e3a00000 mov r0, #0
40c: e8bd83f8 pop {r3, r4, r5, r6, r7, r8, r9, pc}
410: e3a0e00e mov lr, #14
414: eaffff71 b 1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
418: e3a0e007 mov lr, #7
41c: eaffff6f b 1e0 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x1e0>
答案 0 :(得分:2)
编译器写了这个(埋在很多设置代码中以处理边缘情况):
214: f4690adf vld1.64 {d16-d17}, [r9 :64]
218: e2877001 add r7, r7, #1
21c: e1570004 cmp r7, r4
220: e2899010 add r9, r9, #16
224: f4682a0f vld1.8 {d18-d19}, [r8]
228: e2888010 add r8, r8, #16
22c: f34008e2 vsub.i8 q8, q8, q9
230: f44c0a0f vst1.8 {d16-d17}, [ip]
234: e28cc010 add ip, ip, #16
238: 3afffff5 bcc 214 <code_c(unsigned char*, unsigned char*, unsigned char*, unsigned int)+0x214>
我的NEON非常生疏,我不打算在这里解码每一行(我错了;希望有人会提供更完整的答案),但这是在16个字节加载时间进入两个128位寄存器,并行地减去所有16个寄存器,然后将它们全部写回目标。所以这就是你正在寻找的矢量化。您的NEON代码可能比编译器稍微快一些的事实至少部分是因为您没有检查n
不能被8整除的边缘情况。编译器会这样做。
在大多数情况下,内在函数不会有所帮助。如果您试图击败编译器,您将需要自己处理整个管道,并且内在函数不够强大。您需要能够选择寄存器,决定何时读取和写入内存,并且非常谨慎地管理数据布局,甚至开始打败编译器(因为它已经完成了所有这些)。
为什么编译器的并行行为通常比手写更好,即使你写的内容基本相同?那么,你如何管理故障?装配说明不是串联运行的;其中许多并行运行。通常,当您执行昂贵的指令时,您可能无法在几个时钟周期内读取结果。如果您尝试,处理器必须停止并等待。为了避免这个问题,你经常以非常奇怪的顺序编写汇编,例如&#34;开始计算,加载下一个数据,写入计算结果。&#34;用内在函数很难实现。
您在@ yeoman的答案中的一些评论:
绝对不是。执行时间取决于执行的汇编指令的数量以及这些指令的执行顺序和执行顺序。非常,经常(几乎总是)更快的代码是装配中更长的代码。 (当然不承诺反过来......)最着名的例子是循环展开。连续3次切割和粘贴操作将比计数为3的循环更快。仅在那里避免分支就会很大。因此,当编译器事先知道迭代次数时,编译器会自动展开小循环。
应该有8个并行操作,编译器会生成它。但是你的代码并没有;它一次做一个。
使用NEON确实让它变得更快;编译器已经使用了NEON。
对于略有不同的问题(讨论iOS中的Accelerate框架),但仍然解决相同的基本问题,请参阅Introduction to Fast Bezier。
同样重申@ yeoman的观点:如果一些非常简单和机械的改变可以使你的C代码变得更快,编译器就会为你做这件事(而且确实如此)。
答案 1 :(得分:1)
我猜你的C编译器,更具体地说是它的ARM后端,知道你正在编译的ARM架构的一些事情。你的例子是非常简单和普通的,可能有一个优化:)
手动优化代码最适用于奇怪且不常见的情况,因此编译器只是放弃并使用实际的非展开循环对线性代码进行简单的1:1转换,当然可以通过以下方式大大改进手动优化:)