所以我正在尝试用纯粹的装配来优化这个C函数,该函数需要2个图像并逐像素地输出它在灰度级中的最大差异:
unsigned short infNorm(unsigned char x1, unsigned char y1, unsigned char z1, unsigned char x2, unsigned char y2, unsigned char z2 ){
short x = abs(x1-x2);
short y = abs(y1-y2);
short z = abs(z1-z2);
if(x <= y){
return y > z ? y : z;
}else{
return x > z ? x : z;
}
}
void diff_c (unsigned char *src, unsigned char *src_2, unsigned char *dst, int m, int n, int src_row_size, int src_2_row_size, int dst_row_size) {
unsigned char (*src_matrix)[src_row_size] = (unsigned char (*)[src_row_size]) src;
unsigned char (*src_2_matrix)[src_2_row_size] = (unsigned char (*)[src_2_row_size]) src_2;
unsigned char (*dst_matrix)[dst_row_size] = (unsigned char (*)[dst_row_size]) dst;
unsigned char n1;
for(int y = 0; y < n; y++){
for(int x = 0; x < m; ++x ){
n1 = infNorm(src_matrix[y][x*4], src_matrix[y][x*4+1], src_matrix[y][x*4+2],
src_2_matrix[y][x*4], src_2_matrix[y][x*4+1], src_2_matrix[y][x*4+2]);
dst_matrix[y][x*4] = n1; //blue
dst_matrix[y][x*4+1] = n1; //red
dst_matrix[y][x*4+2] = n1; //green
dst_matrix[y][x*4+3] = 255; //alpha
}
}
}
我使用SIMD做了这个,并且得到的函数比任何优化标志上gcc的任何输出都要快得多。现在,我想做同样的事情,但不使用任何SSE指令。对于这个我objdumped gcc的构建并隔离了执行这两个函数的部分(gcc -O3),以便对gcc做的事情有所了解:
0000000000402280 <infNorm>:
402280: 40 0f b6 ff movzx edi,dil
402284: 0f b6 c9 movzx ecx,cl
402287: 40 0f b6 f6 movzx esi,sil
40228b: 29 cf sub edi,ecx
40228d: 45 0f b6 c0 movzx r8d,r8b
402291: 0f b6 d2 movzx edx,dl
402294: 89 f8 mov eax,edi
402296: 44 29 c6 sub esi,r8d
402299: 45 0f b6 c9 movzx r9d,r9b
40229d: c1 f8 1f sar eax,0x1f
4022a0: 44 29 ca sub edx,r9d
4022a3: 31 c7 xor edi,eax
4022a5: 29 c7 sub edi,eax
4022a7: 89 f0 mov eax,esi
4022a9: c1 f8 1f sar eax,0x1f
4022ac: 31 c6 xor esi,eax
4022ae: 29 c6 sub esi,eax
4022b0: 89 d0 mov eax,edx
4022b2: c1 f8 1f sar eax,0x1f
4022b5: 31 c2 xor edx,eax
4022b7: 29 c2 sub edx,eax
4022b9: 66 39 f7 cmp di,si
4022bc: 7e 12 jle 4022d0 <infNorm+0x50>
4022be: 66 39 d7 cmp di,dx
4022c1: 89 d0 mov eax,edx
4022c3: 0f 4d c7 cmovge eax,edi
4022c6: c3 ret
4022c7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
4022ce: 00 00
4022d0: 66 39 d6 cmp si,dx
4022d3: 89 d0 mov eax,edx
4022d5: 0f 4d c6 cmovge eax,esi
4022d8: c3 ret
4022d9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
00000000004022e0 <diff_c>:
4022e0: 45 85 c0 test r8d,r8d
4022e3: 0f 8e 05 01 00 00 jle 4023ee <diff_c+0x10e>
4022e9: 41 57 push r15
4022eb: 45 31 ff xor r15d,r15d
4022ee: 41 56 push r14
4022f0: 44 8d 34 8d 00 00 00 lea r14d,[rcx*4+0x0]
4022f7: 00
4022f8: 41 55 push r13
4022fa: 41 54 push r12
4022fc: 49 89 f4 mov r12,rsi
4022ff: 55 push rbp
402300: 48 89 fd mov rbp,rdi
402303: 53 push rbx
402304: 48 63 44 24 38 movsxd rax,DWORD PTR [rsp+0x38]
402309: 48 89 44 24 e8 mov QWORD PTR [rsp-0x18],rax
40230e: 49 63 c1 movsxd rax,r9d
402311: 48 89 44 24 f8 mov QWORD PTR [rsp-0x8],rax
402316: 48 63 44 24 40 movsxd rax,DWORD PTR [rsp+0x40]
40231b: 48 89 44 24 f0 mov QWORD PTR [rsp-0x10],rax
402320: 85 c9 test ecx,ecx
402322: 0f 8e a0 00 00 00 jle 4023c8 <diff_c+0xe8>
402328: 31 c0 xor eax,eax
40232a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
402330: 48 63 d8 movsxd rbx,eax
402333: 44 8d 58 01 lea r11d,[rax+0x1]
402337: 44 8d 50 02 lea r10d,[rax+0x2]
40233b: 41 0f b6 34 1c movzx esi,BYTE PTR [r12+rbx*1]
402340: 44 0f b6 4c 1d 00 movzx r9d,BYTE PTR [rbp+rbx*1+0x0]
402346: 4d 63 db movsxd r11,r11d
402349: 4d 63 d2 movsxd r10,r10d
40234c: 42 0f b6 7c 1d 00 movzx edi,BYTE PTR [rbp+r11*1+0x0]
402352: 47 0f b6 2c 14 movzx r13d,BYTE PTR [r12+r10*1]
402357: 41 29 f1 sub r9d,esi
40235a: 44 89 ce mov esi,r9d
40235d: c1 fe 1f sar esi,0x1f
402360: 41 31 f1 xor r9d,esi
402363: 41 29 f1 sub r9d,esi
402366: 43 0f b6 34 1c movzx esi,BYTE PTR [r12+r11*1]
40236b: 29 f7 sub edi,esi
40236d: 89 fe mov esi,edi
40236f: c1 fe 1f sar esi,0x1f
402372: 31 f7 xor edi,esi
402374: 29 f7 sub edi,esi
402376: 42 0f b6 74 15 00 movzx esi,BYTE PTR [rbp+r10*1+0x0]
40237c: 44 29 ee sub esi,r13d
40237f: 41 89 f5 mov r13d,esi
402382: 41 c1 fd 1f sar r13d,0x1f
402386: 44 31 ee xor esi,r13d
402389: 44 29 ee sub esi,r13d
40238c: 41 89 fd mov r13d,edi
40238f: 66 39 fe cmp si,di
402392: 44 0f 4d ee cmovge r13d,esi
402396: 66 44 39 ce cmp si,r9w
40239a: 41 0f 4c f1 cmovl esi,r9d
40239e: 66 41 39 f9 cmp r9w,di
4023a2: 41 0f 4e f5 cmovle esi,r13d
4023a6: 40 88 34 1a mov BYTE PTR [rdx+rbx*1],sil
4023aa: 42 88 34 1a mov BYTE PTR [rdx+r11*1],sil
4023ae: 42 88 34 12 mov BYTE PTR [rdx+r10*1],sil
4023b2: 8d 70 03 lea esi,[rax+0x3]
4023b5: 83 c0 04 add eax,0x4
4023b8: 44 39 f0 cmp eax,r14d
4023bb: 48 63 f6 movsxd rsi,esi
4023be: c6 04 32 ff mov BYTE PTR [rdx+rsi*1],0xff
4023c2: 0f 85 68 ff ff ff jne 402330 <diff_c+0x50>
4023c8: 41 83 c7 01 add r15d,0x1
4023cc: 4c 03 64 24 e8 add r12,QWORD PTR [rsp-0x18]
4023d1: 48 03 6c 24 f8 add rbp,QWORD PTR [rsp-0x8]
4023d6: 48 03 54 24 f0 add rdx,QWORD PTR [rsp-0x10]
4023db: 45 39 c7 cmp r15d,r8d
4023de: 0f 85 3c ff ff ff jne 402320 <diff_c+0x40>
4023e4: 5b pop rbx
4023e5: 5d pop rbp
4023e6: 41 5c pop r12
4023e8: 41 5d pop r13
4023ea: 41 5e pop r14
4023ec: 41 5f pop r15
4023ee: f3 c3 repz ret
所以考虑到这一点,与gcc正在做的事情相反的想法是尽可能避免访问内存并使用类似于gcc的东西我将循环展开了一次并编写了这个:
diff_asm:
push rbp
push r12
push r13
push r14
push r15
push rbx
sub rsp, 8
mov r15, rdx
mov eax, r8d
mov ecx, ecx
mul rcx
mov rcx, rax
mov rdx, r15
.cicle:
cmp rcx, 0
je .tend
mov rbx, [rdi + rcx*4 - 8]
movzx r12, bl
shr rbx, 8
movzx r13, bl
shr rbx, 8
movzx r14, bl
shr rbx, 16
mov r15, [rsi + rcx*4 - 8]
movzx rbp, r15b
shr r15, 8
movzx r8, r15b
shr r15, 8
movzx r9, r15b
shr r15, 16
call inf_norm
movzx r12, bl
shr rbx, 8
movzx r13, bl
shr rbx, 8
movzx r14, bl
shr rbx, 8
movzx rbp, r15b
shr r15, 8
movzx r8, r15b
shr r15, 8
movzx r9, r15b
shr r15, 8
call inf_norm
mov [rdx + rcx*4 - 8], rax
sub rcx, 2
jmp .cicle
.tend:
add rsp, 8
pop rbx
pop r15
pop r14
pop r13
pop r12
pop rbp
ret
; r12w = x_1
; r13w = y_1
; r14w = z_1
; rbp = x_2
; r8 = y_2
; r9 = z_3
inf_norm:
sub r12d, ebp
mov ebp, r12d
sar ebp, 0x1f
xor r12d, ebp
sub r12d, ebp
sub r13d, r8d
mov r8d, r13d
sar r8d, 0x1f
xor r13d, r8d
sub r13d, r8d
sub r14d, r9d
mov r9d, r14d
sar r9d, 0x1f
xor r14d, r9d
sub r14d, r9d
cmp r12w, r13w
jg .z_y
cmp r13w, r14w
cmovl r13d, r14d
jmp .insert
.z_y:
cmp r12w, r14w
mov r13d, r14d
cmovge r13d, r12d
.insert:
mov al, r13b
ror rax, 8
mov al, r13b
ror rax, 8
mov al, r13b
ror rax, 8
mov al, 255
ror rax, 8
ret
所以现在我知道我的手工制作的汇编程序在某些情况下几乎比gcc -O3 / -O2慢两倍但是对于我的生活我不明白为什么。这些电话会影响性能吗?还是轮换?使用shift而不是旋转编写了一个实现,但它比这个慢一点。任何帮助将不胜感激。