试图优化装配中的子程序

时间:2015-10-31 19:39:42

标签: c gcc assembly optimization disassembly

所以我正在尝试用纯粹的装配来优化这个C函数,该函数需要2个图像并逐像素地输出它在灰度级中的最大差异:

unsigned short infNorm(unsigned char x1, unsigned char y1, unsigned char z1, unsigned char x2, unsigned char y2, unsigned char z2 ){
    short x = abs(x1-x2);
    short y = abs(y1-y2);
    short z = abs(z1-z2);
    if(x <= y){
        return y > z ? y : z;
    }else{
        return x > z ? x : z;
    }
}

void diff_c (unsigned char *src, unsigned char *src_2, unsigned char *dst, int m, int n, int src_row_size, int src_2_row_size, int dst_row_size) {
    unsigned char (*src_matrix)[src_row_size]       = (unsigned char (*)[src_row_size]) src;
    unsigned char (*src_2_matrix)[src_2_row_size]   = (unsigned char (*)[src_2_row_size]) src_2;
    unsigned char (*dst_matrix)[dst_row_size]       = (unsigned char (*)[dst_row_size]) dst;

    unsigned char n1;
    for(int y = 0; y < n; y++){
        for(int x = 0; x < m; ++x ){
            n1 = infNorm(src_matrix[y][x*4], src_matrix[y][x*4+1], src_matrix[y][x*4+2],
                        src_2_matrix[y][x*4], src_2_matrix[y][x*4+1], src_2_matrix[y][x*4+2]);

            dst_matrix[y][x*4]   = n1;                      //blue
            dst_matrix[y][x*4+1] = n1;                      //red
            dst_matrix[y][x*4+2] = n1;                      //green
            dst_matrix[y][x*4+3] = 255;                     //alpha
        }
    }
}

我使用SIMD做了这个,并且得到的函数比任何优化标志上gcc的任何输出都要快得多。现在,我想做同样的事情,但不使用任何SSE指令。对于这个我objdumped gcc的构建并隔离了执行这两个函数的部分(gcc -O3),以便对gcc做的事情有所了解:

0000000000402280 <infNorm>:
402280: 40 0f b6 ff           movzx  edi,dil
402284: 0f b6 c9              movzx  ecx,cl
402287: 40 0f b6 f6           movzx  esi,sil
40228b: 29 cf                 sub    edi,ecx
40228d: 45 0f b6 c0           movzx  r8d,r8b
402291: 0f b6 d2              movzx  edx,dl
402294: 89 f8                 mov    eax,edi
402296: 44 29 c6              sub    esi,r8d
402299: 45 0f b6 c9           movzx  r9d,r9b
40229d: c1 f8 1f              sar    eax,0x1f
4022a0: 44 29 ca              sub    edx,r9d
4022a3: 31 c7                 xor    edi,eax
4022a5: 29 c7                 sub    edi,eax
4022a7: 89 f0                 mov    eax,esi
4022a9: c1 f8 1f              sar    eax,0x1f
4022ac: 31 c6                 xor    esi,eax
4022ae: 29 c6                 sub    esi,eax
4022b0: 89 d0                 mov    eax,edx
4022b2: c1 f8 1f              sar    eax,0x1f
4022b5: 31 c2                 xor    edx,eax
4022b7: 29 c2                 sub    edx,eax
4022b9: 66 39 f7              cmp    di,si
4022bc: 7e 12                 jle    4022d0 <infNorm+0x50>
4022be: 66 39 d7              cmp    di,dx
4022c1: 89 d0                 mov    eax,edx
4022c3: 0f 4d c7              cmovge eax,edi
4022c6: c3                    ret    
4022c7: 66 0f 1f 84 00 00 00  nop    WORD PTR [rax+rax*1+0x0]
4022ce: 00 00 
4022d0: 66 39 d6              cmp    si,dx
4022d3: 89 d0                 mov    eax,edx
4022d5: 0f 4d c6              cmovge eax,esi
4022d8: c3                    ret    
4022d9: 0f 1f 80 00 00 00 00  nop    DWORD PTR [rax+0x0]

00000000004022e0 <diff_c>:
4022e0: 45 85 c0              test   r8d,r8d
4022e3: 0f 8e 05 01 00 00     jle    4023ee <diff_c+0x10e>
4022e9: 41 57                 push   r15
4022eb: 45 31 ff              xor    r15d,r15d
4022ee: 41 56                 push   r14
4022f0: 44 8d 34 8d 00 00 00  lea    r14d,[rcx*4+0x0]
4022f7: 00 
4022f8: 41 55                 push   r13
4022fa: 41 54                 push   r12
4022fc: 49 89 f4              mov    r12,rsi
4022ff: 55                    push   rbp
402300: 48 89 fd              mov    rbp,rdi
402303: 53                    push   rbx
402304: 48 63 44 24 38        movsxd rax,DWORD PTR [rsp+0x38]
402309: 48 89 44 24 e8        mov    QWORD PTR [rsp-0x18],rax
40230e: 49 63 c1              movsxd rax,r9d
402311: 48 89 44 24 f8        mov    QWORD PTR [rsp-0x8],rax
402316: 48 63 44 24 40        movsxd rax,DWORD PTR [rsp+0x40]
40231b: 48 89 44 24 f0        mov    QWORD PTR [rsp-0x10],rax
402320: 85 c9                 test   ecx,ecx
402322: 0f 8e a0 00 00 00     jle    4023c8 <diff_c+0xe8>
402328: 31 c0                 xor    eax,eax
40232a: 66 0f 1f 44 00 00     nop    WORD PTR [rax+rax*1+0x0]
402330: 48 63 d8              movsxd rbx,eax
402333: 44 8d 58 01           lea    r11d,[rax+0x1]
402337: 44 8d 50 02           lea    r10d,[rax+0x2]
40233b: 41 0f b6 34 1c        movzx  esi,BYTE PTR [r12+rbx*1]
402340: 44 0f b6 4c 1d 00     movzx  r9d,BYTE PTR [rbp+rbx*1+0x0]
402346: 4d 63 db              movsxd r11,r11d
402349: 4d 63 d2              movsxd r10,r10d
40234c: 42 0f b6 7c 1d 00     movzx  edi,BYTE PTR [rbp+r11*1+0x0]
402352: 47 0f b6 2c 14        movzx  r13d,BYTE PTR [r12+r10*1]
402357: 41 29 f1              sub    r9d,esi
40235a: 44 89 ce              mov    esi,r9d
40235d: c1 fe 1f              sar    esi,0x1f
402360: 41 31 f1              xor    r9d,esi
402363: 41 29 f1              sub    r9d,esi
402366: 43 0f b6 34 1c        movzx  esi,BYTE PTR [r12+r11*1]
40236b: 29 f7                 sub    edi,esi
40236d: 89 fe                 mov    esi,edi
40236f: c1 fe 1f              sar    esi,0x1f
402372: 31 f7                 xor    edi,esi
402374: 29 f7                 sub    edi,esi
402376: 42 0f b6 74 15 00     movzx  esi,BYTE PTR [rbp+r10*1+0x0]
40237c: 44 29 ee              sub    esi,r13d
40237f: 41 89 f5              mov    r13d,esi
402382: 41 c1 fd 1f           sar    r13d,0x1f
402386: 44 31 ee              xor    esi,r13d
402389: 44 29 ee              sub    esi,r13d
40238c: 41 89 fd              mov    r13d,edi
40238f: 66 39 fe              cmp    si,di
402392: 44 0f 4d ee           cmovge r13d,esi
402396: 66 44 39 ce           cmp    si,r9w
40239a: 41 0f 4c f1           cmovl  esi,r9d
40239e: 66 41 39 f9           cmp    r9w,di
4023a2: 41 0f 4e f5           cmovle esi,r13d
4023a6: 40 88 34 1a           mov    BYTE PTR [rdx+rbx*1],sil
4023aa: 42 88 34 1a           mov    BYTE PTR [rdx+r11*1],sil
4023ae: 42 88 34 12           mov    BYTE PTR [rdx+r10*1],sil
4023b2: 8d 70 03              lea    esi,[rax+0x3]
4023b5: 83 c0 04              add    eax,0x4
4023b8: 44 39 f0              cmp    eax,r14d
4023bb: 48 63 f6              movsxd rsi,esi
4023be: c6 04 32 ff           mov    BYTE PTR [rdx+rsi*1],0xff
4023c2: 0f 85 68 ff ff ff     jne    402330 <diff_c+0x50>
4023c8: 41 83 c7 01           add    r15d,0x1
4023cc: 4c 03 64 24 e8        add    r12,QWORD PTR [rsp-0x18]
4023d1: 48 03 6c 24 f8        add    rbp,QWORD PTR [rsp-0x8]
4023d6: 48 03 54 24 f0        add    rdx,QWORD PTR [rsp-0x10]
4023db: 45 39 c7              cmp    r15d,r8d
4023de: 0f 85 3c ff ff ff     jne    402320 <diff_c+0x40>
4023e4: 5b                    pop    rbx
4023e5: 5d                    pop    rbp
4023e6: 41 5c                 pop    r12
4023e8: 41 5d                 pop    r13
4023ea: 41 5e                 pop    r14
4023ec: 41 5f                 pop    r15
4023ee: f3 c3                 repz ret 

所以考虑到这一点,与gcc正在做的事情相反的想法是尽可能避免访问内存并使用类似于gcc的东西我将循环展开了一次并编写了这个:

diff_asm:
    push rbp
    push r12
    push r13
    push r14
    push r15
    push rbx
    sub rsp, 8

    mov r15, rdx
    mov eax, r8d
    mov ecx, ecx
    mul rcx
    mov rcx, rax
    mov rdx, r15

    .cicle:
    cmp rcx, 0
    je .tend
    mov rbx, [rdi + rcx*4 - 8]
    movzx r12, bl
    shr rbx, 8
    movzx r13, bl
    shr rbx, 8
    movzx r14, bl
    shr rbx, 16
    mov r15, [rsi + rcx*4 - 8]
    movzx rbp, r15b
    shr r15, 8
    movzx r8, r15b
    shr r15, 8
    movzx r9, r15b
    shr r15, 16

    call inf_norm

    movzx r12, bl
    shr rbx, 8
    movzx r13, bl
    shr rbx, 8
    movzx r14, bl
    shr rbx, 8
    movzx rbp, r15b
    shr r15, 8
    movzx r8, r15b
    shr r15, 8
    movzx r9, r15b
    shr r15, 8

    call inf_norm 

    mov [rdx + rcx*4 - 8], rax

    sub rcx, 2
    jmp .cicle

    .tend:

    add rsp, 8
    pop rbx
    pop r15
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

; r12w = x_1
; r13w = y_1
; r14w = z_1
; rbp  = x_2
; r8   = y_2
; r9   = z_3
inf_norm:
    sub r12d, ebp
    mov ebp, r12d
    sar ebp, 0x1f
    xor r12d, ebp
    sub r12d, ebp

    sub r13d, r8d
    mov r8d, r13d
    sar r8d, 0x1f
    xor r13d, r8d
    sub r13d, r8d

    sub r14d, r9d
    mov r9d, r14d
    sar r9d, 0x1f
    xor r14d, r9d
    sub r14d, r9d

    cmp r12w, r13w
    jg .z_y
    cmp r13w, r14w
    cmovl r13d, r14d
    jmp .insert

    .z_y:
    cmp r12w, r14w
    mov r13d, r14d
    cmovge r13d, r12d

    .insert:

    mov al, r13b
    ror rax, 8
    mov al, r13b
    ror rax, 8
    mov al, r13b
    ror rax, 8
    mov al, 255       
    ror rax, 8
    ret

所以现在我知道我的手工制作的汇编程序在某些情况下几乎比gcc -O3 / -O2慢两倍但是对于我的生活我不明白为什么。这些电话会影响性能吗?还是轮换?使用shift而不是旋转编写了一个实现,但它比这个慢一点。任何帮助将不胜感激。

0 个答案:

没有答案