你能用C ++内联汇编为我推荐更好的解决方案吗?

时间:2012-07-20 12:49:26

标签: c++ optimization assembly inline-assembly

我正在学习汇编,并开始在 Digital Mars C++ 编译器中试验 SSE 和 MMX 寄存器(Intel 语法更易于阅读)。我写了一个程序,把数值 var_1 转换成 var_2 进制的表示(目前只支持 8 位,以后会扩展到 32、64、128 位)。程序用两种方式实现:

  1. __asm内联

  2. %(modulo)运算符的常用C ++方式。

  3. 问题:你能告诉我使用xmm0-7和mm0-7寄存器的更有效方法吗?你能告诉我如何用al,ah ... 8位寄存器交换它们的确切字节吗?

    在我的电脑(Pentium-M Centrino 2.0GHz)上,与 __asm 版本相比,C++ 中常用的 %(取模)运算符非常慢。如果你能告诉我如何去掉 __asm 中的除法指令,它还会更快。

    当我运行程序时,它给了我:

    (for the values: var_1=17,var_2=2,all loops are 200M times)
    
    17 is 10001 in number system 2
    __asm(clock)...........: 7250    <------too bad. it is 8-bit calc.
    C++(clock).............: 12250   <------not very slow(var_2 is a power of 2)
    
    
    (for the values: var_1=33,var_2=7,all loops are 200M times)
    33 is 45 in number system 7
     __asm(clock)..........: 2875   <-------not good. it is 8-bit calc.
     C++(clock)............: 6328   <----------------really slow(var_2 is not a power of 2)
    

    第二个C ++代码(带%运算符的代码)://///////////////////////////////// //////////////////////

    // Benchmark: convert x to the number system g 200 million times using
    // the / and % operators. Digits are written to var_3 least-significant
    // first; when the loop finishes, counter is the index of the most
    // significant digit (so the number has counter+1 digits).
    // Requires g >= 2: g == 1 never terminates and g == 0 divides by zero.
    t1=clock();//reference time
    for(int i=0;i<200000000;i++)
    {
        y=x;
        counter=0;
        // The test must be >=, not >: when y == g two digits ("10") are
        // still pending, and stopping here would lose the leading 1.
        while(y>=g)
        {
            var_3[counter]=y%g; // lowest remaining digit
            y/=g;               // drop that digit
            counter++;
        }
        // y < g now, so this stores the most significant digit.
        var_3[counter]=y%g;
    }
    t2=clock();//final time
    

    _asm code://////////////////////////////////////////// ////////////////////////////////////////////////// //////////////

         // Benchmark: convert the low byte of var_1 to the number system given
         // by the low byte of var_2, 200,000,000 times, using 8-bit DIV.
         // Digit i goes to the low byte of var_3[i] (least-significant digit
         // first); the digit count of the last pass is stored in var_3_size.
         // Requires the low byte of var_2 to be >= 2 (0 raises a divide error,
         // 1 never terminates). Only one byte per element is written, so the
         // upper bytes of var_3[] must already be zero.
         __asm
            {

            pushf               //back up flags and every register used below
            push eax
            push ebx
            push ecx
            push edx

                //movd zero-extends into the whole xmm register, so the xmm
                //registers need no separate clearing before they are loaded
                pxor xmm0,xmm0      //xmm0 = outer loop counter j = 0
                movd xmm1,[var_1]   //xmm1 = number to convert
                movd xmm2,[var_2]   //xmm2 = number system (base)
                lea ebx,var_3       //ebx = &var_3[0], constant from here on

                for_loop:
                movd ecx,xmm2       //cl = base (the divisor)
                movd eax,xmm1       //al = value (the dividend)
                xor edx,edx         //edx = digit index i = 0
                dng:
                    mov ah,00h          //ax = al: dividend for the 8-bit division
                    div cl              //al = quotient, ah = remainder
                    mov [ebx+edx*4],ah  //low byte of var_3[i] = remainder
                    inc edx             //i++
                    test al,al          //anything left to convert?
                jne dng                 //if yes, produce the next digit

                //here edx holds the number of digits

                movd eax,xmm0       //fetch the outer loop counter
                add eax,01h         //j++
                movd xmm0,eax       //store it back
                cmp eax,0BEBC200h   //is j < 200,000,000 ?
                jb for_loop         //if yes, run another pass
                mov [var_3_size],edx //publish the digit count

            pop edx             //restore in reverse push order
            pop ecx
            pop ebx
            pop eax
            popf

            }
    

    整码://////////////////////////////////////////// /////////////////////////////////////////////

    #include <iostream.h>
    #include <cmath>
    #include<stdlib.h>
    #include<stdio.h>
    #include<time.h>
    // Benchmarks two ways of converting var_1 into the number system var_2,
    // 200 million times each: an inline-assembly loop using 8-bit DIV, and a
    // plain C++ loop using / and %. After each benchmark it prints the elapsed
    // clock() ticks and the digits, most significant first.
    // Limits: the asm path only looks at the low byte of var_1 and var_2;
    // var_2 must be >= 2 for either loop to terminate.
    int main()
        {

        srand(time(0));


        clock_t t1=clock();
        clock_t t2=clock();

        int var_1=17;  //number itself
        int var_2=2;   //number system
        int var_3[100];  //digits, least significant first (maximum 100)
        int var_3_size=0;//digit count, set by the asm block

        for(int i=0;i<100;i++)
        {
        var_3[i]=0; //the asm block only writes the low byte of each element
        }


        t1=clock();//reference time to take
         __asm
            {

            pushf               //back up flags and every register used below
            push eax
            push ebx
            push ecx
            push edx

                //movd zero-extends into the whole xmm register, so the xmm
                //registers need no separate clearing before they are loaded
                pxor xmm0,xmm0      //xmm0 = outer loop counter j = 0
                movd xmm1,[var_1]   //xmm1 = number to convert
                movd xmm2,[var_2]   //xmm2 = number system (base)
                lea ebx,var_3       //ebx = &var_3[0], constant from here on

                for_loop:
                movd ecx,xmm2       //cl = base (the divisor)
                movd eax,xmm1       //al = value (the dividend)
                xor edx,edx         //edx = digit index i = 0
                dng:
                    mov ah,00h          //ax = al: dividend for the 8-bit division
                    div cl              //al = quotient, ah = remainder
                    mov [ebx+edx*4],ah  //low byte of var_3[i] = remainder
                    inc edx             //i++
                    test al,al          //anything left to convert?
                jne dng                 //if yes, produce the next digit

                //here edx holds the number of digits

                movd eax,xmm0       //fetch the outer loop counter
                add eax,01h         //j++
                movd xmm0,eax       //store it back
                cmp eax,0BEBC200h   //is j < 200,000,000 ?
                jb for_loop         //if yes, run another pass
                mov [var_3_size],edx //publish the digit count

            pop edx             //restore in reverse push order
            pop ecx
            pop ebx
            pop eax
            popf

            }
        t2=clock(); //finish time
        //clock_t is not necessarily int, so cast to match the %i conversion
        printf("\n assembly_inline(clocks): %i  for the 200 million calculations",(int)(t2-t1));

            printf("\n value %i(in decimal) is: ",var_1);
    for(int i=var_3_size-1;i>=0;i--)
    {
        printf("%i",var_3[i]);
    }
    printf(" in the number system: %i \n",var_2);




    //and: more readable form(and easier)
        int counter=0; //index of the digit being written
        int x=var_1;
        int g=var_2;
        int y=x;// working copy
    t1=clock();//reference time

    for(int i=0;i<200000000;i++)
    {
        y=x;
        counter=0;
        // The test must be >=, not >: when y == g two digits ("10") are
        // still pending, and stopping here would lose the leading 1.
        while(y>=g)
        {
            var_3[counter]=y%g;
            y/=g;
            counter++;
        }
        // y < g now, so this stores the most significant digit.
        var_3[counter]=y%g;
    }

    t2=clock();//final time
    printf("\n C++(clocks): %i  for the 200 million calculations",(int)(t2-t1));

    printf("\n value %i(in decimal) is: ",x);
    // Print what this loop produced (counter = index of the top digit)
    // instead of relying on the digit count left behind by the asm block.
    for(int i=counter;i>=0;i--)
    {
        printf("%i",var_3[i]);
    }
    printf(" in the number system: %i \n",g);
    return 0;


    }

    编辑: 这是32位版本

        // Converts the unsigned 32-bit value in variable_x to the number system
        // number_system using 32-bit DIV. Digits are stored as 32-bit values in
        // digits[], least significant first, and their count in digits_total.
        // A value of 0 yields the single digit 0.
        // Requires number_system >= 2 (0 raises a divide error, 1 never
        // terminates). Overwrites xmm0..xmm4, which are used as scratch storage
        // for the general-purpose registers.
        void get_digits_asm()
    {
        __asm
        {

            pushf           //flags cannot be parked in an xmm register
            movd xmm0,eax   //park the registers in xmm instead of pushing
            movd xmm1,ebx
            movd xmm2,ecx
            movd xmm3,edx
            movd xmm4,edi   //end of register backup

            mov eax,[variable_x]    //eax = value still to be converted
            mov ebx,[number_system] //ebx = base
            lea edi,digits          //edi = &digits[0], loop invariant
            xor ecx,ecx             //ecx = number of digits written

            begin_loop:
            xor edx,edx             //edx:eax = eax, dividend for 32-bit div
            div ebx                 //eax = quotient, edx = remainder
            mov [edi+ecx*4],edx     //digits[ecx] = remainder
            inc ecx
            //Loop until the quotient is exhausted. Comparing the quotient
            //with the base instead loses the top digit when they are equal
            //and adds a leading zero when the value is below the base.
            test eax,eax
            jnz begin_loop

            mov [digits_total],ecx


            movd edi,xmm4   //restore the parked registers
            movd edx,xmm3
            movd ecx,xmm2
            movd ebx,xmm1
            movd eax,xmm0
            popf
        }

    }
    

1 个答案:

答案 0 :(得分:1)

当然,代码可以简单得多(以 C++ 版本为模型,不包括 push 和 pop 指令,未经测试):

  ; Convert [y] to base [g] 200,000,000 times with 32-bit division.
  ; Digits go to var_3 as dwords, least significant first.
  ; Register roles:
  ;   esi = passes remaining      eax = value still to convert
  ;   ebx = base                  edx = remainder (current digit)
  ;   edi = address of var_3      ecx = digit index
  mov esi,200000000
_next_pass:
  xor ecx,ecx             ; start again at digit 0
  lea edi,var_3
  mov ebx,[g]
  mov eax,[y]
_next_digit:
  xor edx,edx             ; edx:eax = eax, dividend for the 32-bit div
  div ebx                 ; eax = quotient, edx = remainder
  mov [edi+ecx*4],edx     ; var_3[ecx] = digit
  inc ecx
  test eax,eax            ; quotient exhausted?
  jnz _next_digit
  dec esi
  jnz _next_pass

但如果它比 C++ 版本更快,我会感到惊讶;事实上,如果基数是 2 的幂,它几乎肯定会更慢——所有正常的编译器都知道如何把除以 2 的幂的除法和/或取模转换成移位和按位与。


这是一个使用 8 位除法的版本。同样的注意事项也适用,而且现在除法还可能溢出(如果 y / g 超过 255)。

  ; Convert [y] to base [g] 200,000,000 times with 8-bit division.
  ; Digit i is written to the low byte of the dword var_3[i] (least
  ; significant digit first), using the same 4-byte stride as the 32-bit
  ; version; the upper bytes of each element must already be zero.
  ; DIV r/m8 divides ax by bl, so only the low 16 bits of y take part in
  ; the first division. A divide error is raised if bl is 0 or if a
  ; quotient does not fit in al (y / g above 255).
  mov esi,200000000
_bigloop:
  mov eax,[y]
  mov ebx,[g]
  lea edi,var_3
  ; eax = y
  ; ebx = g
  ; edi = var_3
  xor ecx,ecx
  ; ecx = counter
_loop:
  div bl                  ; al = quotient, ah = remainder
  mov [edi+ecx*4],ah      ; low byte of var_3[ecx] = digit
  add ecx,1
  and eax,0xFF            ; keep only the quotient; sets ZF when it is 0
  jnz _loop
  sub esi,1
  jnz _bigloop