Question

我为大整数编写了一个很不错的整数库，但目前限制为512位（由于各种原因，它比GMP快）。我正尝试把这个库推广到更大的位宽，因此必须用循环来执行adcq指令。

// Multi-limb addition with carry propagation, limbs in little-endian order.
// The counter starts at the (negative) template value n and is incremented
// until it reaches zero, so the loop can be closed with incq/jnz; a compare
// cannot be used because it would destroy the carry bit that adcq needs.
//
// NOTE(review): the asm text reads %rsi and %rdi directly, but both function
// parameters are unnamed and no asm operand ties them to those registers;
// the only operand is the input n.
template<int n>
void test_add(boost::uint64_t*, boost::uint64_t* ){    
    asm volatile (
        "clc                                     \n"  /* start with carry cleared */
        "movq %0, %%rcx                          \n"  /* rcx = n, the loop counter */
    "loop:                                       \n"
        "movq 8(%%rsi,%%rcx,8), %%rax            \n"  /* original -8(%%rsi,%%rbx,8) */
        "adcq %%rax           , 8(%%rdi,%%rcx,8) \n"  /* original -8(%%rsi,%%rbx,8) */
        "incq %%rcx                              \n"  /* original decq */
    "jnz loop                                    \n"  /* repeat until rcx reaches 0 */
        :   
        :"g"(n)
        :"rax","rcx","cc","memory"
    );  
}


// Test driver for test_add<-4> on two 4-limb numbers.
int main(int argc, char* argv[]) {
boost::uint64_t c[4],d[4];

// c: the three low limbs are all ones, the top limb is zero.
c[0] = -1; 
c[1] = -1; 
c[2] = -1; 
c[3] =  0;  

// d: the value 1.
d[0] = 1;
d[1] = 0;
d[2] = 0;
d[3] = 0;

// Both arguments point at the last limb. With the counter running -4..-1 and
// the +8 displacement used in the asm, the addressed limbs are [0]..[3].
test_add<-4>(&d[3],&c[3]); // <-- BigEndian to LittleEndian

这种方法在调试模式（-O0）下运行良好，但只要我开启优化，就会出现段错误（segfault）。

我真的不明白，因为我遵守了ABI中关于rsi和rdi的约定，声明了被破坏（clobber）的寄存器，也使用了正确的寄存器。于是我分别用GCC的 -O0 -S 和 -O2 -S 进行了编译。

对于-O0 -S我得到了

 3 .globl main
 4         .type   main, @function
 5 main:
 6 .LFB1:
 7         .cfi_startproc
 8         .cfi_personality 0x3,__gxx_personality_v0
 9         pushq   %rbp
 10         .cfi_def_cfa_offset 16
 11         .cfi_offset 6, -16
 12         movq    %rsp, %rbp
 13         .cfi_def_cfa_register 6
 14         subq    $80, %rsp
 15         movl    %edi, -68(%rbp)
 16         movq    %rsi, -80(%rbp)
 17         movq    $-1, -32(%rbp)
 18         movq    $-1, -24(%rbp)
 19         movq    $-1, -16(%rbp)
 20         movq    $0, -8(%rbp)
 21         movq    $1, -64(%rbp)
 22         movq    $0, -56(%rbp)
 23         movq    $0, -48(%rbp)
 24         movq    $0, -40(%rbp)
 25         leaq    -32(%rbp), %rax
 26         leaq    24(%rax), %rdx
 27         leaq    -64(%rbp), %rax
 28         addq    $24, %rax
 29         movq    %rdx, %rsi
 30         movq    %rax, %rdi
 31         call    _Z8test_addILin4EEvPyS0_
 32         movl    $0, %eax
 33         leave
 34         .cfi_def_cfa 7, 8
 35         ret
 36         .cfi_endproc
 37 .LFE1:
 38         .size   main, .-main
 39         .section        .text._Z8test_addILin4EEvPyS0_,"axG",@progbits,_Z8test_addILin4EEvPyS0_,comdat
 40         .weak   _Z8test_addILin4EEvPyS0_
 41         .type   _Z8test_addILin4EEvPyS0_, @function
 42 _Z8test_addILin4EEvPyS0_:
 43 .LFB2:
 44         .cfi_startproc
 45         .cfi_personality 0x3,__gxx_personality_v0
 46         pushq   %rbp
 47         .cfi_def_cfa_offset 16
 48         .cfi_offset 6, -16
 49         movq    %rsp, %rbp
 50         .cfi_def_cfa_register 6
 51         movq    %rdi, -8(%rbp)
 52         movq    %rsi, -16(%rbp)
 53 #APP
 54 # 14 "test.cpp" 1
 55         clc
 56 movq $-4, %rcx
 57 loop:
 58 movq 8(%rsi,%rcx,8), %rax
 59 adcq %rax           , 8(%rdi,%rcx,8)
 60 incq %rcx
 61 jnz loop
 62 
 63 # 0 "" 2
 64 #NO_APP
 65         leave
 66         .cfi_def_cfa 7, 8
 67         ret
 68         .cfi_endproc
 69 .LFE2:
 70         .size   _Z8test_addILin4EEvPyS0_, .-_Z8test_addILin4EEvPyS0_
 71         .ident  "GCC: (GNU) 4.4.6 20120305 (Red Hat 4.4.6-4)"
 72         .section        .note.GNU-stack,"",@progbits

在第20-30行可以看到，编译器整理好栈，并通过rsi和rdi传递参数（第29-30行），然后执行call。完全符合ABI。

现在再来看优化后的版本，我得到的是：

  1         .file   "test.cpp"
  2         .text
  3         .p2align 4,,15
  4 .globl main
  5         .type   main, @function
  6 main:
  7 .LFB1:
  8         .cfi_startproc
  9         .cfi_personality 0x3,__gxx_personality_v0
  10 #APP
  11 # 14 "test.cpp" 1
  12         clc
  13 movq $-4, %rcx
  14 loop:
  15 movq 8(%rsi,%rcx,8), %rax
  16 adcq %rax           , 8(%rdi,%rcx,8)
  17 incq %rcx
  18 jnz loop
  19 
  20 # 0 "" 2
  21 #NO_APP
  22         xorl    %eax, %eax
  23         ret
  24         .cfi_endproc
  25 .LFE1:
  26         .size   main, .-main
  27         .ident  "GCC: (GNU) 4.4.6 20120305 (Red Hat 4.4.6-4)"
  28         .section        .note.GNU-stack,"",@progbits

再见了，ABI，我不明白。堆栈到底是由什么来管理的？

哪位ASM大师有什么想法吗？我不想把这个函数放到单独的文件里，那样有违元编程的精神。

干杯。

-------编辑：

如果我把它放到一个循环中，我发现你的解决方案有一个错误：

#include <boost/cstdint.hpp> //boost type

// Multi-limb add with carry: for counter = n .. -1, adds y[counter] (plus the
// running carry) into x[counter]. incq/jnz closes the loop when the counter
// reaches zero.
//
// NOTE(review): %[counter] is declared as an input-only operand ("r"), yet
// incq modifies it inside the asm body.
template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t const* y) {
    boost::uint64_t dummy;
    boost::uint64_t loop_index(n);
    __asm__ __volatile__ (
        "clc\n\t"                                   // carry = 0 before the first limb
        "1:\n\t"
        "movq (%[y],%[counter],8), %[dummy]\n\t"    // dummy = y[counter]
        "adcq %[dummy], (%[x], %[counter], 8)\n\t"  // x[counter] += dummy + carry
        "incq %[counter]\n\t"
        "jnz 1b\n\t"
        : [dummy] "=&r" (dummy)
        : [x] "r" (x), [y] "r" (y), [counter] "r" (loop_index)
        : "memory", "cc");
 }


// Test driver: calls test_add<-4> 0xfff times on two 4-limb numbers.
int main(int argc, char* argv[]) {
    // Limbs 0..3 are written below, so each array needs four elements
    // (declaring them with size 3 made the [3] stores out-of-bounds writes).
    boost::uint64_t c[4],d[4];

    // c: the three low limbs are all ones, the top limb is zero.
    c[0] = -1;
    c[1] = -1;
    c[2] = -1;
    c[3] =  0;

    // d: the value 1.
    d[0] = 1;
    d[1] = 0;
    d[2] = 0;
    d[3] = 0;

    // One-past-the-end pointers: with n = -4 the callee indexes [-4]..[-1]
    // relative to them, i.e. limbs 0..3 of each array.
    for(int i=0; i < 0xfff; ++i)
        test_add<-4>(&c[4],&d[4]);

    return 0;
}

将提供以下ASM：

      movq    $-4, %rdx <---------------------template parameter
      leaq    -32(%rsp), %rcx
      movq    $-1, -32(%rsp)
      movq    $-1, -24(%rsp)
      movq    $-1, -16(%rsp)
      movq    $0, -8(%rsp)
      movq    $1, -64(%rsp)
      movq    $0, -56(%rsp)
      movq    $0, -48(%rsp)
      movq    $0, -40(%rsp)
      .p2align 4,,10
      .p2align 3
  .L2: <-------- OUTER loop
#APP
# 16 "main.cpp" 1
       clc
       1: <-------- INNER loop
       movq (%rcx,%rdx,8), %rsi
       adcq %rsi, (%rsp, %rdx, 8)
       incq %rdx <------------ rdx++ -> (-4)++ (for the 2nd iteration of L2 it is not reset to -4)
       jnz 1b

 # 0 "" 2
 #NO_APP
       addl    $1, %eax
       cmpl    $4095, %eax <----- test second loop
       jne     .L2

在外层循环（.L2）的第二次迭代中，rdx没有被重新设置为-4，因此movq指令会读取错误的地址，导致段错误。我用了一个很糟糕的补丁（手动重置为-4）：只是在jnz之后添加了 "movq $-4, %[counter]\n\t"，但我需要更通用的办法。是否存在某种约束，可以把计数器重置为模板参数的值？

目前的修正是：

// Multi-limb add with carry: for counter = n .. -1, adds y[counter] (plus the
// running carry) into x[counter].
//
// n must be negative; x and y must point one past the last limb so that the
// negative indices address the limbs themselves.
template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t const* y) {
    boost::uint64_t dummy;
    // The asm increments the counter, so it must be a read-write operand on a
    // local copy. An input-only operand must not be modified: the compiler
    // assumes its register still holds the original value afterwards. Being
    // re-initialised from n on every call also removes the need for a manual
    // "movq" reset inside the asm.
    long counter = n;
    __asm__ __volatile__ (
        "clc\n\t"                                   // carry = 0 before the first limb
        "1:\n\t"
        "movq (%[y],%[counter],8), %[dummy]\n\t"    // dummy = y[counter]
        "adcq %[dummy], (%[x], %[counter], 8)\n\t"  // x[counter] += dummy + carry
        "incq %[counter]\n\t"                       // inc leaves the carry flag alone
        "jnz 1b\n\t"
        // "+&r": read-write and early-clobber, so counter never shares a
        // register with the x or y inputs.
        : [dummy] "=&r" (dummy), [counter] "+&r" (counter)
        : [x] "r" (x), [y] "r" (y)
        : "memory", "cc");
}

Answer 1

您应该使用约束来访问参数。对于内部函数，gcc不需要遵循ABI，即使这样做，也不需要在执行asm块时保持初始状态不变。当然，内联asm的要点是让编译器内联它，然后甚至不会发生函数调用。（许多人错误地认为内联意味着“嵌入在C源文件中”并将其用作便利功能，即使不需要实际的代码内联也是如此。）

gcc也能够把内容放进你想要的寄存器（不过在这里你其实并不在意计数器是否用rcx）。尽可能多地把工作留给编译器也是个好主意，这样它就可以进行寄存器分配、循环展开和其他优化。遗憾的是，我没能让gcc自己生成ADC指令，所以这次还是保留了asm块。由于inc只更新部分标志位，一般不建议使用它，但我目前没有看到明显的替代办法。

最后，如果您传入d[3]的地址，那么实际访问到的将是d[-1]到d[2]这几个元素，这不是您想要的。您应该传入d[4]的地址。

修正后的版本如下（使用了命名操作数）：

// Multi-limb add with carry: for counter = n .. -1, adds y[counter] (plus the
// running carry) into x[counter]. All registers are chosen by the compiler
// through operand constraints instead of being hard-coded.
template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t* y) {
    boost::uint64_t dummy, dummy2;
    __asm__ __volatile__ (
        "clc\n\t"                                    // carry = 0 before the first limb
        "1:\n\t"
        "movq (%[y], %[counter], 8), %[dummy]\n\t"   // dummy = y[counter]
        "adcq %[dummy], (%[x], %[counter], 8)\n\t"   // x[counter] += dummy + carry
        "incq %[counter]\n\t"
        "jnz 1b\n\t"
        // dummy is an early-clobber scratch register; dummy2 (output operand 1)
        // exists only so that the counter register is declared as written.
        : [dummy] "=&r" (dummy), "=r" (dummy2)
        // The matching constraint "1" puts n in the same register as output
        // operand 1, which makes the counter a read-write operand.
        : [x] "r" (x), [y] "r" (y), [counter] "1" (n)
        : "memory", "cc");
}

请注意，dummy变量将被优化掉，同时允许gcc选择合适的寄存器，而不是强制它使用特定的寄存器。

更新：这是一个纯C++版本，编译器可以将其完全展开并优化（包括在编译时计算结果！）。虽然在一般情况下编译器生成的代码不如手写代码高效，但上述优化在某些情况下可能会让它表现得更好。注意：由于您使用的是gcc内联asm，您的代码本来就已经是gcc和x86-64专用的，因此使用__uint128_t并不会带来进一步的限制（实际上，它可以在gcc支持128位整数的任何架构上工作）。

// Portable multi-limb addition: for idx = n .. -1, adds y[idx] into x[idx]
// and carries into the next limb. n is expected to be negative; for n >= 0
// the loop body never runs.
template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t* y) {
    // 128-bit accumulator: the low 64 bits hold the limb sum, the bits above
    // hold the carry into the next limb.
    __uint128_t acc = 0;
    long idx = n;
    while (idx < 0) {
        acc += x[idx];
        acc += y[idx];
        x[idx] = static_cast<boost::uint64_t>(acc);  // keep the low 64 bits
        acc >>= 64;                                  // what remains is the carry
        ++idx;
    }
}

内联ASM块函数与x86-64 ABI

1 个答案: