使用 %r8、%r9 等寄存器时,有哪些需要注意的特性?

时间:2015-05-28 20:10:02

标签: assembly att

我想用汇编语言重写下面这段 C 函数代码:

int digits_amount = digits_num / DIGITS_PER_ITERATION + 1;
const long unsigned int array_length = DIGITS_PER_ITERATION*10*(digits_amount) / 3;
long unsigned int remainders[array_length];  
long unsigned int answ[digits_amount];
int i = 0; 
for (; i < array_length; i++)
    remainders[i] = ARRINIT;
for (i = 0; i < digits_amount; i++) {
    int j = array_length-1;
    long unsigned int quotient = 0;
    long unsigned int sum = 0;
    for (; j >= 0; j--) {
        remainders[j] *= SCALE;
        sum = remainders[j]+quotient*(j+1);
        quotient = sum / (2*j+1);
        remainders[j] = sum - quotient*(2*j+1);
    }
    quotient = sum / SCALE;
    remainders[0] = sum - quotient*SCALE;
    answ[i] = quotient;
} //end of algo, then output

其中 SCALE = 1000000000,DIGITS_PER_ITERATION = 9,ARRINIT = 200000000,OUTPUT_FORMAT = "%.9lu"。于是我尝试用内联汇编来实现:

    asm (
    ".L1:\n\t"
        "movq $200000000, (%%rax, %%rcx, 8)\n\t"
        "incq %%rcx\n\t"
    "cmpq %%rdx, %%rcx\n\t"
    "jb .L1\n\t"
    "xorq %%rcx, %%rcx\n\t"
    ".L2:\n\t"
        "pushq %%rbx\n\t"/*for free usage of rbx register*/
        "movq %%rdx, %%r8\n\t"/*j*/
        "decq %%r8\n\t"
        "movq $0, %%r9\n\t"/*quotient*/
        "movq $0, %%r10\n\t"/*sum*/
        ".L3:\n\t" /*inner cycle*/
            "movq (%%rax, %%r8, 8), %%r11\n\t"/*remainders[j]*/
            "pushq %%rax\n\t"
            "movq $1000000000, %%rax\n\t"
            "mulq %%r11\n\t"/*remainders[j]*SCALE*/
            "movq %%rax, %%r11\n\t"
            "movq %%r8, %%rax\n\t"
            "incq %%rax\n\t"
            "mulq %%r9\n\t" /*quotient*(j+1)*/
            "addq %%rax, %%r10\n\t"
            "addq %%r11, %%r10\n\t" /*sum = remainders[j]+quotient*(j+1)*/
            "movq %%r10, %%rax\n\t"
            "movq %%r8, %%rbx\n\t"
            "addq %%rbx, %%rbx\n\t"
            "incq %%rbx\n\t"
            "pushq %%rdx\n\t"
            "xorq %%rdx, %%rdx\n\t"
            "divq %%rbx\n\t" /*sum / (2*j+1)
            "movq %%rax, %%r9\n\t" /*quotient*/
            "movq %%rdx, %%rbx\n\t"/*remainder now in %rbx*/
            "popq %%rdx\n\t"
            "popq %%rax\n\t"
            "movq %%rbx, (%%rax, %%r8, 8)\n\t" /*remainders[j] = remainder*/
            "decq %%r8\n\t"
        "cmpq $0, %%r8\n\t"
        "jge .L3\n\t"
    "movq $1000000000, %%rbx\n\t"
    "pushq %%rdx\n\t"
    "pushq %%rax\n\t"
    "movq %%r10, %%rax\n\t"
    "xorq %%rdx, %%rdx\n\t"
    "divq %%rbx\n\t"
    "movq %%rax, %%r9\n\t"
    "popq %%rax\n\t"
    "movq %%rdx, (%%rax)\n\t"
    "popq %%rdx\n\t"
    "popq %%rbx\n\t"/*get the address of answ array*/
    "movq %%r12, (%%rbx, %%rcx, 8)\n\t"  /*answ[i] = */ 
    "incq %%rcx\n\t" 
    "cmpq %%rdi, %%rcx\n\t"
    "jb .L2\n\t"
    :"=c"(i)
    :"a"(remainders),"c"(i), "d"(array_length), "D"(digits_amount), "b"(answ)/* inputs */
    :/* clobbered */
);

在生成的代码中,它看起来像这样:

# Compiler output for the inline-asm statement, annotated.
# On entry (per the asm operand constraints): rax = remainders, rbx = answ,
# rcx = i, rdx = array_length, rdi = digits_amount.
.L1:
movq $200000000, (%rax, %rcx, 8)   # store the initial value into the 8-byte slot rax[rcx]
incq %rcx
cmpq %rdx, %rcx
jb .L1                             # unsigned compare: repeat while rcx < rdx
xorq %rcx, %rcx                    # rcx = 0, reused as the outer loop index
.L2:
pushq %rbx                         # save rbx (answ) so rbx can serve as scratch below
movq %rdx, %r8
decq %r8                           # r8 = rdx - 1 (inner index j)
movq $0, %r9                       # r9 = quotient = 0
movq $0, %r10                      # r10 = sum = 0 (only zeroed here, once per outer pass)
.L3:
movq 0(%rax, %r8, 8), %r11         # r11 = remainders[j]
pushq %rax                         # save the array base; rax is needed by mulq/divq
movq $1000000000, %rax
mulq %r11                          # rdx:rax = rax * r11
                                   # NOTE(review): this overwrites rdx, which .L2 reads as the loop bound
movq %rax, %r11                    # r11 = low 64 bits of remainders[j]*SCALE
movq %r8, %rax
incq %rax                          # rax = j + 1
mulq %r9                           # rdx:rax = (j+1) * quotient; rdx overwritten again
addq %rax, %r10                    # NOTE(review): adds to r10 instead of replacing it, so the
addq %r11, %r10                    # value from the previous .L3 iteration is carried into this sum
movq %r10, %rax
movq %r8, %rbx
addq %rbx, %rbx
incq %rbx                          # rbx = 2*j + 1
pushq %rdx                         # saves rdx, but only after both mulq instructions wrote it
xorq %rdx, %rdx                    # clear the high half of the dividend rdx:rax
divq %rbx                          # rax = quotient, rdx = remainder of rdx:rax / rbx
movq %rax, %r9                     # r9 = new quotient
movq %rdx, %rbx                    # rbx = remainder
popq %rdx
popq %rax                          # restore the array base
movq %rbx, 0(%rax, %r8, 8)         # remainders[j] = remainder
decq %r8
cmpq $0, %r8
jge .L3                            # signed compare: repeat while j >= 0
movq $1000000000, %rbx
pushq %rdx
pushq %rax
movq %r10, %rax
xorq %rdx, %rdx
divq %rbx                          # rax = r10 / 1000000000, rdx = r10 % 1000000000
movq %rax, %r9                     # r9 = final quotient for this outer pass
popq %rax
movq %rdx, (%rax)                  # remainders[0] = remainder
popq %rdx
popq %rbx                          # restore rbx (pushed at .L2)
movq %r12, (%rbx, %rcx, 8)         # NOTE(review): stores r12, which nothing in this listing writes;
                                   # the quotient computed above is in r9
incq %rcx
cmpq %rdi, %rcx
jb .L2                             # unsigned compare: repeat while rcx < rdi

结果是,计算出的答案不匹配。我检查了内层循环中的值,发现第一个值是匹配的,但其余的值都不匹配。我认为问题可能出在 mul 或 div 上,但我找不到具体位置。div 除了 %rdx 和 %rax 之外,还会修改别的寄存器吗?%r8、%r9 等寄存器有没有可能在我不知情的情况下被改变?也许是我在某个地方弄错了。像这样使用这么多寄存器是个好做法吗?我认为这应该比使用内存更快,所以才尝试这样做。

0 个答案:

没有答案