我想用汇编语言重写下面这段 C 代码:
int digits_amount = digits_num / DIGITS_PER_ITERATION + 1;
const long unsigned int array_length = DIGITS_PER_ITERATION*10*(digits_amount) / 3;
long unsigned int remainders[array_length];
long unsigned int answ[digits_amount];
int i = 0;
for (; i < array_length; i++)
remainders[i] = ARRINIT;
for (i = 0; i < digits_amount; i++) {
int j = array_length-1;
long unsigned int quotient = 0;
long unsigned int sum = 0;
for (; j >= 0; j--) {
remainders[j] *= SCALE;
sum = remainders[j]+quotient*(j+1);
quotient = sum / (2*j+1);
remainders[j] = sum - quotient*(2*j+1);
}
quotient = sum / SCALE;
remainders[0] = sum - quotient*SCALE;
answ[i] = quotient;
} //end of algo, then output
其中 SCALE = 1000000000,DIGITS_PER_ITERATION = 9,ARRINIT = 200000000,OUTPUT_FORMAT = "%.9lu"。于是我尝试用内联汇编来实现:
asm (
".L1:\n\t"
"movq $200000000, (%%rax, %%rcx, 8)\n\t"
"incq %%rcx\n\t"
"cmpq %%rdx, %%rcx\n\t"
"jb .L1\n\t"
"xorq %%rcx, %%rcx\n\t"
".L2:\n\t"
"pushq %%rbx\n\t"/*for free usage of rbx register*/
"movq %%rdx, %%r8\n\t"/*j*/
"decq %%r8\n\t"
"movq $0, %%r9\n\t"/*quotient*/
"movq $0, %%r10\n\t"/*sum*/
".L3:\n\t" /*inner cycle*/
"movq (%%rax, %%r8, 8), %%r11\n\t"/*remainders[j]*/
"pushq %%rax\n\t"
"movq $1000000000, %%rax\n\t"
"mulq %%r11\n\t"/*remainders[j]*SCALE*/
"movq %%rax, %%r11\n\t"
"movq %%r8, %%rax\n\t"
"incq %%rax\n\t"
"mulq %%r9\n\t" /*quotient*(j+1)*/
"addq %%rax, %%r10\n\t"
"addq %%r11, %%r10\n\t" /*sum = remainders[j]+quotient*(j+1)*/
"movq %%r10, %%rax\n\t"
"movq %%r8, %%rbx\n\t"
"addq %%rbx, %%rbx\n\t"
"incq %%rbx\n\t"
"pushq %%rdx\n\t"
"xorq %%rdx, %%rdx\n\t"
"divq %%rbx\n\t" /*sum / (2*j+1)
"movq %%rax, %%r9\n\t" /*quotient*/
"movq %%rdx, %%rbx\n\t"/*remainder now in %rbx*/
"popq %%rdx\n\t"
"popq %%rax\n\t"
"movq %%rbx, (%%rax, %%r8, 8)\n\t" /*remainders[j] = remainder*/
"decq %%r8\n\t"
"cmpq $0, %%r8\n\t"
"jge .L3\n\t"
"movq $1000000000, %%rbx\n\t"
"pushq %%rdx\n\t"
"pushq %%rax\n\t"
"movq %%r10, %%rax\n\t"
"xorq %%rdx, %%rdx\n\t"
"divq %%rbx\n\t"
"movq %%rax, %%r9\n\t"
"popq %%rax\n\t"
"movq %%rdx, (%%rax)\n\t"
"popq %%rdx\n\t"
"popq %%rbx\n\t"/*get the address of answ array*/
"movq %%r12, (%%rbx, %%rcx, 8)\n\t" /*answ[i] = */
"incq %%rcx\n\t"
"cmpq %%rdi, %%rcx\n\t"
"jb .L2\n\t"
:"=c"(i)
:"a"(remainders),"c"(i), "d"(array_length), "D"(digits_amount), "b"(answ)/* inputs */
:/* clobbered */
);
在生成的代码中,它看起来像这样:
# Assembly emitted for the inline asm statement. Register roles follow the
# asm constraints: rax = remainders, rcx = i, rdx = array_length,
# rdi = digits_amount, rbx = answ.
.L1:
# Fill loop: store the initial value into remainders[rcx] until rcx reaches rdx.
movq $200000000, (%rax, %rcx, 8)
incq %rcx
cmpq %rdx, %rcx
jb .L1
xorq %rcx, %rcx # i = 0
.L2:
# Outer loop body, one pass per answ[] entry.
pushq %rbx # save the answ pointer; rbx is reused as scratch below
movq %rdx, %r8 # j = array_length
decq %r8 # j = array_length - 1
movq $0, %r9 # quotient = 0
movq $0, %r10 # sum = 0
.L3:
# Inner loop body, j counts down to 0.
movq 0(%rax, %r8, 8), %r11 # r11 = remainders[j]
pushq %rax # save the remainders pointer; rax is needed for mul/div
movq $1000000000, %rax
# NOTE(review): one-operand mulq writes rdx:rax, so rdx (array_length) is
# overwritten here, before the pushq %rdx further down saves it.
mulq %r11 # rdx:rax = remainders[j] * 1000000000
movq %rax, %r11 # keep the low 64 bits of the product
movq %r8, %rax
incq %rax # rax = j + 1
mulq %r9 # rdx:rax = quotient * (j + 1)
# NOTE(review): r10 is only zeroed before .L3, so this adds onto the sum of
# the previous inner iteration rather than starting a fresh sum.
addq %rax, %r10
addq %r11, %r10
movq %r10, %rax # dividend low half = sum
movq %r8, %rbx
addq %rbx, %rbx
incq %rbx # rbx = 2*j + 1
pushq %rdx # saves the current rdx (already modified by mulq)
xorq %rdx, %rdx # dividend high half = 0
divq %rbx # rax = quotient, rdx = remainder
movq %rax, %r9 # quotient
movq %rdx, %rbx # remainder
popq %rdx
popq %rax # restore the remainders pointer
movq %rbx, 0(%rax, %r8, 8) # remainders[j] = remainder
decq %r8
cmpq $0, %r8
jge .L3 # loop while j >= 0 (signed compare)
# After the inner loop: divide the last sum by 1000000000.
movq $1000000000, %rbx
pushq %rdx
pushq %rax
movq %r10, %rax
xorq %rdx, %rdx
divq %rbx # rax = sum / 1000000000, rdx = sum % 1000000000
movq %rax, %r9 # quotient
popq %rax # restore the remainders pointer
movq %rdx, (%rax) # remainders[0] = remainder
popq %rdx
popq %rbx # restore the answ pointer pushed at .L2
# NOTE(review): r12 is never written anywhere in this listing; the quotient
# computed just above is in r9.
movq %r12, (%rbx, %rcx, 8) # store into answ[i]
incq %rcx
cmpq %rdi, %rcx
jb .L2 # loop while i < digits_amount (unsigned compare)
结果,两者的答案不一致。我检查了内层循环中的值,发现第一个值是匹配的,但其余的值都不匹配。我认为问题可能出在 mul 或 div 上,但我找不到具体原因。div 除了 %rdx 和 %rax 之外,还会改变其他寄存器吗?%r8、%r9 等寄存器有没有可能在我不知情的情况下被修改?也可能是我在某个地方写错了。另外,使用这么多寄存器是个好做法吗?我这样做是因为它应该比使用内存更快。