Question

上下文

Linux 64bit。 GCC 4.8.2。

GAS（GNU 汇编器），AT&T 语法。

代码：

  /* Snippet under discussion: two extended-asm statements that add integers. */
  int operand1, operand2, sum, accumulator;

  operand1 = 10; operand2 = 15;

  /* Operand numbering: %0 = sum (output), %1 = operand1, %2 = operand2.
     The template copies %1 into %0 and then adds %2 to %0.
     NOTE(review): %0 is written before %2 is read; without an early-clobber
     ("=&r") the compiler may presumably place %0 and %2 in the same
     register -- confirm.
     NOTE(review): the clobber list holds "0", which is not a register name;
     clobber entries are expected to be register names (or "cc"/"memory") --
     confirm this is intended. */
  __asm__ volatile ("movl %1, %0\n\t"
           "addl %2, %0"
     : "=r" (sum)     /* output operands */
     : "r" (operand1), "r" (operand2) /* input operands */
     : "0");        /* clobbered operands */

  accumulator = sum;

  /* Operand numbering: %0 = accumulator (output), %1 = accumulator (input,
     tied to %0 by the "0" matching constraint), %2 = operand1,
     %3 = operand2.
     NOTE(review): the template references %1 and %2 only, so it adds
     accumulator to itself and then adds operand1; operand2 (%3) is never
     referenced -- confirm whether %2/%3 were intended. */
  __asm__ volatile ("addl %1, %0\n\t"
     "addl %2, %0"
     : "=r" (accumulator)
     : "0" (accumulator), "r" (operand1), "r" (operand2)
     : "0");

当然，编译时没有开启优化。

我使用valgrind --tool=cachegrind ./my_bin

进行了实验

实际上，如果我更换

"0" (accumulator), "r" (operand1), "r" (operand2)

用

"0" (accumulator), "m" (operand1), "m" (operand2)

我少执行了一条指令 == 节省了一个 CPU 周期，因为没有寄存器操作

现在，替换

"0" (accumulator), "r" (operand1), "r" (operand2)

使用

"r" (accumulator), "r" (operand1), "r" (operand2)

我同样节省了 1 个 CPU 周期。

所以

"r" (accumulator), "m" (operand1), "m" (operand2)

可以节省 2 个 CPU 周期。

问题

1）如果寄存器会拖慢速度，我们为什么还要使用寄存器？真的存在被覆盖的风险吗？

2）为什么使用 "0" 而不是 "r" 会变慢？这对我来说不合逻辑，因为我们引用的只是同一个值（即 accumulator）。GCC 不应该输出不同的代码！"r" 可能意味着选择另一个寄存器 -> 毫无意义 && 更慢。

Answer 1

我不打算把这写成一篇 asm 教程；我认为对比开启优化和不开启优化时生成的代码可能更有帮助。我使用的是 OSX，它与 x86-64 Linux 基本相同。

首先：您要计算的是 sum <- op1 + op2，

后跟：acc <- sum; acc <- acc + op1 + op2，
我们可以替换为：acc <- sum + op1 + op2;不需要：acc = sum;
（顺便说一下，op1、op2 分别是 %2、%3，而 %1 是 %0 的"别名"）

这仍然不是内联汇编的一种特别高效的用法，只是为了把代码整理成一个可以检查的例子：

/*
 * Demonstration routine for GCC extended asm operand constraints.
 *
 * Computes sum = op1 + op2 with one inline-asm statement, then
 * acc = sum + op1 + op2 with a second one, and returns acc.
 * With op1 = 10 and op2 = 15 that is 25 and then 50.
 */
int test_fn (void)
{
    int op1 = 10, op2 = 15, sum, acc;

    /* Operands: %0 = sum (output), %1 = op1, %2 = op2.
       Copies %1 into %0, then adds %2 to %0.
       "=&r" marks the output as early-clobber: it is written before all
       inputs have been read, so it should not share a register with an
       input (see the GCC constraint-modifier documentation).
       The "k" operand modifier asks for the 32-bit register name. */
    __asm__ ("movl %k1, %k0\n\taddl %k2, %k0"
             : "=&r" (sum) : "r" (op1), "r" (op2));

    /* Operands: %0 = acc (output), %1 = sum (input tied to %0 by the "0"
       matching constraint), %2 = op1, %3 = op2.
       Because %1 and %0 are the same register, the template only needs to
       add %2 and %3 to %0. */
    __asm__ ("addl %k2, %k0\n\taddl %k3, %k0"
             : "=r" (acc) : "0" (sum), "r" (op1), "r" (op2));

    return acc;
}

不开启优化：gcc -Wall -c -S src.c（注释是我加的）

        pushq   %rbp
        movq    %rsp, %rbp

        movl    $10, -4(%rbp)   # store 10 -> mem (op1)
        movl    $15, -8(%rbp)   # store 15 -> mem (op2)
# asm(1)
        movl    -4(%rbp), %edx  # load op1 -> reg (%1)
        movl    -8(%rbp), %ecx  # load op2 -> reg (%2)
        movl %edx, %eax         # mov %1 to %0
        addl %ecx, %eax         # add %2 to %0
        movl    %eax, -12(%rbp) # store %0 -> mem (sum)
# asm(2)
        movl    -12(%rbp), %eax # load sum -> reg (%1 = %0)
        movl    -4(%rbp), %edx  # load op1 -> reg (%2)
        movl    -8(%rbp), %ecx  # load op2 -> reg (%3)
        addl %edx, %eax         # add %2 to %0
        addl %ecx, %eax         # add %3 to %0
        movl    %eax, -16(%rbp) # store %0 -> mem (acc)

        movl    -16(%rbp), %eax # load acc -> return value.
        popq    %rbp
        ret

编译器没有尝试把中间结果保留在寄存器中。它只是把它们存回栈上的临时内存，并在需要时重新加载。不过这样的代码很容易跟踪理解。

我们将您的更改应用于asm（2）输入："0" (sum), "m" (op1), "m" (op2)

        ...
# asm(2)
        movl    -4(%rbp), %eax  # load sum -> reg (%1 = %0)
        addl -12(%rbp), %eax    # add op1 (mem) to %0
        addl -16(%rbp), %eax    # add op2 (mem) to %0
        movl    %eax, -8(%rbp)  # store %0 -> mem (acc)
        ...

内存位置略有不同，但这并不重要。add 指令存在 reg <- reg + mem 的形式，这意味着我们不需要先把操作数加载到寄存器。它确实节省了一条指令，但我们仍然在读写内存。

通过优化：gcc -Wall -O2 -c -S src.c

        movl    $10, %edx
        movl    $15, %ecx
# asm(1)
        movl %edx, %eax
        addl %ecx, %eax
# asm(2)
        addl %edx, %eax
        addl %ecx, %eax

        ret

没有内存访问。一切都在寄存器中完成。这已经是能达到的最快速度了：没有缓存访问，没有主内存访问等。如果我们像在未优化的情况下那样，改用 "m" 约束：

        movl    $10, -8(%rsp)
        movl    $15, %ecx
        movl    $10, %edx
        movl    $15, -4(%rsp)
# asm(1)
        movl %edx, %eax
        addl %ecx, %eax
# asm(2)
        addl -8(%rsp), %eax
        addl -4(%rsp), %eax

        ret

我们又回到了强制使用内存的情况：毫无必要地为 asm（2）存储并加载操作数。并不是说 valgrind 错了——错的只是"使用寄存器导致变慢"这一推断。

扩展的asm - 注册约束表现奇怪吗？

上下文

问题

1 个答案: