Question

有人可以帮助我理解这段汇编代码吗？我是汇编语言的新手，怎么也想不明白…… 以下汇编代码应该对应这个函数：

func(int a) { return a * 34; }

// 后面的注释是我对每条指令含义的理解，如果我错了，请纠正我。

//esp = stack-pointer, ebp = callee saved, eax = return value

pushl %ebp                   // a is pushed on stack
movl %esp,%ebp               // a = stackpointer
movl 8(%ebp),%eax            // eax = M(8 + a).But what is in M(8 + a)?
sall $4,%eax                 // eax << 4
addl 8(%ebp),%eax            // eax = M(8 + a)
addl %eax,%eax               // eax = eax + eax
movl %ebp,%esp               // eax = t
popl %ebp                    // pop a from stack
ret

有人可以解释一下如何解决吗？非常感谢！

Answer 1

pushl %ebp                   // a is pushed on stack
movl %esp,%ebp               // a = stackpointer

如评论中所述，ebp与a无关。 ebp是堆栈基指针-此代码将ebp的旧值保存到堆栈中，然后将堆栈指针保存在ebp中。

movl 8(%ebp),%eax            // eax = M(8 + a).But what is in M(8 + a)?

正确。堆栈中该位置保存的是参数a的输入值，这条指令把它加载到eax中。

sall $4,%eax                 // eax << 4

正确。（然后将结果分配回eax。）

addl 8(%ebp),%eax            // eax = M(8 + a)

不，您误解了这一点。这条指令把堆栈中位于8(%ebp)的值（即a的原始值）加到eax上。加法作用于值，而不是内存地址。

addl %eax,%eax               // eax = eax + eax

正确。在这条指令之后，eax的值不再被修改，因此它就是函数的返回值。

movl %ebp,%esp               // eax = t
popl %ebp                    // pop a from stack
ret

这段代码撤销前两条指令的效果。这是标准的清理序列，与a无关。

此函数的核心部分可以表述为：

a1 = a << 4;   // = a * 16
a2 = a1 + a;   // = a * 17
a3 = a2 + a2;  // = a * 34
return a3;

Answer 2

这是未经优化的笨拙代码，因为您使用-O0进行了编译（快速编译，跳过大多数优化过程）。传统的堆栈帧建立/清理只是噪音。参数位于堆栈中返回地址的正上方，即函数入口处的4(%esp)。（另请参见How to remove "noise" from GCC/clang assembly output?）

令人惊讶的是，编译器使用3条指令通过移位和加法来进行乘法运算，而不是imull $34, 4(%esp), %eax / ret ，除非是针对旧CPU进行调优。在默认调优下，现代gcc和clang的取舍界限是2条指令：超过2条就直接使用imul。例如参见How to multiply a register by 37 using only 2 consecutive leal instructions in x86?

但是这可以使用LEA用2条指令完成（不计算用于复制寄存器的mov）；这段代码很臃肿，因为您没有开启优化就进行了编译。（或者您针对某种旧CPU进行了调优，而在那种CPU上可能有某些理由避免使用LEA。）

我认为您一定是用gcc编译的；其他编译器在禁用优化时，总是直接用imul来乘以非2的幂的常数。但是我在Godbolt编译器资源管理器中找不到能准确生成您这段代码的gcc版本+选项。我没有尝试所有可能的组合。 MSVC 19.10 -O2使用与您的代码相同的算法，包括两次加载a。

使用gcc5.5编译（这是即使在-O0下也不会直接使用imul的最新gcc版本），我们会得到与您的代码类似但不完全相同的结果。（相同的操作以不同的顺序进行，并且没有两次从内存中加载a。）

# gcc5.5 -m32 -xc -O0 -fverbose-asm -Wall
func:
    pushl   %ebp  #
    movl    %esp, %ebp      #,            # make a stack frame

    movl    8(%ebp), %eax   # a, tmp89    # load a from the stack, first arg is at EBP+8

    addl    %eax, %eax      # tmp91          # a*2
    movl    %eax, %edx      # tmp90, tmp92
    sall    $4, %edx        #, tmp92         # a*2 << 4 = a*32
    addl    %edx, %eax      # tmp92, D.1807  # a*2 + a*32

    popl    %ebp    #                     # clean up the stack frame
    ret

在Godbolt编译器资源管理器上使用同一旧版GCC并开启优化进行编译：gcc5.5 -m32 -O3 -fverbose-asm，我们得到：

# gcc5.5 -m32 -O3.   Also clang7.0 -m32 -O3 emits the same code
func:
    movl    4(%esp), %eax   # a, a          # load a from the stack
    movl    %eax, %edx      # a, tmp93      # copy it to edx
    sall    $5, %edx        #, tmp93        # edx = a<<5 = a*32
    leal    (%edx,%eax,2), %eax             # eax = edx + eax*2 = a*32 + a*2 = a*34
    ret              # with a*34 in EAX, the return-value reg in this calling convention

在gcc 6.x或更高版本中，我们得到下面这种高效的asm：带内存源操作数的立即数形式imul在现代Intel CPU上只解码为单个微融合uop，并且整数乘法自Core2起在Intel上、自Ryzen起在AMD上都只有3个周期的延迟。（https://agner.org/optimize/）。

# gcc6/7/8 -m32 -O3     default tuning
func:
    imull   $34, 4(%esp), %eax    #, a, tmp89
    ret

但是使用-mtune=pentium3，我们奇怪地没有获得LEA 。这看起来像是错过的优化。 LEA在Pentium 3 / Pentium-M上具有1个周期的延迟。

# gcc8.2 -O3 -mtune=pentium3 -m32 -xc -fverbose-asm -Wall
func:
    movl    4(%esp), %edx   # a, a
    movl    %edx, %eax      # a, tmp91
    sall    $4, %eax        #, tmp91     # a*16
    addl    %edx, %eax      # a, tmp92   # a*16 + a = a*17
    addl    %eax, %eax      # tmp93      # a*17 * 2 = a*34
    ret

这与您的代码相同，但是使用reg-reg mov而不是从堆栈中重新加载以将a添加到移位结果中。

了解和分析汇编代码

2 个答案: