为什么简单使用ostringstream会产生如此多的汇编代码?

时间:2018-01-28 23:58:25

标签: c++ formatting clang++ ostream code-size

考虑下面这个简单示例:它使用ostringstream格式化一个字符串和一个整数,然后丢弃输出:

#include <sstream>

// Minimal example: streams a string literal and an int into an in-memory
// std::ostringstream, then calls str() and discards the returned string.
void ostringstream_test() {
  std::ostringstream ss;
  ss << "x = " << 42;
  ss.str();  // return value is intentionally unused
}

使用clang++ -S -O3 -DNDEBUG -std=c++14 test.cc进行编译会产生大量的汇编代码(约半千字节的x86-64指令,而类似的sprintf代码还不到一百字节)——请参阅下面的输出。为什么会生成这么多代码?这是ostringstream API固有的问题,还是这个特定的编译器/库做得不好?

    ## ---------------------------------------------------------------------
    ## void ostringstream_test()      (mangled: __Z18ostringstream_testv)
    ## Syntax: AT&T (src, dst), compiler-emitted.
    ## Frame:  rbp-based; rbx and r12-r15 are pushed, then 328 bytes are
    ##         reserved, so locals span rbp-368 .. rbp-41.
    ## Register roles (live across all calls, hence callee-saved regs):
    ##   r12 = rbp-360   `this` for the streambuf/stringbuf calls
    ##   r14 = rbp-256   `this` for ios_base::init and ~basic_ios
    ##   r13, rbx, r15   vtable-derived pointers that get stored at
    ##                   rbp-368, rbp-256 and rbp-360 respectively
    ##   rbp-368         address passed to the ostream calls
    ##   rbp-48          spill slot for rax on the landing-pad paths
    ## The Ltmp* labels bracket calls and are referenced by the call-site
    ## table at Lexception0.
    ## ---------------------------------------------------------------------
    .globl  __Z18ostringstream_testv
    .p2align    4, 0x90
__Z18ostringstream_testv:               ## @_Z18ostringstream_testv
Lfunc_begin0:
    .cfi_startproc
    .cfi_personality 155, ___gxx_personality_v0
    .cfi_lsda 16, Lexception0
## BB#0:
    pushq   %rbp
Lcfi0:
    .cfi_def_cfa_offset 16
Lcfi1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Lcfi2:
    .cfi_def_cfa_register %rbp
    pushq   %r15
    pushq   %r14
    pushq   %r13
    pushq   %r12
    pushq   %rbx
    subq    $328, %rsp              ## imm = 0x148
Lcfi3:
    .cfi_offset %rbx, -56
Lcfi4:
    .cfi_offset %r12, -48
Lcfi5:
    .cfi_offset %r13, -40
Lcfi6:
    .cfi_offset %r14, -32
Lcfi7:
    .cfi_offset %r15, -24
    leaq    -256(%rbp), %r14
    leaq    -360(%rbp), %r12
    ## Store pointers into the construction vtable (_ZTC...: basic_ostream<char>
    ## in basic_ostringstream<char>, offset 0) at rbp-368 and rbp-256.
    movq    __ZTCNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE0_NS_13basic_ostreamIcS2_EE@GOTPCREL(%rip), %rax
    leaq    24(%rax), %rcx
    movq    %rcx, -368(%rbp)
    addq    $64, %rax
    movq    %rax, -256(%rbp)
Ltmp0:
    ## std::__1::ios_base::init(void*): rdi = r14 (this), rsi = r12.
    movq    %r14, %rdi
    movq    %r12, %rsi
    callq   __ZNSt3__18ios_base4initEPv
Ltmp1:
## BB#1:
    ## Two fields at offsets 136 and 144 from r14 are set to 0 and -1.
    ## NOTE(review): presumably basic_ios members; not identifiable from this listing.
    movq    $0, -120(%rbp)
    movl    $-1, -112(%rbp)
    ## Replace the construction-vtable pointers with ones derived from the
    ## vtable of basic_ostringstream<char> (_ZTV...); kept in r13 and rbx
    ## so the destructor paths can store them again.
    movq    __ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %rbx
    leaq    24(%rbx), %r13
    movq    %r13, -368(%rbp)
    addq    $64, %rbx
    movq    %rbx, -256(%rbp)
Ltmp3:
    ## std::__1::basic_streambuf<char>::basic_streambuf() (base-object ctor), this = r12.
    movq    %r12, %rdi
    callq   __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEEC2Ev
Ltmp4:
## BB#2:
    ## Point the object at r12 to the basic_stringbuf<char> vtable (+16) and
    ## initialise fields at rbp-296 .. rbp-264 (four zero qwords, then 16).
    ## NOTE(review): the 16 is presumably the stream's open mode; confirm against libc++.
    movq    __ZTVNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %r15
    addq    $16, %r15
    movq    %r15, -360(%rbp)
    movq    $0, -272(%rbp)
    movq    $0, -280(%rbp)
    movq    $0, -288(%rbp)
    movq    $0, -296(%rbp)
    movl    $16, -264(%rbp)
    ## Zero 24 bytes at rbp-80: the basic_string temporary passed below.
    xorps   %xmm0, %xmm0
    movaps  %xmm0, -80(%rbp)
    movq    $0, -64(%rbp)
Ltmp6:
    ## basic_stringbuf<char>::str(const basic_string&): rdi = r12, rsi = &temporary.
    leaq    -80(%rbp), %rsi
    movq    %r12, %rdi
    callq   __ZNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strERKNS_12basic_stringIcS2_S4_EE
Ltmp7:
## BB#3:
    ## Destroy the temporary: if bit 0 of its first byte is set, pass the
    ## pointer at rbp-64 to operator delete (__ZdlPv).
    ## NOTE(review): presumably libc++'s long-string flag and heap pointer.
    testb   $1, -80(%rbp)
    je  LBB0_5
## BB#4:
    movq    -64(%rbp), %rdi
    callq   __ZdlPv
LBB0_5:
Ltmp9:
    ## ss << "x = ": std::__1::__put_character_sequence(ostream&, const char*, size_t)
    ## with rdi = rbp-368, rsi = L_.str, rdx = 4.
    leaq    L_.str(%rip), %rsi
    leaq    -368(%rbp), %rdi
    movl    $4, %edx
    callq   __ZNSt3__124__put_character_sequenceIcNS_11char_traitsIcEEEERNS_13basic_ostreamIT_T0_EES7_PKS4_m
Ltmp10:
## BB#6:
Ltmp11:
    ## << 42: basic_ostream<char>::operator<<(int) on the stream returned in rax.
    movl    $42, %esi
    movq    %rax, %rdi
    callq   __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEElsEi
Ltmp12:
## BB#7:
Ltmp13:
    ## ss.str(): basic_stringbuf<char>::str() const; rdi = rbp-104 receives the
    ## returned string, rsi = r12.
    leaq    -104(%rbp), %rdi
    movq    %r12, %rsi
    callq   __ZNKSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strEv
Ltmp14:
## BB#8:
    ## Discard the returned string (same bit-0 test, pointer at rbp-88).
    testb   $1, -104(%rbp)
    je  LBB0_10
## BB#9:
    movq    -88(%rbp), %rdi
    callq   __ZdlPv
LBB0_10:
    ## Normal-path destruction: store the three vtable pointers again, free the
    ## buffer at rbp-280 if bit 0 of the byte at rbp-296 is set, then run the
    ## base-object destructors for basic_streambuf, basic_ostream and basic_ios.
    movq    %r13, -368(%rbp)
    movq    %rbx, -256(%rbp)
    movq    %r15, -360(%rbp)
    testb   $1, -296(%rbp)
    je  LBB0_12
## BB#11:
    movq    -280(%rbp), %rdi
    callq   __ZdlPv
LBB0_12:
    movq    %r12, %rdi
    callq   __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev
    ## ~basic_ostream (base-object dtor) takes a pointer into the VTT (_ZTT... + 8) in rsi.
    movq    __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %rsi
    addq    $8, %rsi
    leaq    -368(%rbp), %rdi
    callq   __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev
    movq    %r14, %rdi
    callq   __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev
    addq    $328, %rsp              ## imm = 0x148
    popq    %rbx
    popq    %r12
    popq    %r13
    popq    %r14
    popq    %r15
    popq    %rbp
    retq
    ## ---- Landing pads (cleanup paths) ------------------------------------
    ## Each pad spills rax to rbp-48, joins the destructor chain at the point
    ## matching how far construction got, and ends in __Unwind_Resume with
    ## the spilled value as its argument.
LBB0_13:
Ltmp8:
    ## Pad for call site 3 (stringbuf::str(const string&)): also frees the temporary.
    movq    %rax, -48(%rbp)         ## 8-byte Spill
    testb   $1, -80(%rbp)
    je  LBB0_18
## BB#14:
    movq    -64(%rbp), %rdi
    callq   __ZdlPv
    testb   $1, -296(%rbp)
    jne LBB0_19
    jmp LBB0_20
LBB0_16:
Ltmp5:
    ## Pad for call site 2 (basic_streambuf ctor): skips the streambuf destructor.
    movq    %rax, -48(%rbp)         ## 8-byte Spill
    jmp LBB0_21
LBB0_15:
Ltmp2:
    ## Pad for call site 1 (ios_base::init): only ~basic_ios runs.
    movq    %rax, -48(%rbp)         ## 8-byte Spill
    jmp LBB0_22
LBB0_17:
Ltmp15:
    ## Pad for call site 4 (the two << calls and str()): full destruction.
    movq    %rax, -48(%rbp)         ## 8-byte Spill
    movq    %r13, -368(%rbp)
    movq    %rbx, -256(%rbp)
    movq    %r15, -360(%rbp)
LBB0_18:
    testb   $1, -296(%rbp)
    je  LBB0_20
LBB0_19:
    movq    -280(%rbp), %rdi
    callq   __ZdlPv
LBB0_20:
    movq    %r12, %rdi
    callq   __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev
LBB0_21:
    movq    __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %rsi
    addq    $8, %rsi
    leaq    -368(%rbp), %rdi
    callq   __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev
LBB0_22:
    movq    %r14, %rdi
    callq   __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev
    movq    -48(%rbp), %rdi         ## 8-byte Reload
    callq   __Unwind_Resume
Lfunc_end0:
    .cfi_endproc
    ## ---------------------------------------------------------------------
    ## Exception-handling table for __Z18ostringstream_testv, referenced from
    ## the function by `.cfi_lsda 16, Lexception0`.
    ## After the header bytes comes the call-site table. Each entry is three
    ## 4-byte values (region start relative to Lfunc_begin0, region length,
    ## landing pad relative to Lfunc_begin0, or 0 for none) plus a 1-byte
    ## action. Five entries of 13 bytes give the table length of 65.
    ## Every action byte is 0, which the emitted comments label as "cleanup".
    ## ---------------------------------------------------------------------
    .section    __TEXT,__gcc_except_tab
    .p2align    2
GCC_except_table0:
Lexception0:
    .byte   255                     ## @LPStart Encoding = omit
    .byte   155                     ## @TType Encoding = indirect pcrel sdata4
    .asciz  "\303\200"              ## @TType base offset
    .byte   3                       ## Call site Encoding = udata4
    .byte   65                      ## Call site table length
Lset0 = Ltmp0-Lfunc_begin0              ## >> Call Site 1 <<
    .long   Lset0
Lset1 = Ltmp1-Ltmp0                     ##   Call between Ltmp0 and Ltmp1
    .long   Lset1
Lset2 = Ltmp2-Lfunc_begin0              ##     jumps to Ltmp2
    .long   Lset2
    .byte   0                       ##   On action: cleanup
Lset3 = Ltmp3-Lfunc_begin0              ## >> Call Site 2 <<
    .long   Lset3
Lset4 = Ltmp4-Ltmp3                     ##   Call between Ltmp3 and Ltmp4
    .long   Lset4
Lset5 = Ltmp5-Lfunc_begin0              ##     jumps to Ltmp5
    .long   Lset5
    .byte   0                       ##   On action: cleanup
Lset6 = Ltmp6-Lfunc_begin0              ## >> Call Site 3 <<
    .long   Lset6
Lset7 = Ltmp7-Ltmp6                     ##   Call between Ltmp6 and Ltmp7
    .long   Lset7
Lset8 = Ltmp8-Lfunc_begin0              ##     jumps to Ltmp8
    .long   Lset8
    .byte   0                       ##   On action: cleanup
    ## Call site 4 is one region from Ltmp9 to Ltmp14, so a single landing
    ## pad (Ltmp15) covers all three calls made between those labels.
Lset9 = Ltmp9-Lfunc_begin0              ## >> Call Site 4 <<
    .long   Lset9
Lset10 = Ltmp14-Ltmp9                   ##   Call between Ltmp9 and Ltmp14
    .long   Lset10
Lset11 = Ltmp15-Lfunc_begin0            ##     jumps to Ltmp15
    .long   Lset11
    .byte   0                       ##   On action: cleanup
    ## Call site 5 covers the rest of the function (Ltmp14 to Lfunc_end0)
    ## and has a landing-pad value of 0.
Lset12 = Ltmp14-Lfunc_begin0            ## >> Call Site 5 <<
    .long   Lset12
Lset13 = Lfunc_end0-Ltmp14              ##   Call between Ltmp14 and Lfunc_end0
    .long   Lset13
    .long   0                       ##     has no landing pad
    .byte   0                       ##   On action: cleanup
    .p2align    2

1 个答案:

答案 0 :(得分:2)

差异的最可能原因是:IOStream 的实现被内联展开了,而 sprintf() 的使用只是一次函数调用。没有什么能阻止把 IOStream 的实现放进库里。不过,这确实需要一点抽象和规划:标准中的定义使用的是模板,而模板通常只是以内联方式实现。要把常用的实例化(针对字符类型 char 和 wchar_t)声明为 extern 模板并显式实例化它们,则需要额外的工作。我很久以前就证明过这样做在编译时间上是划算的,而且至少 libstdc++ 确实在库中预先实现了 IOStreams 的函数。根据你的实验,libc++ 似乎没有这样做。