Why does this code take so much longer to run in Go vs Java?

Time: 2018-08-22 01:20:08

Tags: java performance go

I recently started using Go, coming from being a long-time Java devotee.

I've been comparing the two languages in different ways, and was surprised that a trivial loop (counting up to 20 billion) takes dramatically longer in Go than in Java.

I'm wondering whether anyone can offer insight into what I might be missing here. This is what I did:

Java

Wrote the following code, executed it from a regular main() method, built an executable jar with Gradle, and ran it from the command line with: java -jar build/libs/my-executable.jar

private void countToTwentyBillion() {
    long count = 0;
    long start = System.currentTimeMillis();
    for (int k = 0; k < 10; k++) {
        System.out.println("On step " + k);
        for (int i = 0; i < 2_000_000_000; i++) {
            // Do nothing but count
            count++;
        }
    }
    long end = System.currentTimeMillis();
    System.out.println("Total time took: " + (end - start) + " ms to get at count: " + count);
}

Across 3 separate trials, I got the following results:

// Total time took: 396 ms to get at count: 20000000000
// Total time took: 393 ms to get at count: 20000000000
// Total time took: 388 ms to get at count: 20000000000
// 392 ms average

Go

Wrote this file in Go, built it with "go build", and ran it from the command line with ./loop-counter:

package main

import (
    "fmt"
    "time"
)

func main() {
    count := 0

    nanos := time.Now().UnixNano()
    start := nanos / 1000000

    for i := 0; i < 10; i++ {
        fmt.Printf("On step %d\n", i)
        for k := 0; k < 2000000000; k++ {
            count++
        }
    }
    nanos = time.Now().UnixNano()
    end := nanos / 1000000

    timeLength := end - start

    fmt.Printf("Total time took: %d ms to get at count: %d\n", timeLength, count)

}

After 3 separate trials, I got the following results:

// Total time took: 5812 ms to get at count: 20000000000
// Total time took: 5834 ms to get at count: 20000000000
// Total time took: 5907 ms to get at count: 20000000000
// 5,851 ms average

I expected Go to be faster and ended up surprised. All trials were run on the same machine under the same conditions.

Can anyone tell what's going on here?

Thanks

2 Answers:

Answer 0 (score: 8)

I'm not a Go expert, but Java is clearly optimizing this loop.

Assume you have a single-core 3 GHz processor, which gives you roughly 0.3 ns per instruction, and assume each increment is one instruction. Then 0.3 ns * 20 billion ≈ 6 s, which is a rough estimate of the performance without any optimization.
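
Spelling out that back-of-the-envelope arithmetic (the 3 GHz clock and the one-instruction-per-increment figure are assumptions, not measurements):

public class LoopEstimate {
    public static void main(String[] args) {
        final long increments = 20_000_000_000L;   // 10 outer iterations * 2_000_000_000 inner
        final double nsPerIncrement = 0.3;         // assumed: ~3 GHz core, one instruction per cycle
        double seconds = increments * nsPerIncrement / 1_000_000_000.0;
        System.out.printf("Unoptimized estimate: ~%.0f s%n", seconds); // prints ~6 s
    }
}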

You can verify that Java is cheating a little here by running the program with -XX:LoopUnrollLimit=1 (for example, adding it to the java -jar command above). That tells the JVM to do essentially no loop unrolling, which prevents most of the JIT optimizations from applying to your example.

With that flag, the runtime of your Java example is about 6 s on my machine, comparable to the Go benchmark.

There may also be an option in the Go version to enable optimizations such as loop unrolling (consult the Go documentation).

Finally, this shows once again that micro-benchmarks are hard to set up correctly. They often deceive you into believing something other than what is actually happening.
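
As an aside, a more robust way to measure something like this on the JVM is a benchmarking harness such as JMH. A minimal sketch (assuming the org.openjdk.jmh artifacts are on the classpath; the class and method names are just illustrative):

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.infra.Blackhole;

public class CountLoopBenchmark {

    // JMH takes care of warm-up, forking and repetition; passing the result to
    // Blackhole.consume keeps the JIT from discarding the loop as dead code.
    @Benchmark
    public void countTwoBillion(Blackhole bh) {
        long count = 0;
        for (int i = 0; i < 2_000_000_000; i++) {
            count++;
        }
        bh.consume(count);
    }
}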

Answer 1 (score: 0)

Here are some observations I've made. I'm going to show some Intel-syntax assembly obtained by compiling this program; I'm using Compiler Explorer. You don't need to understand much assembly to follow along; the most important element here is size, and bigger means slower. I would trim it down if I could, but the generated code is surprisingly large and I don't know Go well enough to tell what is junk. If you want to see what each statement translates to in the assembly, Compiler Explorer will highlight everything for you.

TL;DR

I think the Go compiler's output is a disastrous mess, the C++ code gets optimized very well, and the Java bytecode is tiny compared to the Go output. JIT'ing could make a significant difference for the Java code, though it might also be too complex for the loop-unrolling/inlining optimization (precomputing the value of count) to kick in.


The Go code compiles down to this monstrosity:

text    "".main(SB), $224-0
movq    (TLS), CX
leaq    -96(SP), AX
cmpq    AX, 16(CX)
jls     835
subq    $224, SP
movq    BP, 216(SP)
leaq    216(SP), BP
funcdata        $0, gclocals·f6bd6b3389b872033d462029172c8612(SB)
funcdata        $1, gclocals·17283ea8379a997487dd6f8baf7ae6ea(SB)
pcdata  $0, $0
call    time.Now(SB)
movq    16(SP), AX
movq    8(SP), CX
movq    (SP), DX
movq    DX, time.t·2+160(SP)
movq    CX, time.t·2+168(SP)
movq    AX, time.t·2+176(SP)
movq    time.t·2+160(SP), AX
movq    AX, CX
shrq    $63, AX
shlq    $63, AX
testq   $-1, AX
jeq     806
movq    CX, DX
shlq    $1, CX
shrq    $31, CX
movq    $59453308800, BX
addq    BX, CX
andq    $1073741823, DX
movlqsx DX, DX
imulq   $1000000000, CX
addq    DX, CX
movq    $-6795364578871345152, DX
addq    DX, CX
movq    $4835703278458516699, AX
imulq   CX
sarq    $63, CX
sarq    $18, DX
subq    CX, DX
movq    DX, "".start+72(SP)
xorl    AX, AX
movq    AX, CX
jmp     257
incq    CX
incq    AX
cmpq    CX, $2000000000
jlt     213
movq    "".i+80(SP), SI
incq    SI
movq    "".start+72(SP), DX
movq    $59453308800, BX
movq    AX, CX
movq    SI, AX
movq    CX, "".count+88(SP)
cmpq    AX, $10
jge     404
movq    AX, "".i+80(SP)
movq    AX, ""..autotmp_24+112(SP)
xorps   X0, X0
movups  X0, ""..autotmp_23+120(SP)
leaq    type.int(SB), CX
movq    CX, (SP)
leaq    ""..autotmp_24+112(SP), DX
movq    DX, 8(SP)
pcdata  $0, $1
call    runtime.convT2E64(SB)
movq    24(SP), AX
movq    16(SP), CX
movq    CX, ""..autotmp_23+120(SP)
movq    AX, ""..autotmp_23+128(SP)
leaq    go.string."On step %d\n"(SB), AX
movq    AX, (SP)
movq    $11, 8(SP)
leaq    ""..autotmp_23+120(SP), CX
movq    CX, 16(SP)
movq    $1, 24(SP)
movq    $1, 32(SP)
pcdata  $0, $1
call    fmt.Printf(SB)
movq    "".count+88(SP), AX
xorl    CX, CX
jmp     219
pcdata  $0, $2
call    time.Now(SB)
movq    16(SP), AX
movq    8(SP), CX
movq    (SP), DX
movq    DX, time.t·2+136(SP)
movq    CX, time.t·2+144(SP)
movq    AX, time.t·2+152(SP)
movq    time.t·2+136(SP), AX
movq    AX, CX
shrq    $63, AX
shlq    $63, AX
testq   $-1, AX
jeq     787
movq    CX, DX
shlq    $1, CX
shrq    $31, CX
movq    $59453308800, BX
addq    BX, CX
imulq   $1000000000, CX
andq    $1073741823, DX
movlqsx DX, DX
addq    DX, CX
movq    $-6795364578871345152, DX
leaq    (DX)(CX*1), AX
movq    AX, "".~R0+64(SP)
movq    $4835703278458516699, CX
imulq   CX
sarq    $18, DX
movq    "".~R0+64(SP), CX
sarq    $63, CX
subq    CX, DX
movq    "".start+72(SP), CX
subq    CX, DX
movq    DX, ""..autotmp_29+104(SP)
movq    "".count+88(SP), CX
movq    CX, ""..autotmp_30+96(SP)
xorps   X0, X0
movups  X0, ""..autotmp_28+184(SP)
movups  X0, ""..autotmp_28+200(SP)
leaq    type.int64(SB), CX
movq    CX, (SP)
leaq    ""..autotmp_29+104(SP), CX
movq    CX, 8(SP)
pcdata  $0, $3
call    runtime.convT2E64(SB)
movq    16(SP), CX
movq    24(SP), DX
movq    CX, ""..autotmp_28+184(SP)
movq    DX, ""..autotmp_28+192(SP)
leaq    type.int(SB), CX
movq    CX, (SP)
leaq    ""..autotmp_30+96(SP), CX
movq    CX, 8(SP)
pcdata  $0, $3
call    runtime.convT2E64(SB)
movq    24(SP), CX
movq    16(SP), DX
movq    DX, ""..autotmp_28+200(SP)
movq    CX, ""..autotmp_28+208(SP)
leaq    go.string."Total time took: %d to get at count: %d\n"(SB), CX
movq    CX, (SP)
movq    $40, 8(SP)
leaq    ""..autotmp_28+184(SP), CX
movq    CX, 16(SP)
movq    $2, 24(SP)
movq    $2, 32(SP)
pcdata  $0, $3
call    fmt.Printf(SB)
movq    216(SP), BP
addq    $224, SP
ret
movq    time.t·2+144(SP), BX
movq    CX, DX
movq    BX, CX
jmp     501
movq    time.t·2+168(SP), SI
movq    CX, DX
movq    $59453308800, BX
movq    SI, CX
jmp     144
nop
pcdata  $0, $-1
call    runtime.morestack_noctxt(SB)
jmp     0
text    "".init(SB), $8-0
movq    (TLS), CX
cmpq    SP, 16(CX)
jls     89
subq    $8, SP
movq    BP, (SP)
leaq    (SP), BP
funcdata        $0, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
funcdata        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
movblzx "".initdone·(SB), AX
cmpb    AL, $1
jls     47
movq    (SP), BP
addq    $8, SP
ret
jne     56
pcdata  $0, $0
call    runtime.throwinit(SB)
undef
movb    $1, "".initdone·(SB)
pcdata  $0, $0
call    fmt.init(SB)
pcdata  $0, $0
call    time.init(SB)
movb    $2, "".initdone·(SB)
movq    (SP), BP
addq    $8, SP
ret
nop
pcdata  $0, $-1
call    runtime.morestack_noctxt(SB)
jmp     0
text    type..hash.[2]interface {}(SB), DUPOK, $40-24
movq    (TLS), CX
cmpq    SP, 16(CX)
jls     103
subq    $40, SP
movq    BP, 32(SP)
leaq    32(SP), BP
funcdata        $0, gclocals·d4dc2f11db048877dbc0f60a22b4adb3(SB)
funcdata        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
xorl    AX, AX
movq    "".h+56(SP), CX
jmp     82
movq    AX, "".i+24(SP)
shlq    $4, AX
movq    "".p+48(SP), BX
addq    BX, AX
movq    AX, (SP)
movq    CX, 8(SP)
pcdata  $0, $0
call    runtime.nilinterhash(SB)
movq    16(SP), CX
movq    "".i+24(SP), AX
incq    AX
cmpq    AX, $2
jlt     38
movq    CX, "".~r2+64(SP)
movq    32(SP), BP
addq    $40, SP
ret
nop
pcdata  $0, $-1
call    runtime.morestack_noctxt(SB)
jmp     0
text    type..eq.[2]interface {}(SB), DUPOK, $48-24
movq    (TLS), CX
cmpq    SP, 16(CX)
jls     155
subq    $48, SP
movq    BP, 40(SP)
leaq    40(SP), BP
funcdata        $0, gclocals·8f9cec06d1ae35cc9900c511c5e4bdab(SB)
funcdata        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
xorl    AX, AX
jmp     46
movq    ""..autotmp_8+32(SP), CX
leaq    1(CX), AX
cmpq    AX, $2
jge     140
movq    AX, CX
shlq    $4, AX
movq    "".p+56(SP), DX
movq    8(AX)(DX*1), BX
movq    (AX)(DX*1), SI
movq    "".q+64(SP), DI
movq    8(AX)(DI*1), R8
movq    (AX)(DI*1), AX
cmpq    SI, AX
jne     125
movq    CX, ""..autotmp_8+32(SP)
movq    SI, (SP)
movq    BX, 8(SP)
movq    R8, 16(SP)
pcdata  $0, $0
call    runtime.efaceeq(SB)
movblzx 24(SP), AX
testb   AL, AL
jne     37
movb    $0, "".~r2+72(SP)
movq    40(SP), BP
addq    $48, SP
ret
movb    $1, "".~r2+72(SP)
movq    40(SP), BP
addq    $48, SP
ret
nop
pcdata  $0, $-1
call    runtime.morestack_noctxt(SB)
jmp     0

I have no idea what most of this is doing. I can only hope that most of it is some kind of GC code. I looked into how to enable optimizations for the Go compiler, and all I could find was how to disable them.

For comparison, I looked at a similar function in C++:

#include <cstdio>
#include <chrono>
#include <cinttypes>

using namespace std::chrono;


milliseconds getMS()
{
    return duration_cast< milliseconds >(
        system_clock::now().time_since_epoch()
    );
}

int main()
{
    int count = 0;
    milliseconds millis = getMS();

    for(int i = 0; i < 10; ++i)
    {
        printf("On step %d\n", i);
        for(int j = 0; j < 2000000000; ++j)
        {
            ++count;
        }
    }

    milliseconds time = getMS() - millis;

    printf("Total time took: %" PRId64 " to get at count: %d\n", time.count(), count);
}

Without optimizations, this compiles to (compiler: x86-64 clang trunk (probably 6.0.0), flags: -std=c++0x -O0):

main:                                   # @main
        push    rbp
        mov     rbp, rsp
        sub     rsp, 48
        mov     dword ptr [rbp - 4], 0
        mov     dword ptr [rbp - 8], 0
        call    getMS()
        mov     qword ptr [rbp - 16], rax
        mov     dword ptr [rbp - 20], 0
.LBB3_1:                                # =>This Loop Header: Depth=1
        cmp     dword ptr [rbp - 20], 10
        jge     .LBB3_8
        mov     esi, dword ptr [rbp - 20]
        movabs  rdi, offset .L.str
        mov     al, 0
        call    printf
        mov     dword ptr [rbp - 24], 0
        mov     dword ptr [rbp - 44], eax # 4-byte Spill
.LBB3_3:                                #   Parent Loop BB3_1 Depth=1
        cmp     dword ptr [rbp - 24], 2000000000
        jge     .LBB3_6
        mov     eax, dword ptr [rbp - 8]
        add     eax, 1
        mov     dword ptr [rbp - 8], eax
        mov     eax, dword ptr [rbp - 24]
        add     eax, 1
        mov     dword ptr [rbp - 24], eax
        jmp     .LBB3_3
.LBB3_6:                                #   in Loop: Header=BB3_1 Depth=1
        jmp     .LBB3_7
.LBB3_7:                                #   in Loop: Header=BB3_1 Depth=1
        mov     eax, dword ptr [rbp - 20]
        add     eax, 1
        mov     dword ptr [rbp - 20], eax
        jmp     .LBB3_1
.LBB3_8:
        call    getMS()
        mov     qword ptr [rbp - 40], rax
        lea     rdi, [rbp - 40]
        lea     rsi, [rbp - 16]
        call    std::common_type<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000l>, long, std::ratio<1l, 1000l> >(std::chrono::duration<long, std::ratio<1l, 1000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&)
        mov     qword ptr [rbp - 32], rax
        lea     rdi, [rbp - 32]
        call    std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const
        mov     edx, dword ptr [rbp - 8]
        movabs  rdi, offset .L.str.1
        mov     rsi, rax
        mov     al, 0
        call    printf
        mov     edx, dword ptr [rbp - 4]
        mov     dword ptr [rbp - 48], eax # 4-byte Spill
        mov     eax, edx
        add     rsp, 48
        pop     rbp
        ret
.L.str:
        .asciz  "On step %d\n"

.L.str.1:
        .asciz  "Total time took: %ld to get at count: %d\n"

There is actually more code than this, but the rest is just the chrono implementation; in the optimized code it collapses into a library function call. I also removed the implementation of getMS, since it is mostly a wrapper.

With O1 (size) optimization, the result becomes:

main:                                   # @main
        push    rbx
        sub     rsp, 32
        call    getMS()
        mov     qword ptr [rsp + 24], rax
        xor     ebx, ebx
.LBB3_1:                                # =>This Inner Loop Header: Depth=1
        mov     edi, offset .L.str
        xor     eax, eax
        mov     esi, ebx
        call    printf
        add     ebx, 1
        cmp     ebx, 10
        jne     .LBB3_1
        call    getMS()
        mov     qword ptr [rsp + 8], rax
        lea     rdi, [rsp + 8]
        lea     rsi, [rsp + 24]
        call    std::common_type<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000l>, long, std::ratio<1l, 1000l> >(std::chrono::duration<long, std::ratio<1l, 1000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&)
        mov     qword ptr [rsp + 16], rax
        lea     rdi, [rsp + 16]
        call    std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const
        mov     rcx, rax
        mov     edi, offset .L.str.1
        mov     edx, -1474836480
        xor     eax, eax
        mov     rsi, rcx
        call    printf
        xor     eax, eax
        add     rsp, 32
        pop     rbx
        ret
.L.str:
        .asciz  "On step %d\n"

.L.str.1:
        .asciz  "Total time took: %ld to get at count: %d\n"

The O2 (speed) and O3 (max) optimizations essentially boil down to an unrolled outer loop (which only exists because of the print statements) and a precomputed count value.

This mostly shows the horrendous code Go generates and some of the optimizations happening in C++. None of it, however, shows exactly what the Java bytecode contains, or what the JIT would turn it into if it were run enough times. Here is the Java bytecode:

public static void countToTwentyBillion();
    Code:
       0: lconst_0
       1: lstore_0
       2: invokestatic  #2                // Method java/lang/System.currentTimeMillis:()J
       5: lstore_2
       6: iconst_0
       7: istore        4
       9: iload         4
      11: bipush        10
      13: if_icmpge     68
      16: getstatic     #3                // Field java/lang/System.out:Ljava/io/PrintStream;
      19: new           #4                // class java/lang/StringBuilder
      22: dup
      23: invokespecial #5                // Method java/lang/StringBuilder."<init>":()V
      26: ldc           #6                // String On step
      28: invokevirtual #7                // Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
      31: iload         4
      33: invokevirtual #8                // Method java/lang/StringBuilder.append:(I)Ljava/lang/StringBuilder;
      36: invokevirtual #9                // Method java/lang/StringBuilder.toString:()Ljava/lang/String;
      39: invokevirtual #10               // Method java/io/PrintStream.println:(Ljava/lang/String;)V
      42: iconst_0
      43: istore        5
      45: iload         5
      47: ldc           #11               // int 2000000000
      49: if_icmpge     62
      52: lload_0
      53: lconst_1
      54: ladd
      55: lstore_0
      56: iinc          5, 1
      59: goto          45
      62: iinc          4, 1
      65: goto          9
      68: invokestatic  #2                // Method java/lang/System.currentTimeMillis:()J
      71: lstore        4
      73: getstatic     #3                // Field java/lang/System.out:Ljava/io/PrintStream;
      76: new           #4                // class java/lang/StringBuilder
      79: dup
      80: invokespecial #5                // Method java/lang/StringBuilder."<init>":()V
      83: ldc           #12               // String Total time took:
      85: invokevirtual #7                // Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
      88: lload         4
      90: lload_2
      91: lsub
      92: invokevirtual #13               // Method java/lang/StringBuilder.append:(J)Ljava/lang/StringBuilder;
      95: ldc           #14               // String  ms to get at count:
      97: invokevirtual #7                // Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
     100: lload_0
     101: invokevirtual #13               // Method java/lang/StringBuilder.append:(J)Ljava/lang/StringBuilder;
     104: invokevirtual #9                // Method java/lang/StringBuilder.toString:()Ljava/lang/String;
     107: invokevirtual #10               // Method java/io/PrintStream.println:(Ljava/lang/String;)V
     110: return

Unfortunately I don't feel like building hsdis and looking at the JIT'ed code right now, but it would probably end up looking much like the C++ examples. As far as I understand, the JIT may well be able to precompute the value of count. However, this code is a bit more complex (in terms of the loops), which might make a quick JIT optimization harder.
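
To make that concrete, here is a rough hand-written sketch (my own illustration, not actual JIT output) of what the method effectively becomes once a compiler proves the inner loop has no effect other than producing the final value of count:

private void countToTwentyBillionFolded() {
    long start = System.currentTimeMillis();
    for (int k = 0; k < 10; k++) {
        System.out.println("On step " + k);   // the printing still has to happen
    }
    long count = 20_000_000_000L;             // 10 * 2_000_000_000, folded to a constant
    long end = System.currentTimeMillis();
    System.out.println("Total time took: " + (end - start) + " ms to get at count: " + count);
}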