I recently started using Go, coming from a strong Java background.
I have been comparing the two languages in different ways and was surprised that a simple loop (up to 20 billion iterations) takes dramatically longer in Go than in Java.
I am wondering if anyone can offer insight into what I might be missing here. This is what I did:
Wrote the Java code below, ran it from a regular main()
method, built an executable jar with Gradle, and executed it from the command line with: java -jar build/libs/my-executable.jar
private void countToTwentyBillion() {
    long count = 0;
    long start = System.currentTimeMillis();
    for (int k = 0; k < 10; k++) {
        System.out.println("On step " + k);
        for (int i = 0; i < 2_000_000_000; i++) {
            // Do nothing but count
            count++;
        }
    }
    long end = System.currentTimeMillis();
    System.out.println("Total time took: " + (end - start) + " ms to get at count: " + count);
}
Across 3 separate trials, I got the following results:
// Total time took: 396 ms to get at count: 20000000000
// Total time took: 393 ms to get at count: 20000000000
// Total time took: 388 ms to get at count: 20000000000
// 392 ms average
Wrote the Go file below, compiled it with go build, and ran it from the command line with ./loop-counter:
package main

import (
    "fmt"
    "time"
)

func main() {
    count := 0
    nanos := time.Now().UnixNano()
    start := nanos / 1000000
    for i := 0; i < 10; i++ {
        fmt.Printf("On step %d\n", i)
        for k := 0; k < 2000000000; k++ {
            count++
        }
    }
    nanos = time.Now().UnixNano()
    end := nanos / 1000000
    timeLength := end - start
    fmt.Printf("Total time took: %d ms to get at count: %d\n", timeLength, count)
}
After 3 separate trials, I got these results:
// Total time took: 5812 ms to get at count: 20000000000
// Total time took: 5834 ms to get at count: 20000000000
// Total time took: 5907 ms to get at count: 20000000000
// 5,851 ms average
I expected Go to be faster and ended up surprised. All trials were run on the same machine under the same conditions.
Can anyone tell what is going on here?
Thanks
Answer 0 (score: 8)
I am no Go expert, but Java is clearly optimizing the loop here.
Say you have a single-core 3 GHz processor; that gives you roughly 0.3 ns per instruction, and assume each increment is one instruction. Then 0.3 ns * 20 billion = 6 s, which is a rough estimate of the performance you would expect without any optimization.
You can verify that Java is cheating a bit here by passing -XX:LoopUnrollLimit=1
to the JVM (for example: java -XX:LoopUnrollLimit=1 -jar build/libs/my-executable.jar). This tells the JVM to do barely any loop unrolling, which prevents most of the JIT optimizations from happening in your example.
With that flag, the runtime of your Java example is now around 6 s on my machine, comparable to the Go benchmark.
There may also be an option in Go to enable optimizations such as loop unrolling (consult the Go documentation).
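I am not aware of a comparable flag for the Go compiler, but as a quick experiment of my own (a sketch, not something I have measured) you can unroll the inner loop by hand and see whether unrolling alone narrows the gap. It performs the same 20 billion increments while doing only a quarter of the loop-control compares and branches:

package main

import (
    "fmt"
    "time"
)

func main() {
    count := 0
    start := time.Now().UnixNano() / 1000000
    for i := 0; i < 10; i++ {
        fmt.Printf("On step %d\n", i)
        // Inner loop manually unrolled by 4: the step is 4 and the body
        // does 4 increments, so each outer step still adds 2,000,000,000.
        for k := 0; k < 2000000000; k += 4 {
            count++
            count++
            count++
            count++
        }
    }
    end := time.Now().UnixNano() / 1000000
    fmt.Printf("Total time took: %d ms to get at count: %d\n", end-start, count)
}

If that alone does not close the gap, the difference is more likely the JIT eliminating the loop entirely rather than merely unrolling it.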
In the end, this shows once more that microbenchmarks are tricky to set up correctly. They often trick you into believing things that are not true.
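For what it is worth, a somewhat more trustworthy way to measure something like this in Go is the benchmark support built into the standard testing package, which picks the iteration count for you and reports the time per operation. A minimal sketch of my own (the file and function names are mine, not from the question); run it with go test -bench=. in the package directory:

// loop_bench_test.go
package main

import "testing"

// Store the result in a package-level variable so the compiler
// cannot simply discard the counting loop.
var sink int64

func BenchmarkIncrement(b *testing.B) {
    var count int64
    for i := 0; i < b.N; i++ {
        count++
    }
    sink = count
}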
Answer 1 (score: 0)
Here are some observations I made. I will show some of the Intel-syntax assembly you get from compiling this program; I am using Compiler Explorer. You do not need to understand much assembly to follow along; the most important factor here is size, and bigger generally means slower. I would trim it down if I could, but the generated code is surprisingly huge, and I do not know enough Go to tell what is useless. If you want to see what each statement translates to in assembly, Compiler Explorer will highlight it all for you.
TL;DR:
I think the Go compiler output is a catastrophic mess, the C++ code gets optimized very well, and the Java bytecode is tiny compared to the Go output. JIT'ing probably has a significant effect on the Java code, although the code may also be a bit too complex for the loop-unrolling/inlining optimization (precomputing the value of count).
The Go code gets compiled into this monstrosity:
text "".main(SB), $224-0
movq (TLS), CX
leaq -96(SP), AX
cmpq AX, 16(CX)
jls 835
subq $224, SP
movq BP, 216(SP)
leaq 216(SP), BP
funcdata $0, gclocals·f6bd6b3389b872033d462029172c8612(SB)
funcdata $1, gclocals·17283ea8379a997487dd6f8baf7ae6ea(SB)
pcdata $0, $0
call time.Now(SB)
movq 16(SP), AX
movq 8(SP), CX
movq (SP), DX
movq DX, time.t·2+160(SP)
movq CX, time.t·2+168(SP)
movq AX, time.t·2+176(SP)
movq time.t·2+160(SP), AX
movq AX, CX
shrq $63, AX
shlq $63, AX
testq $-1, AX
jeq 806
movq CX, DX
shlq $1, CX
shrq $31, CX
movq $59453308800, BX
addq BX, CX
andq $1073741823, DX
movlqsx DX, DX
imulq $1000000000, CX
addq DX, CX
movq $-6795364578871345152, DX
addq DX, CX
movq $4835703278458516699, AX
imulq CX
sarq $63, CX
sarq $18, DX
subq CX, DX
movq DX, "".start+72(SP)
xorl AX, AX
movq AX, CX
jmp 257
incq CX
incq AX
cmpq CX, $2000000000
jlt 213
movq "".i+80(SP), SI
incq SI
movq "".start+72(SP), DX
movq $59453308800, BX
movq AX, CX
movq SI, AX
movq CX, "".count+88(SP)
cmpq AX, $10
jge 404
movq AX, "".i+80(SP)
movq AX, ""..autotmp_24+112(SP)
xorps X0, X0
movups X0, ""..autotmp_23+120(SP)
leaq type.int(SB), CX
movq CX, (SP)
leaq ""..autotmp_24+112(SP), DX
movq DX, 8(SP)
pcdata $0, $1
call runtime.convT2E64(SB)
movq 24(SP), AX
movq 16(SP), CX
movq CX, ""..autotmp_23+120(SP)
movq AX, ""..autotmp_23+128(SP)
leaq go.string."On step %d\n"(SB), AX
movq AX, (SP)
movq $11, 8(SP)
leaq ""..autotmp_23+120(SP), CX
movq CX, 16(SP)
movq $1, 24(SP)
movq $1, 32(SP)
pcdata $0, $1
call fmt.Printf(SB)
movq "".count+88(SP), AX
xorl CX, CX
jmp 219
pcdata $0, $2
call time.Now(SB)
movq 16(SP), AX
movq 8(SP), CX
movq (SP), DX
movq DX, time.t·2+136(SP)
movq CX, time.t·2+144(SP)
movq AX, time.t·2+152(SP)
movq time.t·2+136(SP), AX
movq AX, CX
shrq $63, AX
shlq $63, AX
testq $-1, AX
jeq 787
movq CX, DX
shlq $1, CX
shrq $31, CX
movq $59453308800, BX
addq BX, CX
imulq $1000000000, CX
andq $1073741823, DX
movlqsx DX, DX
addq DX, CX
movq $-6795364578871345152, DX
leaq (DX)(CX*1), AX
movq AX, "".~R0+64(SP)
movq $4835703278458516699, CX
imulq CX
sarq $18, DX
movq "".~R0+64(SP), CX
sarq $63, CX
subq CX, DX
movq "".start+72(SP), CX
subq CX, DX
movq DX, ""..autotmp_29+104(SP)
movq "".count+88(SP), CX
movq CX, ""..autotmp_30+96(SP)
xorps X0, X0
movups X0, ""..autotmp_28+184(SP)
movups X0, ""..autotmp_28+200(SP)
leaq type.int64(SB), CX
movq CX, (SP)
leaq ""..autotmp_29+104(SP), CX
movq CX, 8(SP)
pcdata $0, $3
call runtime.convT2E64(SB)
movq 16(SP), CX
movq 24(SP), DX
movq CX, ""..autotmp_28+184(SP)
movq DX, ""..autotmp_28+192(SP)
leaq type.int(SB), CX
movq CX, (SP)
leaq ""..autotmp_30+96(SP), CX
movq CX, 8(SP)
pcdata $0, $3
call runtime.convT2E64(SB)
movq 24(SP), CX
movq 16(SP), DX
movq DX, ""..autotmp_28+200(SP)
movq CX, ""..autotmp_28+208(SP)
leaq go.string."Total time took: %d to get at count: %d\n"(SB), CX
movq CX, (SP)
movq $40, 8(SP)
leaq ""..autotmp_28+184(SP), CX
movq CX, 16(SP)
movq $2, 24(SP)
movq $2, 32(SP)
pcdata $0, $3
call fmt.Printf(SB)
movq 216(SP), BP
addq $224, SP
ret
movq time.t·2+144(SP), BX
movq CX, DX
movq BX, CX
jmp 501
movq time.t·2+168(SP), SI
movq CX, DX
movq $59453308800, BX
movq SI, CX
jmp 144
nop
pcdata $0, $-1
call runtime.morestack_noctxt(SB)
jmp 0
text "".init(SB), $8-0
movq (TLS), CX
cmpq SP, 16(CX)
jls 89
subq $8, SP
movq BP, (SP)
leaq (SP), BP
funcdata $0, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
funcdata $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
movblzx "".initdone·(SB), AX
cmpb AL, $1
jls 47
movq (SP), BP
addq $8, SP
ret
jne 56
pcdata $0, $0
call runtime.throwinit(SB)
undef
movb $1, "".initdone·(SB)
pcdata $0, $0
call fmt.init(SB)
pcdata $0, $0
call time.init(SB)
movb $2, "".initdone·(SB)
movq (SP), BP
addq $8, SP
ret
nop
pcdata $0, $-1
call runtime.morestack_noctxt(SB)
jmp 0
text type..hash.[2]interface {}(SB), DUPOK, $40-24
movq (TLS), CX
cmpq SP, 16(CX)
jls 103
subq $40, SP
movq BP, 32(SP)
leaq 32(SP), BP
funcdata $0, gclocals·d4dc2f11db048877dbc0f60a22b4adb3(SB)
funcdata $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
xorl AX, AX
movq "".h+56(SP), CX
jmp 82
movq AX, "".i+24(SP)
shlq $4, AX
movq "".p+48(SP), BX
addq BX, AX
movq AX, (SP)
movq CX, 8(SP)
pcdata $0, $0
call runtime.nilinterhash(SB)
movq 16(SP), CX
movq "".i+24(SP), AX
incq AX
cmpq AX, $2
jlt 38
movq CX, "".~r2+64(SP)
movq 32(SP), BP
addq $40, SP
ret
nop
pcdata $0, $-1
call runtime.morestack_noctxt(SB)
jmp 0
text type..eq.[2]interface {}(SB), DUPOK, $48-24
movq (TLS), CX
cmpq SP, 16(CX)
jls 155
subq $48, SP
movq BP, 40(SP)
leaq 40(SP), BP
funcdata $0, gclocals·8f9cec06d1ae35cc9900c511c5e4bdab(SB)
funcdata $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
xorl AX, AX
jmp 46
movq ""..autotmp_8+32(SP), CX
leaq 1(CX), AX
cmpq AX, $2
jge 140
movq AX, CX
shlq $4, AX
movq "".p+56(SP), DX
movq 8(AX)(DX*1), BX
movq (AX)(DX*1), SI
movq "".q+64(SP), DI
movq 8(AX)(DI*1), R8
movq (AX)(DI*1), AX
cmpq SI, AX
jne 125
movq CX, ""..autotmp_8+32(SP)
movq SI, (SP)
movq BX, 8(SP)
movq R8, 16(SP)
pcdata $0, $0
call runtime.efaceeq(SB)
movblzx 24(SP), AX
testb AL, AL
jne 37
movb $0, "".~r2+72(SP)
movq 40(SP), BP
addq $48, SP
ret
movb $1, "".~r2+72(SP)
movq 40(SP), BP
addq $48, SP
ret
nop
pcdata $0, $-1
call runtime.morestack_noctxt(SB)
jmp 0
I have no idea what most of this is doing. I can only hope that most of it is some kind of GC code. I looked into how to enable optimizations for the Go compiler, and all I could find was how to disable them.
For comparison, I looked at a similar function in C++:
#include <cstdio>
#include <chrono>
#include <cinttypes>

using namespace std::chrono;

milliseconds getMS()
{
    return duration_cast< milliseconds >(
        system_clock::now().time_since_epoch()
    );
}

int main()
{
    int count = 0;
    milliseconds millis = getMS();
    for(int i = 0; i < 10; ++i)
    {
        printf("On step %d\n", i);
        for(int j = 0; j < 2000000000; ++j)
        {
            ++count;
        }
    }
    milliseconds time = getMS() - millis;
    printf("Total time took: %" PRId64 " to get at count: %d\n", time.count(), count);
}
Without optimizations, this compiles to the following (compiler: x86-64 clang (trunk, probably 6.0.0), flags: -std=c++0x -O0):
main: # @main
push rbp
mov rbp, rsp
sub rsp, 48
mov dword ptr [rbp - 4], 0
mov dword ptr [rbp - 8], 0
call getMS()
mov qword ptr [rbp - 16], rax
mov dword ptr [rbp - 20], 0
.LBB3_1: # =>This Loop Header: Depth=1
cmp dword ptr [rbp - 20], 10
jge .LBB3_8
mov esi, dword ptr [rbp - 20]
movabs rdi, offset .L.str
mov al, 0
call printf
mov dword ptr [rbp - 24], 0
mov dword ptr [rbp - 44], eax # 4-byte Spill
.LBB3_3: # Parent Loop BB3_1 Depth=1
cmp dword ptr [rbp - 24], 2000000000
jge .LBB3_6
mov eax, dword ptr [rbp - 8]
add eax, 1
mov dword ptr [rbp - 8], eax
mov eax, dword ptr [rbp - 24]
add eax, 1
mov dword ptr [rbp - 24], eax
jmp .LBB3_3
.LBB3_6: # in Loop: Header=BB3_1 Depth=1
jmp .LBB3_7
.LBB3_7: # in Loop: Header=BB3_1 Depth=1
mov eax, dword ptr [rbp - 20]
add eax, 1
mov dword ptr [rbp - 20], eax
jmp .LBB3_1
.LBB3_8:
call getMS()
mov qword ptr [rbp - 40], rax
lea rdi, [rbp - 40]
lea rsi, [rbp - 16]
call std::common_type<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000l>, long, std::ratio<1l, 1000l> >(std::chrono::duration<long, std::ratio<1l, 1000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&)
mov qword ptr [rbp - 32], rax
lea rdi, [rbp - 32]
call std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const
mov edx, dword ptr [rbp - 8]
movabs rdi, offset .L.str.1
mov rsi, rax
mov al, 0
call printf
mov edx, dword ptr [rbp - 4]
mov dword ptr [rbp - 48], eax # 4-byte Spill
mov eax, edx
add rsp, 48
pop rbp
ret
.L.str:
.asciz "On step %d\n"
.L.str.1:
.asciz "Total time took: %ld to get at count: %d\n"
There is actually more code than this, but the rest is just the chrono implementation; in the optimized code it becomes nothing more than a library call. I also removed the implementation of getMS, since it is mostly a wrapper.
With O1 (size) optimizations, the result turns into:
main: # @main
push rbx
sub rsp, 32
call getMS()
mov qword ptr [rsp + 24], rax
xor ebx, ebx
.LBB3_1: # =>This Inner Loop Header: Depth=1
mov edi, offset .L.str
xor eax, eax
mov esi, ebx
call printf
add ebx, 1
cmp ebx, 10
jne .LBB3_1
call getMS()
mov qword ptr [rsp + 8], rax
lea rdi, [rsp + 8]
lea rsi, [rsp + 24]
call std::common_type<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000l>, long, std::ratio<1l, 1000l> >(std::chrono::duration<long, std::ratio<1l, 1000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&)
mov qword ptr [rsp + 16], rax
lea rdi, [rsp + 16]
call std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const
mov rcx, rax
mov edi, offset .L.str.1
mov edx, -1474836480
xor eax, eax
mov rsi, rcx
call printf
xor eax, eax
add rsp, 32
pop rbx
ret
.L.str:
.asciz "On step %d\n"
.L.str.1:
.asciz "Total time took: %ld to get at count: %d\n"
The O2 (speed) and O3 (max) optimizations essentially boil down to an unrolled outer loop (which only exists because of the print statement) and a precomputed count value.
This mostly shows the terrible code that Go generates and some of the optimization that happens in C++. None of it, however, shows exactly what the Java bytecode contains, or what the JIT will turn it into if it runs enough times. Here is the Java bytecode:
public static void countToTwentyBillion();
  Code:
     0: lconst_0
     1: lstore_0
     2: invokestatic  #2   // Method java/lang/System.currentTimeMillis:()J
     5: lstore_2
     6: iconst_0
     7: istore        4
     9: iload         4
    11: bipush        10
    13: if_icmpge     68
    16: getstatic     #3   // Field java/lang/System.out:Ljava/io/PrintStream;
    19: new           #4   // class java/lang/StringBuilder
    22: dup
    23: invokespecial #5   // Method java/lang/StringBuilder.'<init>':()V
    26: ldc           #6   // String On step
    28: invokevirtual #7   // Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
    31: iload         4
    33: invokevirtual #8   // Method java/lang/StringBuilder.append:(I)Ljava/lang/StringBuilder;
    36: invokevirtual #9   // Method java/lang/StringBuilder.toString:()Ljava/lang/String;
    39: invokevirtual #10  // Method java/io/PrintStream.println:(Ljava/lang/String;)V
    42: iconst_0
    43: istore        5
    45: iload         5
    47: ldc           #11  // int 2000000000
    49: if_icmpge     62
    52: lload_0
    53: lconst_1
    54: ladd
    55: lstore_0
    56: iinc          5, 1
    59: goto          45
    62: iinc          4, 1
    65: goto          9
    68: invokestatic  #2   // Method java/lang/System.currentTimeMillis:()J
    71: lstore        4
    73: getstatic     #3   // Field java/lang/System.out:Ljava/io/PrintStream;
    76: new           #4   // class java/lang/StringBuilder
    79: dup
    80: invokespecial #5   // Method java/lang/StringBuilder.'<init>':()V
    83: ldc           #12  // String Total time took:
    85: invokevirtual #7   // Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
    88: lload         4
    90: lload_2
    91: lsub
    92: invokevirtual #13  // Method java/lang/StringBuilder.append:(J)Ljava/lang/StringBuilder;
    95: ldc           #14  // String ms to get at count:
    97: invokevirtual #7   // Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
   100: lload_0
   101: invokevirtual #13  // Method java/lang/StringBuilder.append:(J)Ljava/lang/StringBuilder;
   104: invokevirtual #9   // Method java/lang/StringBuilder.toString:()Ljava/lang/String;
   107: invokevirtual #10  // Method java/io/PrintStream.println:(Ljava/lang/String;)V
   110: return
Unfortunately, I do not feel like building hsdis and dumping the JIT'ed code right now (e.g. by running with -XX:+UnlockDiagnosticVMOptions -XX:+PrintAssembly once hsdis is in place), but it would most likely end up looking like one of the C++ examples. As far as I understand, the JIT can probably precompute the count value, but this code is a bit more complex (in terms of its loops), which might make a quick JIT optimization harder.
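To make concrete what precomputing the count would amount to (my illustration, not a compiler dump): an optimizer that can prove the loop bodies have no other side effects may replace the entire loop nest with a constant, since 10 * 2,000,000,000 = 20,000,000,000. Sketched in Go, the reduced program is just:

package main

import "fmt"

func main() {
    // What full loop elimination leaves behind: both loops collapse
    // into a single constant.
    const count int64 = 10 * 2000000000 // 20,000,000,000
    fmt.Printf("count: %d\n", count)
}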