Question

我决定用JMH来测量不同锁定策略下自增操作的性能。我用JMH检查吞吐量和平均时间，并用一个简单的自定义测试检查正确性。共有七种策略：

原子计数（Atomic）
读写锁（ReadWrite）计数
使用volatile的同步
不使用volatile的同步块
sun.misc.Unsafe.compareAndSwap
sun.misc.Unsafe.getAndAdd
未同步计数

基准代码：

/**
 * JMH benchmark that measures a single {@code increment()} call on seven
 * different {@code Counter} implementations.
 *
 * <p>Configuration carried by the annotations below: one state instance with
 * {@code Scope.Benchmark}, throughput and average-time modes, results reported
 * in microseconds, one fork, five warmup and five measurement iterations.
 */
@State(Scope.Benchmark)
@BenchmarkMode({Mode.Throughput, Mode.AverageTime})
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@Fork(1)
@Warmup(iterations = 5)
@Measurement(iterations = 5)
public class UnsafeCounter_Benchmark {
    // One counter per strategy under test; all are (re)assigned in prepare().
    public Counter unsync, syncNoV, syncV, lock, atomic, unsafe, unsafeGA;

    /**
     * Replaces every counter with a fresh instance. Declared with
     * {@code Level.Iteration}, so JMH invokes it around each iteration rather
     * than once per benchmark or once per invocation.
     */
    @Setup(Level.Iteration)
    public void prepare() {
        unsync = new UnsyncCounter();
        syncNoV = new SyncNoVolatileCounter();
        syncV = new SyncVolatileCounter();
        lock = new LockCounter();
        atomic = new AtomicCounter();
        unsafe = new UnsafeCASCounter();
        unsafeGA = new UnsafeGACounter();
    }

    /** Benchmark entry point for the unsynchronized counter; delegates to {@link #unsyncCounter()}. */
    @Benchmark
    public void unsyncCount() {
        unsyncCounter();
    }

    /** Performs one increment on {@code unsync}; marked DONT_INLINE for the JIT compiler. */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void unsyncCounter() {
        unsync.increment();
    }

    /** Benchmark entry point for the synchronized, non-volatile counter; delegates to {@link #syncNoVCounter()}. */
    @Benchmark
    public void syncNoVCount() {
        syncNoVCounter();
    }

    /** Performs one increment on {@code syncNoV}; marked DONT_INLINE for the JIT compiler. */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void syncNoVCounter() {
        syncNoV.increment();
    }

    /** Benchmark entry point for the synchronized, volatile counter; delegates to {@link #syncVCounter()}. */
    @Benchmark
    public void syncVCount() {
        syncVCounter();
    }

    /** Performs one increment on {@code syncV}; marked DONT_INLINE for the JIT compiler. */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void syncVCounter() {
        syncV.increment();
    }

    /** Benchmark entry point for the lock-based counter; delegates to {@link #lockCounter()}. */
    @Benchmark
    public void lockCount() {
        lockCounter();
    }

    /** Performs one increment on {@code lock}; marked DONT_INLINE for the JIT compiler. */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void lockCounter() {
        lock.increment();
    }

    /** Benchmark entry point for the atomic counter; delegates to {@link #atomicCounter()}. */
    @Benchmark
    public void atomicCount() {
        atomicCounter();
    }

    /** Performs one increment on {@code atomic}; marked DONT_INLINE for the JIT compiler. */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void atomicCounter() {
        atomic.increment();
    }

    /** Benchmark entry point for the Unsafe compare-and-swap counter; delegates to {@link #unsafeCounter()}. */
    @Benchmark
    public void unsafeCount() {
        unsafeCounter();
    }

    /** Performs one increment on {@code unsafe}; marked DONT_INLINE for the JIT compiler. */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void unsafeCounter() {
        unsafe.increment();
    }

    /** Benchmark entry point for the Unsafe get-and-add counter; delegates to {@link #unsafeGACounter()}. */
    @Benchmark
    public void unsafeGACount() {
        unsafeGACounter();
    }

    /** Performs one increment on {@code unsafeGA}; marked DONT_INLINE for the JIT compiler. */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void unsafeGACounter() {
        unsafeGA.increment();
    }

    /**
     * Launches the JMH runner for the benchmarks in this class.
     *
     * @param args command-line arguments; not read by this method
     * @throws RunnerException propagated from {@code Runner.run()}
     */
    public static void main(String[] args) throws RunnerException {
        // Select benchmarks by this class's simple name, run them with 100
        // threads, and pass "-ea" as a JVM argument.
        Options baseOpts = new OptionsBuilder()
                .include(UnsafeCounter_Benchmark.class.getSimpleName())
                .threads(100)
                .jvmArgs("-ea")
                .build();

        new Runner(baseOpts).run();
    }
}

基准测试结果：

JDK 8u20

Benchmark                                         Mode  Samples   Score    Error   Units
o.k.u.u.UnsafeCounter_Benchmark.atomicCount      thrpt        5  42.178 ± 17.643  ops/us
o.k.u.u.UnsafeCounter_Benchmark.lockCount        thrpt        5  24.044 ±  2.264  ops/us
o.k.u.u.UnsafeCounter_Benchmark.syncNoVCount     thrpt        5  22.849 ±  1.344  ops/us
o.k.u.u.UnsafeCounter_Benchmark.syncVCount       thrpt        5  20.235 ±  2.027  ops/us
o.k.u.u.UnsafeCounter_Benchmark.unsafeCount      thrpt        5  12.460 ±  1.326  ops/us
o.k.u.u.UnsafeCounter_Benchmark.unsafeGACount    thrpt        5  39.106 ±  2.966  ops/us
o.k.u.u.UnsafeCounter_Benchmark.unsyncCount      thrpt        5  93.076 ±  9.674  ops/us
o.k.u.u.UnsafeCounter_Benchmark.atomicCount       avgt        5   2.604 ±  0.133   us/op
o.k.u.u.UnsafeCounter_Benchmark.lockCount         avgt        5   4.161 ±  0.546   us/op
o.k.u.u.UnsafeCounter_Benchmark.syncNoVCount      avgt        5   4.440 ±  0.523   us/op
o.k.u.u.UnsafeCounter_Benchmark.syncVCount        avgt        5   5.073 ±  0.439   us/op
o.k.u.u.UnsafeCounter_Benchmark.unsafeCount       avgt        5   9.088 ±  5.964   us/op
o.k.u.u.UnsafeCounter_Benchmark.unsafeGACount     avgt        5   2.611 ±  0.164   us/op
o.k.u.u.UnsafeCounter_Benchmark.unsyncCount       avgt        5   1.047 ±  0.050   us/op

大部分测量结果都符合我的预期，只有UnsafeCounter_Benchmark.unsafeCount例外：它在while循环中使用sun.misc.Unsafe.compareAndSwapLong，却是最慢的锁定策略。

/**
 * Adds one to {@code counter} with a compare-and-swap retry loop.
 */
public void increment() {
    // Snapshot the current value; it is the "expected" argument of the CAS.
    long before = counter;
    // Attempt to store before + 1 at `offset` within this object. A false
    // return means the CAS did not succeed, so the loop retries.
    // NOTE(review): `offset` is presumably the field offset of `counter` — confirm in the enclosing class.
    while (!unsafe.compareAndSwapLong(this, offset, before, before + 1L)) {
        // Refresh the snapshot before the next attempt.
        before = counter;
    }
}

我认为低性能是因为while循环和JMH产生更高的争用，但是当我通过Executors检查了正确性时，我得到了我期望的数字：

Counter result: UnsyncCounter 97538676
Time passed in ms:259
Counter result: AtomicCounter 100000000
Time passed in ms:1805
Counter result: LockCounter 100000000
Time passed in ms:3904
Counter result: SyncNoVolatileCounter 100000000
Time passed in ms:14227
Counter result: SyncVolatileCounter 100000000
Time passed in ms:19224
Counter result: UnsafeCASCounter 100000000
Time passed in ms:8077
Counter result: UnsafeGACounter 100000000
Time passed in ms:2549

正确性测试代码：

/**
 * Crude multi-threaded correctness check for {@code Counter} implementations.
 *
 * <p>For each counter, {@link #NUM_OF_THREADS} tasks are submitted to a fixed
 * thread pool of the same size; each task calls {@code increment()}
 * {@link #NUM_OF_INCREMENTS} times. The final counter value and the elapsed
 * wall-clock time are printed to standard output.
 */
public class UnsafeCounter_Test {
    /** Pool size, and also the number of tasks submitted per counter. */
    private static final int NUM_OF_THREADS = 1000;

    /** Number of {@code increment()} calls performed by each task. */
    private static final int NUM_OF_INCREMENTS = 100000;

    /** Task that increments a shared counter a fixed number of times. */
    static class CounterClient implements Runnable {
        private final Counter c;
        private final int num;

        /**
         * @param c   counter to increment
         * @param num how many times {@link #run()} increments it
         */
        public CounterClient(Counter c, int num) {
            this.c = c;
            this.num = num;
        }

        @Override
        public void run() {
            for (int i = 0; i < num; i++) {
                c.increment();
            }
        }
    }

    /**
     * Hammers {@code counter} from {@link #NUM_OF_THREADS} concurrent tasks and
     * prints the resulting value and the elapsed time in milliseconds.
     *
     * <p>If the tasks do not finish within one minute, a warning is written to
     * standard error, because the printed value and time are then taken while
     * tasks are still running.
     *
     * @param counter counter implementation under test
     * @throws InterruptedException if interrupted while waiting for the tasks
     */
    public static void makeTest(Counter counter) throws InterruptedException {
        ExecutorService service = Executors.newFixedThreadPool(NUM_OF_THREADS);
        // nanoTime is monotonic; currentTimeMillis can jump with wall-clock adjustments.
        long before = System.nanoTime();
        for (int i = 0; i < NUM_OF_THREADS; i++) {
            service.submit(new CounterClient(counter, NUM_OF_INCREMENTS));
        }
        service.shutdown();
        // awaitTermination returns false on timeout; do not report that case
        // as if it were a completed run.
        boolean finished = service.awaitTermination(1, TimeUnit.MINUTES);
        long after = System.nanoTime();
        if (!finished) {
            System.err.println("Warning: " + counter.getClass().getSimpleName()
                    + " did not finish within 1 minute; the result below is incomplete");
        }
        System.out.println("Counter result: " + counter.getClass().getSimpleName() + " " + counter.getCounter());
        System.out.println("Time passed in ms:" + TimeUnit.NANOSECONDS.toMillis(after - before));
    }

    /**
     * Runs the check once for every counter implementation.
     *
     * @param args command-line arguments; not read by this method
     * @throws InterruptedException if interrupted while waiting for a run
     */
    public static void main(String[] args) throws InterruptedException {
        makeTest(new UnsyncCounter());
        makeTest(new AtomicCounter());
        makeTest(new LockCounter());
        makeTest(new SyncNoVolatileCounter());
        makeTest(new SyncVolatileCounter());
        makeTest(new UnsafeCASCounter());
        makeTest(new UnsafeGACounter());
    }
}

我知道这是非常糟糕的测试，但在这种情况下，Unsafe CAS比Sync变体快两倍，一切都符合预期。有人可以解释上面描述的行为吗？有关详细信息，请参阅GitHub仓库：Bench，Unsafe CAS counter

Answer 1

大声思考：人们常常做90％的繁琐工作，并为其他人留下10％（乐趣开始的地方）！好的，我带走了所有的乐趣！

让我先在i7-4790K，8u40 EA上重复实验：

Benchmark                                 Mode  Samples    Score    Error   Units
UnsafeCounter_Benchmark.atomicCount      thrpt        5   47.669 ± 18.440  ops/us
UnsafeCounter_Benchmark.lockCount        thrpt        5   14.497 ±  7.815  ops/us
UnsafeCounter_Benchmark.syncNoVCount     thrpt        5   11.618 ±  2.130  ops/us
UnsafeCounter_Benchmark.syncVCount       thrpt        5   11.337 ±  4.532  ops/us
UnsafeCounter_Benchmark.unsafeCount      thrpt        5    7.452 ±  1.042  ops/us
UnsafeCounter_Benchmark.unsafeGACount    thrpt        5   43.332 ±  3.435  ops/us
UnsafeCounter_Benchmark.unsyncCount      thrpt        5  102.773 ± 11.943  ops/us

的确，unsafeCount测试看起来有点可疑。实际上，在验证之前，您必须假设所有数据都是可疑的。对于nanobenchmarks，您必须检查生成的代码，确认您实际测量的正是想要测量的内容。在JMH中，用-prof perfasm可以很快做到这一点。事实上，如果你看一下unsafeCount最热的代码区域，你会发现一些有趣的事情：

  0.12%    0.04%    0x00007fb45518e7d1: mov    0x10(%r10),%rax    
 17.03%   23.44%    0x00007fb45518e7d5: test   %eax,0x17318825(%rip)
  0.21%    0.07%    0x00007fb45518e7db: mov    0x18(%r10),%r11    ; getfield offset
 30.33%   10.77%    0x00007fb45518e7df: mov    %rax,%r8
  0.00%             0x00007fb45518e7e2: add    $0x1,%r8           
  0.01%             0x00007fb45518e7e6: cmp    0xc(%r10),%r12d    ; typecheck 
                    0x00007fb45518e7ea: je     0x00007fb45518e80b ; bail to v-call
  0.83%    0.48%    0x00007fb45518e7ec: lock cmpxchg %r8,(%r10,%r11,1)
 33.27%   25.52%    0x00007fb45518e7f2: sete   %r8b
  0.12%    0.01%    0x00007fb45518e7f6: movzbl %r8b,%r8d          
  0.03%    0.04%    0x00007fb45518e7fa: test   %r8d,%r8d
                    0x00007fb45518e7fd: je     0x00007fb45518e7d1 ; back branch

解读：a）offset字段在每次迭代时都被重新读取——因为CAS的内存效应隐含了volatile读，所以这些字段必须被悲观地重新读取；b）有趣的是，出于同样的原因，unsafe字段也被重新读取以进行类型检查。

这就是为什么高性能代码应如下所示：

--- a/utils bench/src/main/java/org/kirmit/utils/unsafe/concurrency/UnsafeCASCounter.java
+++ b/utils bench/src/main/java/org/kirmit/utils/unsafe/concurrency/UnsafeCASCounter.java
@@ -5,13 +5,13 @@ import sun.misc.Unsafe;
 public class UnsafeCASCounter implements Counter {
     private volatile long counter = 0;
-    private final Unsafe unsafe = UnsafeHelper.unsafe;
-    private long offset;
-    {
+    private static final Unsafe unsafe = UnsafeHelper.unsafe;
+    private static final long offset;
+    static {
         try {
             offset = unsafe.objectFieldOffset(UnsafeCASCounter.class.getDeclaredField("counter"));
         } catch (NoSuchFieldException e) {
-            e.printStackTrace();
+            throw new IllegalStateException("Whoops!");
         }
     }

如果你这样做，unsafeCount性能会提升：

Benchmark                             Mode  Samples  Score   Error   Units
UnsafeCounter_Benchmark.unsafeCount  thrpt        5  9.733 ± 0.673  ops/us

...考虑到错误界限，它现在非常接近同步测试。如果您现在查看-prof perfasm，这是一个unsafeCount循环：

  0.08%    0.02%    0x00007f7575191900: mov    0x10(%r10),%rax
 28.09%   28.64%    0x00007f7575191904: test   %eax,0x161286f6(%rip)
  0.23%    0.08%    0x00007f757519190a: mov    %rax,%r11
                    0x00007f757519190d: add    $0x1,%r11
                    0x00007f7575191911: lock cmpxchg %r11,0x10(%r10)
 47.27%   23.48%    0x00007f7575191917: sete   %r8b
  0.10%             0x00007f757519191b: movzbl %r8b,%r8d
  0.02%             0x00007f757519191f: test   %r8d,%r8d
                    0x00007f7575191922: je     0x00007f7575191900

这个循环非常紧凑，似乎没有什么办法能让它更快。我们把大部分时间花在加载"更新后的"值以及实际对它执行CAS上。但我们的争用非常严重！为了确定争用是否是主要原因，让我们添加退避：

--- a/utils bench/src/main/java/org/kirmit/utils/unsafe/concurrency/UnsafeCASCounter.java
+++ b/utils bench/src/main/java/org/kirmit/utils/unsafe/concurrency/UnsafeCASCounter.java
@@ -20,6 +21,7 @@ public class UnsafeCASCounter implements Counter {
         long before = counter;
         while (!unsafe.compareAndSwapLong(this, offset, before, before + 1L)) {
             before = counter;
+            Blackhole.consumeCPU(1000);
         }
     }

...运行：

Benchmark                             Mode  Samples   Score     Error   Units
UnsafeCounter_Benchmark.unsafeCount  thrpt        5  99.869 ± 107.933  ops/us

瞧。我们在循环中做了更多的工作，但它使我们避免了大量争用。我以前在《Nanotrusting the Nanotime》中试着解释过这个问题，去那里多读一些基准测试方法论可能会有帮助，特别是在测量重量级操作时。这暴露了整个实验中的陷阱，而不仅仅是unsafeCount的问题。

为OP和感兴趣的读者练习：解释为什么unsafeGACount和atomicCount的执行速度比其他测试快得多。你现在有了这些工具。

P.S。在具有C（C
P.P.S。时间检查：10分钟进行分析和其他实验，20分钟即可完成。你浪费了多少时间手工复制结果？ ;）

通过JMH测量sun.misc.Unsafe.compareAndSwap中的奇怪行为

1 个答案: