Question

为什么 store_idx_x86() 的汇编输出与 store_idx() 的完全相同，而 load_idx_x86() 的汇编输出又与 load_idx() 的完全相同？

我的理解是__atomic_load_n（）将刷新核心的失效队列，而__atomic_store_n（）将刷新核心的存储缓冲区。

注：我使用的编译器是 gcc (GCC) 4.8.2 20140120 (Red Hat 4.8.2-16)

更新：据我所知，x86 永远不会将存储（store）与其他存储重排，也不会将加载（load）与其他加载重排。那么，是 gcc 足够聪明、只在确实需要时才生成 sfence 和 lfence，还是说使用 __atomic_ 内建函数本来就应该产生栅栏指令（假设所用的内存模型比 __ATOMIC_RELAXED 更严格）？

代码

#include <stdint.h>


/*
 * Store idx to *dest with an ordinary (non-atomic) C assignment.
 * Serves as the baseline for comparison with store_idx(), which performs
 * the same store through __atomic_store_n.
 *
 * NOTE(review): a plain `inline` definition only produces an external symbol
 * under GNU89 inline semantics; under C99/C11 it would presumably need
 * `static` or `extern` to link -- confirm against the build flags.
 */
inline void store_idx_x86(uint64_t* dest, uint64_t idx)
{   
    *dest = idx;    
}

/*
 * Store idx to *dest using the GCC builtin __atomic_store_n with
 * __ATOMIC_RELEASE memory order.
 */
inline void store_idx(uint64_t* dest, uint64_t idx)
{
    __atomic_store_n(dest, idx, __ATOMIC_RELEASE);
}

/*
 * Return *source using an ordinary (non-atomic) C dereference.
 * Serves as the baseline for comparison with load_idx(), which performs
 * the same load through __atomic_load_n.
 */
inline uint64_t load_idx_x86(uint64_t* source)
{
    return *source;

}

/*
 * Return *source using the GCC builtin __atomic_load_n with
 * __ATOMIC_ACQUIRE memory order.
 */
inline uint64_t load_idx(uint64_t* source)
{
    return __atomic_load_n(source, __ATOMIC_ACQUIRE);
}

汇编：

# Compiler-generated listing for util.c: GAS, AT&T syntax (op src, dst), x86-64.
# Arguments arrive in %rdi/%rsi, consistent with the System V AMD64 ABI.
# Every argument is spilled to the stack and reloaded, which looks like
# unoptimized output -- TODO confirm the optimization level used.
.file   "util.c"
    .text
    .globl  store_idx_x86
    .type   store_idx_x86, @function
#-----------------------------------------------------------------------
# void store_idx_x86(uint64_t *dest, uint64_t idx)
# In:    %rdi = dest, %rsi = idx
# Out:   none
# Clobb: %rax, %rdx
# Stack: frame pointer pushed; two 8-byte spill slots at -8/-16(%rbp).
#        %rsp is never lowered, so the slots lie below %rsp (red zone
#        of a leaf function).
#-----------------------------------------------------------------------
store_idx_x86:
.LFB0:
    .cfi_startproc
    pushq   %rbp                    # save caller's frame pointer
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp              # set up this function's frame pointer
    .cfi_def_cfa_register 6
    movq    %rdi, -8(%rbp)          # spill dest
    movq    %rsi, -16(%rbp)         # spill idx
    movq    -8(%rbp), %rax          # rax = dest
    movq    -16(%rbp), %rdx         # rdx = idx
    movq    %rdx, (%rax)            # *dest = idx (the only store to caller memory)
    popq    %rbp                    # restore caller's frame pointer
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE0:
    .size   store_idx_x86, .-store_idx_x86
    .globl  store_idx
    .type   store_idx, @function
#-----------------------------------------------------------------------
# void store_idx(uint64_t *dest, uint64_t idx)
# In:    %rdi = dest, %rsi = idx
# Out:   none
# Clobb: %rax, %rdx
# Note:  the body contains no fence and no lock-prefixed instruction; the
#        store to *dest is a single plain movq.
#-----------------------------------------------------------------------
store_idx:
.LFB1:
    .cfi_startproc
    pushq   %rbp                    # save caller's frame pointer
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp              # set up this function's frame pointer
    .cfi_def_cfa_register 6
    movq    %rdi, -8(%rbp)          # spill dest (slot is below %rsp)
    movq    %rsi, -16(%rbp)         # spill idx
    movq    -8(%rbp), %rax          # rax = dest
    movq    -16(%rbp), %rdx         # rdx = idx
    movq    %rdx, (%rax)            # *dest = idx, emitted as an ordinary 64-bit store
    popq    %rbp                    # restore caller's frame pointer
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE1:
    .size   store_idx, .-store_idx
    .globl  load_idx_x86
    .type   load_idx_x86, @function
#-----------------------------------------------------------------------
# uint64_t load_idx_x86(uint64_t *source)
# In:    %rdi = source
# Out:   %rax = *source
# Clobb: %rax
# Stack: frame pointer pushed; one 8-byte spill slot at -8(%rbp), below
#        %rsp since %rsp is never lowered.
#-----------------------------------------------------------------------
load_idx_x86:
.LFB2:
    .cfi_startproc
    pushq   %rbp                    # save caller's frame pointer
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp              # set up this function's frame pointer
    .cfi_def_cfa_register 6
    movq    %rdi, -8(%rbp)          # spill source
    movq    -8(%rbp), %rax          # rax = source
    movq    (%rax), %rax            # rax = *source (return value)
    popq    %rbp                    # restore caller's frame pointer
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE2:
    .size   load_idx_x86, .-load_idx_x86
    .globl  load_idx
    .type   load_idx, @function
#-----------------------------------------------------------------------
# uint64_t load_idx(uint64_t *source)
# In:    %rdi = source
# Out:   %rax = *source
# Clobb: %rax
# Note:  the body contains no fence and no lock-prefixed instruction; the
#        load from *source is a single plain movq.
#-----------------------------------------------------------------------
load_idx:
.LFB3:
    .cfi_startproc
    pushq   %rbp                    # save caller's frame pointer
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp              # set up this function's frame pointer
    .cfi_def_cfa_register 6
    movq    %rdi, -8(%rbp)          # spill source (slot is below %rsp)
    movq    -8(%rbp), %rax          # rax = source
    movq    (%rax), %rax            # rax = *source, emitted as an ordinary 64-bit load
    popq    %rbp                    # restore caller's frame pointer
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE3:
    .size   load_idx, .-load_idx
    # Compiler identification string recorded in the object file.
    .ident  "GCC: (GNU) 4.8.2 20140120 (Red Hat 4.8.2-16)"
    # Empty .note.GNU-stack section (no flags, no contents).
    .section    .note.GNU-stack,"",@progbits

Answer 1

为什么 store_idx_x86() 的汇编输出与 store_idx() 的完全相同，而 load_idx_x86() 的汇编输出又与 load_idx() 的完全相同？

在 x86 上，只要编译器保证了对齐，它们就是相同的操作。对已对齐的地址进行大小不超过机器原生字长的加载和存储，保证是原子的。参考 Intel manual vol 3A, 8.1.1：

奔腾处理器（以及更新的处理器）保证了以下额外的内存操作将始终以原子方式执行：读取或写入在64位边界上对齐的四字[...]

此外，x86 采用强顺序（strongly-ordered）内存模型，这意味着每次存储都隐式带有释放（release）语义，每次加载都隐式带有获取（acquire）语义。

最后，只有在使用英特尔的非临时（non-temporal）SSE 指令（great reference here），或者需要建立 StoreLoad 栅栏（article here）时，才需要用到你提到的那些栅栏指令（而后一种情况实际使用的是 mfence 或带 lock 前缀的指令）。

题外话：我对英特尔手册中的这一说法感到好奇，于是写了一个测试程序（devised a test program）。令人沮丧的是，在我的电脑（双核 i3-4030U）上，它给出了如下输出：

unaligned
4265292 / 303932066 | 1.40337%
unaligned, but in same cache line
2373 / 246957659 | 0.000960893%
aligned (8 byte)
0 / 247097496 | 0%

这似乎与英特尔的说法相矛盾，我会进一步调查。在此期间，你可以克隆这个演示程序，看看它在你的机器上给出什么结果。在 Linux 上编译时只需加上 -std=c++11 ... -pthread。

原子加载和存储函数产生与非原子加载和存储相同的汇编代码

1 个答案: