Intel-64和ia32原子操作获取释放语义和GCC 5 +

时间:2018-06-02 14:14:46

标签: linux gcc x86-64 atomic futex

我正在研究 Haswell CPU(一颗 4 核/8 线程、2.3–3.9 GHz 的 i7-4790M)上的 Intel CPU 原子操作功能,我发现按照 GCC 手册所建议的方式构建(例如)可靠的 mutex_lock() 和 mutex_unlock() 操作真的很难:

6.53 x86特定于事务性内存的内存模型扩展

x86 架构支持额外的内存排序标志,用于标记硬件锁省略(HLE)的临界区。这些标志必须在原子内置函数现有内存模型之外额外指定。

 __ATOMIC_HLE_ACQUIRE
 Start lock elision on a lock variable.  Memory model must be
 '__ATOMIC_ACQUIRE' or stronger.
 __ATOMIC_HLE_RELEASE
 End lock elision on a lock variable.  Memory model must be
 '__ATOMIC_RELEASE' or stronger.

当锁获取失败时,为了获得良好的性能,需要尽快中止事务。这可以通过 '_mm_pause' 来完成。

 #include <immintrin.h> // For _mm_pause

 int lockvar;

 /* Acquire lock with lock elision */
 while (__atomic_exchange_n(&lockvar, 1, __ATOMIC_ACQUIRE|__ATOMIC_HLE_ACQUIRE))
     _mm_pause(); /* Abort failed transaction */
 /* Free lock with lock elision */
 __atomic_store_n(&lockvar, 0, __ATOMIC_RELEASE|__ATOMIC_HLE_RELEASE);

所以,阅读了上述内容以及英特尔软件开发人员手册第 3 卷第 8.1 节"锁定原子操作",特别是 8.1.4 节"LOCK 操作对内部处理器缓存的影响"之后,我首先这样实现了我的测试用 mutex_lock() / mutex_unlock():

/* ... (code preceding this function is elided in the original post) ... */

/**
 * Report whether the CPU advertises the CLFLUSHOPT instruction.
 *
 * Executes CPUID with EAX=7, ECX=0 and tests bit 23 of the returned EBX
 * (the CLFLUSHOPT feature flag according to the Intel SDM CPUID table).
 * Despite the "ia64" in the name, this is x86 / x86-64 code.
 *
 * @return true if EBX bit 23 is set, false otherwise.
 */
static inline __attribute__((always_inline,const))
bool ia64_has_clflush(void)
{
    register unsigned int ebx = 0;

    /*
     * CPUID overwrites EAX, EBX, ECX and EDX, so all four registers must be
     * listed as clobbered; the original omitted EDX, which allowed the
     * compiler to keep a live value there across the instruction.
     */
    __asm__ __volatile__
    ( "MOV $7, %%eax\n\t"
      "MOV $0, %%ecx\n\t"
      "CPUID\n\t"
      "MOV %%ebx, %0\n\t"
      : "=r" (ebx)
      :
      : "%eax", "%ecx", "%edx", "%ebx"
    );

    return (ebx & (1U << 23)) != 0;
}


static bool has_clflush=false;
void init_has_clflush(void)
{ has_clflush = ia64_has_clflush();
void init_has_clflush(void) __attribute__((constructor));

static inline __attribute__((always_inline))
void mutex_lock( register _Atomic int *ua )
{ // the SDM states that memory to be used as semaphores
  // should not be in the WB cache memory, but nearest we
  // can get to uncached memory is to explicitly un-cache it:
    asm volatile
    ( "CLFLUSHOPT (%0)"
      :: "r" (ua)
    // why isn't the cache flush enough?
      asm volatile
      ( "LFENCE" :: );
      register unsigned int x;
      x = __atomic_sub_fetch( ua, 1, _ACQ_SEQ_CST_);
      asm volatile
      ( "CLFLUSHOPT (%0)"
       :: "r" (ua)
      asm volatile
      ( "SFENCE" :: );
  while((x = __atomic_load_n(ua,_LD_SEQ_CST_)) != 0)
    switch(syscall( SYS_futex, ua, FUTEX_WAIT, x, nullptr,nullptr,0))
    {case 0:
     case -1:
      switch( errno )
      { case EINTR:
        case EAGAIN:
         fprintf(stderr,"Unexpected futex error: %d : '%s'.", errno,   

  static inline __attribute__((always_inline))
  void mutex_unlock( register _Atomic int *ua )
  { if(has_clflush)
      asm volatile
      ( "CLFLUSHOPT (%0)"
      :: "r" (ua)
      asm volatile( "LFENCE" :: );
    register unsigned int x;
    x = __atomic_add_fetch( ua, 1, _REL_SEQ_CST_);
      asm volatile
      ( "CLFLUSHOPT (%0)"
        :: "r" (ua)
      asm volatile ( "SFENCE" :: );
    if(x == 0)
      while( (1 < syscall( SYS_futex, ua, FUTEX_WAKE, 1,
           nullptr,nullptr,0)) && (errno == EINTR));

#define _LD_SEQ_CST_ __ATOMIC_SEQ_CST #define _ST_SEQ_CST_ __ATOMIC_SEQ_CST #define _ACQ_SEQ_CST_ (__ATOMIC_SEQ_CST|__ATOMIC_HLE_ACQUIRE) #define _REL_SEQ_CST_ (__ATOMIC_SEQ_CST|__ATOMIC_HLE_RELEASE) static bool has_clflush=false; static void init_has_clflush(void) { has_clflush = ia64_has_clflush(); } static void init_has_clflush(void) __attribute__((constructor)); static inline __attribute__((always_inline)) void mutex_lock( register _Atomic int *ua ) { // the SDM states that memory to be used as semaphores // should not be in the WB cache memory, but nearest we // can get to uncached memory is to explicitly un-cache it: if(has_clflush) asm volatile ( "CLFLUSHOPT (%0)" :: "r" (ua) ); // why isn't the cache flush enough? else asm volatile ( "LFENCE" :: ); register unsigned int x; x = __atomic_sub_fetch( ua, 1, _ACQ_SEQ_CST_); _mm_pause(); if(has_clflush) asm volatile ( "CLFLUSHOPT (%0)" :: "r" (ua) ); else asm volatile ( "SFENCE" :: ); while((x = __atomic_load_n(ua,_LD_SEQ_CST_)) != 0) switch(syscall( SYS_futex, ua, FUTEX_WAIT, x, nullptr,nullptr,0)) {case 0: break; case -1: switch( errno ) { case EINTR: case EAGAIN: continue; default: fprintf(stderr,"Unexpected futex error: %d : '%s'.", errno, strerror(errno)); return; } } } static inline __attribute__((always_inline)) void mutex_unlock( register _Atomic int *ua ) { if(has_clflush) asm volatile ( "CLFLUSHOPT (%0)" :: "r" (ua) ); else asm volatile( "LFENCE" :: ); register unsigned int x; x = __atomic_add_fetch( ua, 1, _REL_SEQ_CST_); _mm_pause(); if(has_clflush) asm volatile ( "CLFLUSHOPT (%0)" :: "r" (ua) ); else asm volatile ( "SFENCE" :: ); if(x == 0) while( (1 < syscall( SYS_futex, ua, FUTEX_WAKE, 1, nullptr,nullptr,0)) && (errno == EINTR)); }

现在,有趣的是,关键的 mutex_lock() 减法和 mutex_unlock() 加法操作最终被编译成了如下指令:



# 61 "intel_lock1.c" 1
    CLFLUSHOPT (%rbx)
# 0 "" 2
    lock xacquire subl  $1, lck(%rip)
    rep nop
    cmpb    $0, has_clflush(%rip)
    je  .L8
# 72 "intel_lock1.c" 1
    CLFLUSHOPT (%rbx)
# 0 "" 2

但是这个实现似乎需要 LFENCE / SFENCE 才能可靠地运行(仅有 CLFLUSHOPT 是不够的),否则两个线程最终都会在 futex() 中死锁,并且看到的锁值同为 -1。

从英特尔文档中我看不出,两个线程怎么可能在进入如下指令序列之后:

# 98 "intel_lock1.c" 1
    CLFLUSHOPT (%rbx)
# 0 "" 2
    movl    $1, %eax
    lock xacquire xaddl %eax, lck(%rip)
    rep nop
    addl    $1, %eax
    cmpb    $0, has_clflush(%rip)
    je  .L25
# 109 "intel_lock1.c" 1
    CLFLUSHOPT (%rbx)
# 0 "" 2

在 *lck 初值为 0 的情况下,最终都在 *lck 上得到结果 '-1';一个线程当然应该得到 -1,而另一个应该得到 -2 吧?


# %rbx == $lck
lock xacquire subl  $1, lck(%rip)
rep nop

有没有精通 Intel CPU 锁定与缓存的专家可以解释一下:对同一个未缓存位置 *lck 进行的两次原子递减或递增,两者都断言了 #LOCK 总线信号(独占总线访问)并使用了 XACQUIRE,怎么会最终在 *lck 上得到相同的结果?

我以为这正是 #LOCK 前缀(以及 HLE)存在的意义吧?我也试过不使用 HLE,所有访问只使用 __ATOMIC_SEQ_CST(这只会添加 LOCK 前缀,而不添加 XACQUIRE),但没有区别——不加 {L,S}FENCE 仍然会死锁。

我读过 Ulrich Drepper 的优秀论文 [Futexes are Tricky],但他给出的互斥锁实现只向锁内存写入硬编码常量。我明白为什么了:要让一个互斥锁可靠地使用等待者计数,或对锁值做任何形式的算术运算,是非常困难的。有没有人找到过可靠的带锁算术方法,使其结果适合用作 x86_64 Linux 上的锁/信号量值?非常希望能讨论一下……

所以,在 HLE 和 CLFLUSH 上走了一些死胡同之后,我得到的唯一可以工作的加锁/解锁版本使用的是硬编码常量和 __atomic_compare_exchange_n——测试程序(它会不断递增一个(未加锁的)计数器,直到按下 Ctrl+C / 接收到退出信号)的完整源代码位于:



strace: Process 11978 attached with 2 threads
[pid 11979] futex(0x60209c, FUTEX_WAIT, 4294967295, NULL <unfinished ...>
[pid 11978] futex(0x60209c, FUTEX_WAIT, 4294967295, NULL^C



enum LockStatus

static inline __attribute__((always_inline))
bool mutex_lock( register _Atomic int *ua )
{ register int x;
  int cx;
  x  = __atomic_load_n( ua, _LD_SEQ_CST_ );
  cx = x;
  x = (x == UNLOCKED)
  if (! __atomic_compare_exchange_n
      ( ua, &cx, x, false, _ACQ_SEQ_CST_,  _ACQ_SEQ_CST_) )
    goto lock_superceded;
  if( x == LOCKED_ONE_WAITER )
  { do{
    switch(syscall( SYS_futex, ua, FUTEX_WAIT, x, nullptr,nullptr,0))
    {case 0:
     case -1:
      switch( errno )
      { case EINTR:
         return false;
        case EAGAIN:
          fprintf(stderr,"Unexpected futex WAIT error: %d : '%s'.",
                  errno, strerror(errno));
          return false;
    x = __atomic_load_n(ua,_LD_SEQ_CST_);
    } while(x < 0);
  return true;

static inline __attribute__((always_inline))
bool mutex_unlock( register _Atomic int *ua )
{ register int x;
  int cx;
  x  = __atomic_load_n( ua, _LD_SEQ_CST_ );
  cx = x;
  x = (x == LOCKED_ONE_WAITER)
       : UNLOCKED;
  if (! __atomic_compare_exchange_n
       ( ua, &cx, x, false, _ACQ_SEQ_CST_,  _ACQ_SEQ_CST_) )
    goto unlock_superceded;
    if(x == LOCKED_NO_WAITERS)
    { while((1 < 
             syscall( SYS_futex, ua, FUTEX_WAKE, 1, nullptr,nullptr,0))
         ||( UNLOCKED != __atomic_load_n( ua, _LD_SEQ_CST_ ))
         ) // we were a waiter, so wait for locker to unlock !
      { if( errno != 0 )
          {case EINTR:
            return false;
           case EAGAIN:
                  "Unexpected futex WAKE error: %d : '%s'.", 
                  errno, strerror(errno));
            return false;
   return true;

 Build & Test (GCC 7.3.1 & 6.4.1 & 5.4.0) used:
 $ gcc -std=gnu11 -march=x86-64 -mtune=native -D_REENTRANT \
   -pthread -Wall -Wextra -O3 -o intel_lock3 intel_lock3.c

 $ ./intel_lock3
 # wait a couple of seconds and press ^C

几秒钟之内不应该打印 "was locked!";结束时打印的计数应该超过约 5e8(5×10^8),而不是 446。

使用strace运行表明两个线程正在阻塞 等待-1的锁定值变为0:

$ gcc -std=gnu11 -march=x86_64 -mtune=native -O3 -Wall -Wextra 
  -o intel_lock2 intel_lock2.c
$ ./intel_lock2
# wait a couple of seconds and press ^C
$ ./intel_lock2
^Cwas locked!


通常,应该在WAKE之前安排WAIT,但不知何故 GCC正在解释内存排序语义来表示 WAKE总是在任何WAIT之前安排;但即便如此 发生这种情况,代码应该被延迟,并且应该永远不会结束 两个线程在进入futex时获得-1 lck值(... FUTEX_WAIT ..)。

几乎相同的算法使用锁定值算法ALWAYS 当两个线程都得到(-1,-1)时死锁 - 注意,从未看到-2值 任何主题:

$ strace -f -e trace=futex ./intel_lock2
strace: Process 14481 attached
[pid 14480] futex(0x602098, FUTEX_WAIT, 4294967295, NULL <unfinished ...>
[pid 14481] futex(0x602098, FUTEX_WAKE, 1 <unfinished ...>
[pid 14480] <... futex resumed> )       = -1 EAGAIN (Resource temporarily
[pid 14481] <... futex resumed> )       = 0
[pid 14480] futex(0x602098, FUTEX_WAKE, 1 <unfinished ...>
[pid 14481] futex(0x602098, FUTEX_WAIT, 4294967295, NULL <unfinished ...>
[pid 14480] <... futex resumed> )       = 0
[pid 14481] <... futex resumed> )       = -1 EAGAIN (Resource temporarily
[pid 14480] futex(0x602098, FUTEX_WAIT, 4294967295, NULL <unfinished ...>
[pid 14481] futex(0x602098, FUTEX_WAIT, 4294967295, NULL^C <unfinished  
[pid 14480] <... futex resumed> )       = ? ERESTARTSYS (To be restarted 
if SA_RESTART is set)
strace: Process 14480 detached
strace: Process 14481 detached
was locked!

所以,我认为如果算术运算起作用 预期的,即。序列化和原子,然后上面 代码不会死锁;算术应该生成 与使用的LockStatus枚举值相同的数字 工作实例。

但是现在产生的算法出了问题 说明:


static inline __attribute__((always_inline))
bool mutex_lock( register _Atomic volatile int *ua )
{ register int x;
  x = __atomic_add_fetch( ua, -1, _ACQ_SEQ_);
  if( x < 0 )
  { do{
    // here you can put:
    // if( x == -2) { .. NEVER REACHED! }
    switch(syscall( SYS_futex, ua, FUTEX_WAIT, x, nullptr,nullptr,0))
    {case 0:
     case -1:
      switch( errno )
      { case EINTR:
         return false; // interrupted - user wants to exit?
        case EAGAIN:
          fprintf(stderr,"Unexpected futex WAIT error: %d : '%s'.",
                  errno, strerror(errno));
          return false;
    x = __atomic_load_n(ua,_LD_SEQ_);
    } while(x < 0);
  return true;

static inline __attribute__((always_inline))
bool mutex_unlock( register _Atomic volatile int *ua )
{ register int x;
  x = __atomic_add_fetch( ua, 1, _REL_SEQ_);
  if(x == 0) // there was ONE waiter
     while(  (1 < 
             syscall( SYS_futex, ua, FUTEX_WAKE, 1, nullptr,nullptr,0)
           ||(1 < __atomic_load_n(ua, _LD_SEQ_)
             ) // wait for first locker to unlock
     { if( errno != 0 )
         {case EINTR:
           return false;
          case EAGAIN:
           fprintf(stderr,"Unexpected futex WAKE error: %d : '%s'.", 
                  errno, strerror(errno));
           return false;
     return true;


movl    $-1, %eax
lock xaddl  %eax, (%rdx)


AFAICS,没有有效的代码表 在两个线程中获得相同的-1值。

所以我的结论是:对算术指令使用 Intel LOCK 前缀是不安全的,会在用户态的 Linux x86_64 gcc 编译程序中引入错误行为——在使用 gcc 和 Linux 的 Intel Haswell i7-4790M 平台上,只有把常量值从代码段写入数据内存才是原子且顺序一致的;在这些平台上,无论怎样组合使用 HLE / XACQUIRE、LOCK 前缀或 FENCE 指令,算术运算都无法做到原子且顺序一致。

我的预感是分支预测在某种程度上失败了 添加额外的算术运算/无法执行 在此平台上的算术运算,LOCK前缀被断言 和不同物理核心上的多个线程。 因此,所有带LOCK前缀的算术运算都被置位 是可疑的,应该避免。

2 个答案:

答案 0 :(得分:2)


lock subl $1, (%rdi) 和 lock xaddl %eax, (%rdx) 都是 100% 原子的,即使指针未对齐(但在这种情况下速度要慢得多),并且都是完整的内存屏障。在可缓存的内存上,不会有任何外部 #LOCK 总线信号;内部实现只是在运行带 lock 前缀指令的内核中,将高速缓存行锁定在 MESI 的 M 状态。有关详细信息,请参阅 Can num++ be atomic for 'int num'?

如果您的测试发现它不是原子的,那么要么是您的硬件坏了,要么是您的测试有问题。发现死锁说明的是您的设计中有一个错误,而不是您所用的原子原语不具备原子性。您可以非常容易地测试原子递增:用两个线程递增同一个共享计数器,并确认没有丢失任何计数。相比之下,如果使用不带 lock 的 addl $1, shared(%rip),您就会看到计数丢失。

此外,lfence、sfence 和 pause 在正常情况下(没有 NT 存储,只使用 WB(回写)内存)对正确性没有影响。如果您的某个 fence / clflush 起了作用,那只是因为它在某处增加了额外的延迟,可能恰好让测试中的那个线程总是在竞争中落败,而不是真正让代码变得安全。mfence 是唯一有实际意义的屏障,它阻止 StoreLoad 重排序和存储转发效应。(这就是 gcc 用它来实现 seq-cst 存储(store)的原因。)

在考虑使用HLE /事务性内存之前,先获取一个基本版本。


x = __atomic_sub_fetch( ua, 1, _ACQ_SEQ_CST_); 是原子的,只有一个线程的 lock sub 可以将 ua 从 0 更改为 -1,并由此得到 x=-1。

但是你没有使用sub_fetch结果,你正在进行另一次加载 while((x = __atomic_load_n(ua,_LD_SEQ_CST_)) != 0)

因此,如果第一个线程在第二个线程的 lock sub 和随后的加载之间完成了加锁再解锁,那么另一个线程也可能看到 ua=-1。

它之所以被称为 sub_fetch,是因为它在以原子方式修改内存中的值的同时,还以原子方式返回运算后的值。您丢弃了 sub_fetch 的结果,这正是它可以被编译为 lock sub,而不是 lock xadd(操作数是一个持有 -1 的寄存器)的原因。

(或者智能编译器可以将其编译为lock sub并检查ZF,因为您可以通过lock sub设置的标志来判断值何时变为非零或负值。)

请参阅C & low-level semaphore implementation了解一个简单的信号量,没有回退到OS辅助的睡眠/唤醒。它会在加载时旋转,直到我们看到一个大于0的值,然后尝试使用C11 fetch_add(-1)进行锁定。


这可能是一个糟糕的设计;最好用lock cmpxchg尝试递减,因此失败的线程不必撤消它们的减量。


您不需要SFENCE,LFENCE或CLFLUSH [OPT]或其他任何东西。在任何内存类型(包括WB)上,lock xadd已经是一个完整的内存屏障和100%原子。




x = __atomic_sub_fetch( ua, 1, _ACQ_SEQ_CST_);  // ok, fine
_mm_pause();   // you don't want a pause on the fast path.

if( x < 0 )   // just make this a while(x<0) loop
do {
   futex(..., FUTEX_WAIT, ...);

   x = __atomic_load_n(ua,_LD_SEQ_CST_);        // races with lock sub in other threads.
} while(x < 0);

假设线程 A 正在 futex 中睡眠,此时 lck == -1(如果这可能发生的话?):

  • 线程 B 解锁,使得 lck == 0,并调用 futex(FUTEX_WAKE)
  • 线程 A 被唤醒,futex 在 lck 仍为 0 时返回
  • 其他某个线程(B 或第三个线程)进入 mutex_lock 并运行 __atomic_sub_fetch( ua, 1, _ACQ_SEQ_CST_);,使 lck == -1
  • 线程 A 在其循环底部运行 x = __atomic_load_n(ua,_LD_SEQ_CST_); 并看到 -1







答案 1 :(得分:-2)




现在有了一个使用带锁算术、能够保持准确的(负数)等待者计数的版本,位于:


mutex_unlock() 例程当且仅当存在等待者时,必须等待每一个现有的等待者都完成解锁,这样当它返回时,互斥锁处于已解锁状态且没有等待者。它可以通过自旋 + sched_yield() 等待锁值变为 1 来实现这一点,也可以使用另一个 futex 调用。所以,最初的加锁者在进入 mutex_unlock() 时,负责确保每一个现有的等待者都被唤醒并解锁互斥锁。



但仍有奇怪之处:如果任何一个进程都是ptrace-ed()by   strace或用'-g3'而不是'-O3'编译,它现在经历   '不一致' -   即。不一致的临界区修改值。这不会发生   如果程序不是ptrace -d并使用-O3编译。

见下面的讨论。为了让 GCC 的内置 __atomic* 函数正常工作,编译时必须指定某个 -O$x 标志来启用 GCC 的优化阶段,这样才足以使 __atomic* 内置函数正确运行。

mutex_lock()/ unlock例程的最佳版本:

static inline __attribute__((always_inline))
bool mutex_lock( register _Atomic volatile int *ua )
// lock the mutex value pointed to by 'ua';
// can return false if operation was interrupted ( a signal received ).
{ register int x;
  // lock_again:
  x = __atomic_add_fetch( ua, -1, _ACQ_SEQ_);
  while( x < 0 )
  { switch(syscall( SYS_futex, ua, FUTEX_WAIT, x, nullptr,nullptr,0))
    {case 0:
     case -1:
      switch( errno )
      { case EINTR:
         return false;
        case EAGAIN:
          // this has never been observed to happen, but in any 
          // production implementation
          // should be replaced by some kind of 
          // 'throw( exception )' statement:
          fprintf(stderr,"Unexpected futex WAIT error: %d : '%s'.",
                  errno, strerror(errno));
          return false;
    x = __atomic_load_n(ua,_LD_SEQ_);
  return true;

static inline __attribute__((always_inline))
bool mutex_unlock( register _Atomic volatile int *ua )
// unlock: returns false only if interrupted, else returns true
// only when the mutex pointed to by *ua has been unlocked and 
// has no waiters.
#ifdef _WITH_UWAIT_
  static int has_unlock_waiter = 0;
  register int x;
  x = __atomic_add_fetch( ua, 1, _REL_SEQ_);
  if(x < 1) // there was at least ONE waiter, 
            // so we are the original locker
  { while(1 < syscall( SYS_futex, ua, FUTEX_WAKE, 1, nullptr,nullptr,0))
    { if( errno != 0 )
        {case EINTR:
          return false;
         case EAGAIN:
           // never observed to happen - should be a throw()
          fprintf(stderr,"Unexpected futex WAKE error: %d : '%s'.", 
                  errno, strerror(errno));
          return false;
#ifdef _WITH_UWAIT_
// this is strictly unnecessary, and can be replaced by use of
// sched_yield() (see below), but it
// makes the situation clearer:
// unlock :
    // so we have woken a waiter; wait for that waiter to 
    // actually unlock before returning -
    // by definition, when that waiter enters mutex_unlock() 
    // (AND IT MUST!!), it will not
    // enter the clause containing this code unless there is more than
    // one other waiter., in which case we want to continue until there
    // are no waiters.
    while(1 > (x = __atomic_load_n( ua, _LD_SEQ_ )))
    { __atomic_store_n(&has_unlock_waiter, 1, _ST_SEQ_);
      if( (-1 == 
          syscall( SYS_futex, ua, FUTEX_WAIT, x, nullptr,nullptr,0)
          ) && (errno == EINTR)
        ) return false;
    if( __atomic_load_n(&has_unlock_waiter, _ST_SEQ_) )
      __atomic_store_n(&has_unlock_waiter, 0, _ST_SEQ_);
// The same result is actually achieved by this loop:
    while(1 > (x = __atomic_load_n(ua, _LD_SEQ_)))
    // we do need to wait for the waiting locker to unlock 
    // before proceeding, else
    // mutex_lock could be reentered with lck < 0 and deadlock 
    // would result.
#ifdef _WITH_UWAIT_
  }else if( (x==1) && __atomic_load_n(&has_unlock_waiter, _ST_SEQ_) )
  { // so we're the waiter that a previous unlock woke up 
    // and is waiting for - it now needs to be woken:
    while(1 < syscall( SYS_futex, ua, FUTEX_WAKE, 1, nullptr,nullptr,0))
    { if( errno != 0 )
        {case EINTR:  // no, we cannot let user try to unlock again, since modification of lock value succeeded.
         case EAGAIN:
          fprintf(stderr,"Unexpected futex WAKE error: %d : '%s'.", errno, strerror(errno));
          return false;
  return true;

static inline __attribute__((always_inline)) bool mutex_lock( register _Atomic volatile int *ua ) // lock the mutex value pointed to by 'ua'; // can return false if operation was interrupted ( a signal received ). { register int x; // lock_again: x = __atomic_add_fetch( ua, -1, _ACQ_SEQ_); while( x < 0 ) { switch(syscall( SYS_futex, ua, FUTEX_WAIT, x, nullptr,nullptr,0)) {case 0: break; case -1: switch( errno ) { case EINTR: return false; case EAGAIN: break; default: // this has never been observed to happen, but in any // production implementation // should be replaced by some kind of // 'throw( exception )' statement: fprintf(stderr,"Unexpected futex WAIT error: %d : '%s'.", errno, strerror(errno)); return false; } } x = __atomic_load_n(ua,_LD_SEQ_); } return true; } static inline __attribute__((always_inline)) bool mutex_unlock( register _Atomic volatile int *ua ) // unlock: returns false only if interrupted, else returns true // only when the mutex pointed to by *ua has been unlocked and // has no waiters. 
{ #ifdef _WITH_UWAIT_ static int has_unlock_waiter = 0; #endif register int x; x = __atomic_add_fetch( ua, 1, _REL_SEQ_); if(x < 1) // there was at least ONE waiter, // so we are the original locker { while(1 < syscall( SYS_futex, ua, FUTEX_WAKE, 1, nullptr,nullptr,0)) { if( errno != 0 ) switch(errno) {case EINTR: return false; case EAGAIN: break; default: // never observed to happen - should be a throw() fprintf(stderr,"Unexpected futex WAKE error: %d : '%s'.", errno, strerror(errno)); return false; } } #ifdef _WITH_UWAIT_ // this is strictly unnecessary, and can be replaced by use of // sched_yield() (see below), but it // makes the situation clearer: // unlock : // so we have woken a waiter; wait for that waiter to // actually unlock before returning - // by definition, when that waiter enters mutex_unlock() // (AND IT MUST!!), it will not // enter the clause containing this code unless there is more than // one other waiter., in which case we want to continue until there // are no waiters. while(1 > (x = __atomic_load_n( ua, _LD_SEQ_ ))) { __atomic_store_n(&has_unlock_waiter, 1, _ST_SEQ_); if( (-1 == syscall( SYS_futex, ua, FUTEX_WAIT, x, nullptr,nullptr,0) ) && (errno == EINTR) ) return false; } if( __atomic_load_n(&has_unlock_waiter, _ST_SEQ_) ) __atomic_store_n(&has_unlock_waiter, 0, _ST_SEQ_); #else // The same result is actually achieved by this loop: while(1 > (x = __atomic_load_n(ua, _LD_SEQ_))) sched_yield(); #endif // we do need to wait for the waiting locker to unlock // before proceeding, else // mutex_lock could be reentered with lck < 0 and deadlock // would result. 
#ifdef _WITH_UWAIT_ }else if( (x==1) && __atomic_load_n(&has_unlock_waiter, _ST_SEQ_) ) { // so we're the waiter that a previous unlock woke up // and is waiting for - it now needs to be woken: while(1 < syscall( SYS_futex, ua, FUTEX_WAKE, 1, nullptr,nullptr,0)) { if( errno != 0 ) switch(errno) {case EINTR: // no, we cannot let user try to unlock again, since modification of lock value succeeded. case EAGAIN: break; default: fprintf(stderr,"Unexpected futex WAKE error: %d : '%s'.", errno, strerror(errno)); return false; } } } #else } #endif return true; }


('^C' 表示同时按下 Ctrl+C 键。)


$ gcc -std=gnu11 -pthread -D_WITH_UWAIT_ -O3 -o il2 il2.c
$ ./il2
$ gcc -std=gnu11 -pthread -O3 -o il2 il2.c
$ ./il2

我试图对'-g'(仅)编译版本进行扫描并获得不一致 - 如果也使用任何'-O'标志,则不会发生这种情况。