Question

显然，可以使用比较和交换指令以原子方式递增两个整数。 This talk声称存在这样的算法，但它没有详细说明它的样子。

如何做到这一点？

（注意，一个接一个地递增整数的明显解决方案不是原子的。另外，将多个整数填充到一个机器字中并不算数，因为它会限制可能的范围。）

Answer 1

让我想起序列锁定。不是很准确（从记忆中提出这个）但是有些东西：

让x，y和s为64位整数。

增加：

memory barrier
atomic x++
atomic y++
atomic s++
memory barrier
（我的意思是使用64位CAS op的原子增量）

阅读：

do {
    S1 = load s
    X = load x
    Y = load y
    memory barrier
    S2 = load s
} while (S1 != S2)

另见https://en.wikipedia.org/wiki/Seqlock

Answer 2

如果sse2可用，您可以使用 paddq 在一条指令中将2个64位整数添加到另外两个64位整数。

#include "emmintrin.h"
//initialize your values somewhere:
//const __m128i ones = _mm_set1_epi64x(1);
//volatile register __m128i vars = 
//    _mm_set_epi64x(24,7);
// inc_both -- add two packed 64-bit lanes with a single SSE2 add
// vars -- pair of 64-bit integers to be advanced
// ones -- pair of 64-bit addends, one per lane
// RETURNS: lane-wise 64-bit sum of vars and ones
static inline __m128i inc_both(__m128i vars, __m128i ones){
  __m128i sum = _mm_add_epi64(vars, ones);

  return sum;
}

这应编译为

    paddq  %xmm0, %xmm1

由于它是静态内联的，它可能会使用其他xmm寄存器。如果存在显著的寄存器压力，则 ones 操作数可能变为 ones(%rip)。

注意：这可以用于添加1以外的值，并且对于大多数其他数学，按位和比较指令，如果需要，它们也有类似的操作。

因此，您可以使用锁定前缀并将其设置为内联asm宏

// inc64x2 -- add the packed 64-bit pair `ones` to `vars` in place
// vars -- __m128i lvalue holding two 64-bit integers (read and written)
// NOTE: a __m128i named `ones` must be in scope at the point of use
// NOTE: AT&T syntax is "paddq source, destination", so the read/write operand
// (vars, %0) must be the second one; `ones` (%1) is input-only and must never
// be the destination
// NOTE: the trailing ';' is part of the expansion (kept for existing callers)
#define inc64x2(vars) __asm__ __volatile__( \
    "paddq %1, %0\n":"+x"(vars):"x"(ones) \
  );

ARM NEON 的等效指令类似于：vaddq_s64（...），另外这里（here）有一篇关于 ARM / x86 等效指令的很棒的文章。

Answer 3

我已经测试了一个解决方案。这里包含的是一个完整的（soup-to-nuts）概念验证程序。

算法是把“使用CAS的线程ID门（gate）”作为第3个整数。我看了两次视频谈话，我相信这是合格的。它可能不是演示者想到的算法，但确实有效。

X和Y值可以在内存中的任何位置，程序将它们放置在远离彼此的位置，使它们位于不同的缓存行上。它并不重要。

算法的简要说明：

每个线程都有一个唯一的ID号（unique id number）或tid（非零），来自你喜欢的任意来源：pthread_t，getpid，gettid，或者用你想要的任何方式自己编一个（make one up by whatever means you want）。在程序中，它只是从1开始按顺序分配它们。

每个线程都会用这个数字调用increment函数。

增量函数将在gate变量上自旋执行CAS，旧值为0，新值为tid。

当CAS成功时，线程现在“拥有”了这个东西。换句话说，如果gate为零，则可以抓取它。非零值是所有者的tid，并且gate已被锁定。

现在，所有者可以使用简单的x += 1和y += 1自由增加X和Y值。

之后，增量函数通过在gate中存储0来释放。

这是一个包含所有内容的诊断/概念验证程序。算法本身没有限制，但我为我的机器编码。

一些警告：

它假定使用 gcc / clang
它假定64位 x86_64 架构。
只使用内联asm进行编码，并且不需要[也不使用任何]编译器 atomic 支持，以提高清晰度，简洁性和透明度。
这是在linux下构建的，但是应该适用于任何“合理的” x86机器/操作系统（例如BSD，OSX应该没问题，可能是cygwin，也许是mingw）
其他架构如果支持CAS就没问题，我只是没有为它们编码（例如，arm 如果使用 ldex/stex 指令对来编码CAS可能会有效）
有足够的抽象基元，这将是/应该是容易的。
没有尝试Windows兼容性[如果你想要它，做你自己的端口，但不要让我流泪 - 或评论： - ）]。
makefile和程序已默认为最佳值
某些x86 CPU可能需要使用不同的默认值（例如需要fence指令）。请参阅makefile。

无论如何，这是：

ldex/stex

这是Makefile。对不起，额外的样板：

// caslock -- prove cas lock algorithm

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>

// systls -- storage-class keyword for per-thread variables
#define systls              __thread

// repeat the madness only once
// inline_common -- base inline linkage: plain "inline" when __clang__ is
// defined, "static inline" for every other compiler
#ifdef __clang__
#define inline_common       inline
#else
#define inline_common       static inline
#endif
// inline_always -- inline_common plus the always_inline attribute
#define inline_always       inline_common __attribute__((__always_inline__))
// inline_never -- the noinline attribute (no inline keyword at all)
#define inline_never        __attribute__((__noinline__))

// WARNING: inline CAS fails for gcc but works for clang!
// inline_cas -- linkage applied to the CAS primitives (casu32/casu64):
// _USE_CASINLINE_ non-zero selects inline_always, zero or undefined selects
// inline_never
#if _USE_CASINLINE_
#define inline_cas          inline_always
#else
#define inline_cas          inline_never
#endif

// u32/u64 -- shorthand unsigned integer types
// NOTE(review): the names presume unsigned int is 32 bits and unsigned long
// long is 64 bits on the target; nothing here enforces that
typedef unsigned int u32;
typedef unsigned long long u64;

// LOOPMAX -- default number of increments performed by each thread
// (may be overridden on the compiler command line)
#ifndef LOOPMAX
#define LOOPMAX             1000000
#endif

// TIDMAX -- default number of threads (may be overridden likewise)
#ifndef TIDMAX
#define TIDMAX              20
#endif

// xptr32_p/xptr64_p -- pointer types taken by the load/store/CAS primitives
// _USE_VPTR_ non-zero makes the pointed-to type volatile
#if _USE_VPTR_
typedef volatile u32 *xptr32_p;
typedef volatile u64 *xptr64_p;
#else
typedef u32 *xptr32_p;
typedef u64 *xptr64_p;
#endif

// tid_t -- integer type used for thread ids, the gate and the ready/done
// counters; _USE_TID64_ non-zero selects u64, otherwise u32
// tidload/tidcas/tidstore -- forward to the load/CAS/store primitive of the
// matching width
#if _USE_TID64_
typedef u64 tid_t;
#define tidload(_xptr)                  loadu64(_xptr)
#define tidcas(_xptr,_oval,_nval)       casu64(_xptr,_oval,_nval)
#define tidstore(_xptr,_nval)           storeu64(_xptr,_nval)
#else
typedef u32 tid_t;
#define tidload(_xptr)                  loadu32(_xptr)
#define tidcas(_xptr,_oval,_nval)       casu32(_xptr,_oval,_nval)
#define tidstore(_xptr,_nval)           storeu32(_xptr,_nval)
#endif
// shared state: tidgate holds 0 when free or the owning thread's id when
// taken; readycnt/donecnt are bumped once by each child thread (via CAS)
tid_t tidgate;                          // gate control
tid_t readycnt;                         // number of threads ready
tid_t donecnt;                          // number of threads complete

// ensure that the variables are nowhere near each other
// NOTE: xval and yval are 56 array elements apart; kickoff occupies the slot
// immediately after xval
u64 ary[100];
#define kickoff     ary[32]             // sync to fire threads
#define xval        ary[31]             // the X value
#define yval        ary[87]             // the Y value

// run parameters (assigned in main before the test starts)
// NOTE(review): inctype is assigned in main but not read by any code visible
// in this file
int inctype;                            // increment algorithm to use
tid_t tidmax;                           // maximum number of tasks
u64 loopmax;                            // loop maximum for each task

// task control
// One block per child thread. tskall allocates tidmax + 1 of them and
// tskstart indexes the list by tid (tids start at 1, so slot 0 is unused).
struct tsk {
    tid_t tsk_tid;                      // task id
    u32 tsk_casmiss;                    // cas miss count
};
typedef struct tsk tsk_t;

tsk_t *tsklist;                         // task list
// per-thread pointer to the running thread's own block (set in tskcld)
systls tsk_t *tskcur;                   // current task block

// show progress
// PGR -- write the string _pgr to stdout and flush immediately so the
// progress marker is visible while the test is still running
// _pgr -- string to print (no newline is appended)
#define PGR(_pgr) \
    do { \
        fputs(_pgr,stdout); \
        fflush(stdout); \
    } while (0)

// NOTE: some x86 arches need fence instructions
//   0 -- no fence instructions
//   1 -- use mfence
//   2 -- use lfence/sfence
// BARRIER_ACQUIRE/BARRIER_RELEASE/BARRIER_ALL -- assembler text pasted into
// the asm statements of barrier_acquire/barrier_release/barrier
// NOTE: an undefined _USE_BARRIER_ evaluates as 0 in #if, so the default is
// empty instruction text (the asm statements then act purely as compiler
// barriers through their "memory" clobber)
#if _USE_BARRIER_ == 0
#define BARRIER_RELEASE             ""
#define BARRIER_ACQUIRE             ""
#define BARRIER_ALL                 ""
#elif _USE_BARRIER_ == 1
#define BARRIER_ACQUIRE             "\tmfence\n"
#define BARRIER_RELEASE             "\tmfence\n"
#define BARRIER_ALL                 "\tmfence\n"
#elif _USE_BARRIER_ == 2
#define BARRIER_ACQUIRE             "\tlfence\n"
#define BARRIER_RELEASE             "\tsfence\n"
#define BARRIER_ALL                 "\tmfence\n"
#else
#error caslock: unknown barrier type
#endif

// barrier_acquire -- acquire-side barrier
// Emits the BARRIER_ACQUIRE instruction text (possibly empty). The "memory"
// clobber tells the compiler that memory may have changed at this point.
inline_always void
barrier_acquire(void)
{
    __asm__ __volatile__ (BARRIER_ACQUIRE : : : "memory");
}

// barrier_release -- release-side barrier
// Emits the BARRIER_RELEASE instruction text (possibly empty). The "memory"
// clobber tells the compiler that memory may have changed at this point.
inline_always void
barrier_release(void)
{
    __asm__ __volatile__ (BARRIER_RELEASE : : : "memory");
}

// barrier -- full barrier
// Emits the BARRIER_ALL instruction text (possibly empty). The "memory"
// clobber tells the compiler that memory may have changed at this point.
inline_always void
barrier(void)
{
    __asm__ __volatile__ (BARRIER_ALL : : : "memory");
}

// casu32 -- compare and exchange four bytes
// xptr -- location to update
// oldval -- value the location is expected to hold
// newval -- value to store if the location holds oldval
// RETURNS: 1=ok, 0=fail
inline_cas int
casu32(xptr32_p xptr,u32 oldval,u32 newval)
{
    char ok;

    // NOTE: the constraints must describe everything cmpxchg touches:
    //   - the accumulator is read (the expected value) _and_ written (on a
    //     miss it receives the current memory contents), hence "+a". If it
    //     is declared input-only, an inlined retry loop may reuse the stale
    //     register as "oldval" and the next CAS then compares against the
    //     current owner's value instead of the caller's expected value
    //   - the memory operand is read-modify-write, hence "+m" rather than "=m"
    //   - sete needs a byte-addressable register, hence "=q"
    //   - the flags are modified, hence the explicit "cc" clobber
    __asm__ __volatile__ (
        "   lock\n"
        "   cmpxchg     %[newval],%[xptr]\n"
        "   sete        %[ok]\n"
        :   [ok] "=q" (ok),
            [xptr] "+m" (*xptr),
            "+a" (oldval)
        :   [newval] "r" (newval)
        :   "memory", "cc");

    return ok;
}

// casu64 -- compare and exchange eight bytes
// xptr -- location to update
// oldval -- value the location is expected to hold
// newval -- value to store if the location holds oldval
// RETURNS: 1=ok, 0=fail
inline_cas int
casu64(xptr64_p xptr,u64 oldval,u64 newval)
{
    char ok;

    // NOTE: the constraints must describe everything cmpxchg touches:
    //   - the accumulator is read (the expected value) _and_ written (on a
    //     miss it receives the current memory contents), hence "+a". If it
    //     is declared input-only, an inlined retry loop may reuse the stale
    //     register as "oldval" and the next CAS then compares against the
    //     current owner's value instead of the caller's expected value
    //   - the memory operand is read-modify-write, hence "+m" rather than "=m"
    //   - sete needs a byte-addressable register, hence "=q"
    //   - the flags are modified, hence the explicit "cc" clobber
    __asm__ __volatile__ (
        "   lock\n"
        "   cmpxchg     %[newval],%[xptr]\n"
        "   sete        %[ok]\n"
        :   [ok] "=q" (ok),
            [xptr] "+m" (*xptr),
            "+a" (oldval)
        :   [newval] "r" (newval)
        :   "memory", "cc");

    return ok;
}

// loadu32 -- fetch a four byte value, preceded by an acquire barrier
// xptr -- location to read
// RETURNS: loaded value
inline_always u32
loadu32(const xptr32_p xptr)
{

    // the barrier is issued before the read, never after it
    barrier_acquire();

    return *xptr;
}

// loadu64 -- fetch an eight byte value, preceded by an acquire barrier
// xptr -- location to read
// RETURNS: loaded value
inline_always u64
loadu64(const xptr64_p xptr)
{

    // the barrier is issued before the read, never after it
    barrier_acquire();

    return *xptr;
}

// storeu32 -- write a four byte value, followed by a release barrier
// xptr -- location to write
// val -- value to store
inline_always void
storeu32(xptr32_p xptr,u32 val)
{

    xptr[0] = val;

    // the barrier is issued after the write, never before it
    barrier_release();
}

// storeu64 -- write an eight byte value, followed by a release barrier
// xptr -- location to write
// val -- value to store
inline_always void
storeu64(xptr64_p xptr,u64 val)
{

    xptr[0] = val;

    // the barrier is issued after the write, never before it
    barrier_release();
}

// qsleep -- pause the calling thread briefly
// bigflg -- non-zero requests a one second sleep, zero requests 1000ns
// The nanosleep result is not examined.
inline_always void
qsleep(int bigflg)
{
    struct timespec ts = {
        .tv_sec = bigflg ? 1 : 0,
        .tv_nsec = bigflg ? 0 : 1000,
    };

    nanosleep(&ts,NULL);
}

// incby_tidgate -- increment by using thread id gate
// Adds one to both xval and yval while holding tidgate: the gate is taken by
// CAS-ing it from 0 to tid (retrying until that succeeds) and given back by
// setting it to 0 again. Each failed attempt is counted in the calling
// thread's tsk_casmiss.
void
incby_tidgate(tid_t tid)
// tid -- unique id for accessing entity (e.g. thread id)
{
    tid_t *gptr;
    tid_t oval;

    gptr = &tidgate;

    // acquire the gate
    while (1) {
        // expected gate value: 0 means nobody owns it
        oval = 0;

        // test mode -- just do a nop instead of CAS to prove diagnostic
        // (with _USE_CASOFF_ the gate is never actually taken: 0 is stored
        // and the loop exits at once, so the miss counter below is never
        // reached)
#if _USE_CASOFF_
        *gptr = oval;
        break;
#else
        if (tidcas(gptr,oval,tid))
            break;
#endif

        // CAS missed -- count it and try again
        ++tskcur->tsk_casmiss;
    }

    // optional extra barrier between taking the gate and the increments
#if _USE_INCBARRIER_
    barrier_acquire();
#endif

    // increment the values
    // NOTE: plain (non-atomic) read-modify-write -- exclusivity comes solely
    // from owning the gate
    xval += 1;
    yval += 1;

    // optional extra barrier between the increments and the gate release
#if _USE_INCBARRIER_
    barrier_release();
#endif

    // release the gate
    // NOTE: CAS will always provide a barrier
    // _USE_CASPOST_ (only honored when CAS is enabled) releases by CAS-ing
    // tid back to 0 -- the result is stored in oval but not examined;
    // otherwise 0 is written with tidstore
#if _USE_CASPOST_ && (_USE_CASOFF_ == 0)
    oval = tidcas(gptr,tid,0);
#else
    tidstore(gptr,0);
#endif
}

// tskcld -- child task
// arg -- pointer to this thread's tsk_t block (becomes tskcur)
// Announces readiness, waits for kickoff, performs loopmax gated increments,
// then announces completion.
// RETURNS: always a null pointer
void *
tskcld(void *arg)
{
    tid_t tid;
    tid_t seen;
    u64 remaining;

    tskcur = arg;
    tid = tskcur->tsk_tid;

    // tell master thread that we're fully ready (CAS-based increment)
    do {
        seen = tidload(&readycnt);
    } while (! tidcas(&readycnt,seen,seen + 1));

    // wait until we're given the starting gun
    while (! loadu64(&kickoff))
        qsleep(0);

    // do the increments
    remaining = loopmax;
    while (remaining > 0) {
        incby_tidgate(tid);
        --remaining;
    }

    barrier();

    // tell master thread that we're fully complete (CAS-based increment)
    do {
        seen = tidload(&donecnt);
    } while (! tidcas(&donecnt,seen,seen + 1));

    return (void *) 0;
}

// tskstart -- start a child task
// tid -- id for the new task; also its slot index in tsklist
// Records tid in the task block and creates a detached thread running tskcld
// on that block. A pthread_create failure is reported on stdout and is
// otherwise ignored.
void
tskstart(tid_t tid)
{
    pthread_attr_t attr;
    pthread_t thr;
    int err;
    tsk_t *tsk;

    tsk = tsklist + tid;
    tsk->tsk_tid = tid;

    pthread_attr_init(&attr);

    // NOTE: use the symbolic constant -- the numeric values of the detach
    // states are implementation-defined, so a literal 1 does not mean
    // "detached" on every platform
    pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_DETACHED);

    err = pthread_create(&thr,&attr,tskcld,tsk);
    pthread_attr_destroy(&attr);

    if (err)
        printf("tskstart: error -- err=%d\n",err);
}

// tskall -- run a single test
// Resets the shared state, starts tidmax child threads, releases them
// together, waits for all of them to finish and then verifies that xval and
// yval both equal loopmax * tidmax. Progress, per-task CAS miss statistics
// and the verdict are written to stdout.
void
tskall(void)
{
    tid_t tidcur;
    tsk_t *tsk;
    u64 incmax;
    u64 val;
    int err;

    xval = 0;
    yval = 0;

    kickoff = 0;
    readycnt = 0;
    donecnt = 0;
    tidgate = 0;

    // prealloc the task blocks
    // NOTE: tids start at 1, hence the extra (unused) slot 0
    tsklist = calloc(tidmax + 1,sizeof(tsk_t));
    if (tsklist == NULL) {
        printf("tskall: unable to allocate task list\n");
        return;
    }

    // start all tasks
    PGR(" St");
    for (tidcur = 1;  tidcur <= tidmax;  ++tidcur)
        tskstart(tidcur);

    // wait for all tasks to be fully ready
    PGR(" Sw");
    while (1) {
        if (tidload(&readycnt) == tidmax)
            break;
        qsleep(1);
    }

    // the starting gun -- all tasks are waiting for this
    PGR(" Ko");
    storeu64(&kickoff,1);

    // wait for all tasks to be fully done
    PGR(" Wd");
    while (1) {
        if (tidload(&donecnt) == tidmax)
            break;
        qsleep(1);
    }

    PGR(" Done\n");

    // expected final count
    incmax = loopmax * tidmax;

    // show per-task statistics
    // NOTE: tsk_casmiss is unsigned (%u) and the miss ratio is scaled by 100
    // because it is printed with a percent sign
    for (tidcur = 1;  tidcur <= tidmax;  ++tidcur) {
        tsk = tsklist + tidcur;
        printf("tskall: tsk=%llu tsk_casmiss=%u (%.3f%%)\n",
            (u64) tidcur,tsk->tsk_casmiss,
            (100.0 * (double) tsk->tsk_casmiss) / loopmax);
    }

    err = 0;

    // check for failure
    val = loadu64(&xval);
    if (val != incmax) {
        printf("tskall: xval fault -- xval=%llu incmax=%llu\n",val,incmax);
        err = 1;
    }

    // check for failure
    val = loadu64(&yval);
    if (val != incmax) {
        printf("tskall: yval fault -- yval=%llu incmax=%llu\n",val,incmax);
        err = 1;
    }

    if (! err)
        printf("tskall: SUCCESS\n");

    // every child has already bumped donecnt, which is the last thing it does
    // with shared state, so the blocks can be released
    free(tsklist);
    tsklist = NULL;
}

// main -- master control
// Loads the compile-time defaults into the run parameters and performs one
// test run.
// RETURNS: always 0
int
main(void)
{

    inctype = 0;
    tidmax = TIDMAX;
    loopmax = LOOPMAX;

    tskall();

    return 0;
}

注意：我可能已经破坏了一些asm约束，因为在将CAS函数作为内联进行编译时，使用# caslock/Makefile -- make file for caslock # # options: # LOOPMAX -- maximum loops / thread # # TIDMAX -- maximum number of threads # # BARRIER -- generate fence/barrier instructions # 0 -- none # 1 -- use mfence everywhere # 2 -- use lfence for acquire, sfence for release # # CASOFF -- disable CAS to prove diagnostic works # 0 -- normal mode # 1 -- inhibit CAS during X/Y increment # # CASINLINE -- inline the CAS functions # 0 -- do _not_ inline # 1 -- inline them (WARNING: this fails for gcc but works for clang!) # # CASPOST -- increment gate release mode # 0 -- use fenced store # 1 -- use CAS store (NOTE: not really required) # # INCBARRIER -- use extra barriers around increments # 0 -- rely on CAS for barrier # 1 -- add extra safety barriers immediately before increment of X/Y # # TID64 -- use 64 bit thread "id"s # 0 -- use 32 bit # 1 -- use 64 bit # # VPTR -- use volatile pointers in function definitions # 0 -- use ordinary pointers # 1 -- use volatile pointers (NOTE: not really required) ifndef _CASLOCK_MK_ _CASLOCK_MK_ = 1 OLIST += caslock.o ifndef LOOPMAX LOOPMAX = 1000000 endif ifndef TIDMAX TIDMAX = 20 endif ifndef BARRIER BARRIER = 0 endif ifndef CASINLINE CASINLINE = 0 endif ifndef CASOFF CASOFF = 0 endif ifndef CASPOST CASPOST = 0 endif ifndef INCBARRIER INCBARRIER = 0 endif ifndef TID64 TID64 = 0 endif ifndef VPTR VPTR = 0 endif CFLAGS += -DLOOPMAX=$(LOOPMAX) CFLAGS += -DTIDMAX=$(TIDMAX) CFLAGS += -D_USE_BARRIER_=$(BARRIER) CFLAGS += -D_USE_CASINLINE_=$(CASINLINE) CFLAGS += -D_USE_CASOFF_=$(CASOFF) CFLAGS += -D_USE_CASPOST_=$(CASPOST) CFLAGS += -D_USE_INCBARRIER_=$(INCBARRIER) CFLAGS += -D_USE_TID64_=$(TID64) CFLAGS += -D_USE_VPTR_=$(VPTR) STDLIB += -lpthread ALL += caslock CLEAN += caslock OVRPUB := 1 ifndef OVRTOP OVRTOP := $(shell pwd) OVRTOP := $(dir $(OVRTOP)) endif endif # ovrlib/rules.mk -- rules control # # options: # GDB -- enable debug symbols # 0 -- normal # 1 -- use -O0 and define _USE_GDB_=1 # # 
CLANG -- use clang instead of gcc # 0 -- use gcc # 1 -- use clang # # BNC -- enable benchmarks # 0 -- normal mode # 1 -- enable benchmarks for function enter/exit pairs ifdef OVRPUB ifndef SDIR SDIR := $(shell pwd) STAIL := $(notdir $(SDIR)) endif ifndef GENTOP GENTOP := $(dir $(SDIR)) endif ifndef GENDIR GENDIR := $(GENTOP)/$(STAIL) endif ifndef ODIR ODIR := $(GENDIR) endif PROTOLST := true PROTOGEN := @true endif ifndef SDIR $(error rules: SDIR not defined) endif ifndef ODIR $(error rules: ODIR not defined) endif ifndef GENDIR $(error rules: GENDIR not defined) endif ifndef GENTOP $(error rules: GENTOP not defined) endif ifndef _RULES_MK_ _RULES_MK_ = 1 CLEAN += *.proto CLEAN += *.a CLEAN += *.o CLEAN += *.i CLEAN += *.dis CLEAN += *.TMP QPROTO := $(shell $(PROTOLST) -i -l -O$(GENTOP) $(SDIR)/*.c $(CPROTO)) HDEP += $(QPROTO) ###VPATH += $(GENDIR) ###VPATH += $(SDIR) ifdef INCLUDE_MK -include $(INCLUDE_MK) endif ifdef GSYM CFLAGS += -gdwarf-2 endif ifdef GDB CFLAGS += -gdwarf-2 DFLAGS += -D_USE_GDB_ else CFLAGS += -O2 endif ifndef ZPRT DFLAGS += -D_USE_ZPRT_=0 endif ifdef BNC DFLAGS += -D_USE_BNC_=1 endif ifdef CLANG CC := clang endif DFLAGS += -I$(GENTOP) DFLAGS += -I$(OVRTOP) CFLAGS += -Wall -Werror CFLAGS += -Wno-unknown-pragmas CFLAGS += -Wempty-body CFLAGS += -fno-diagnostics-color # NOTE: we now need this to prevent inlining (enabled at -O2) ifndef CLANG CFLAGS += -fno-inline-small-functions endif # NOTE: we now need this to prevent inlining (enabled at -O3) CFLAGS += -fno-inline-functions CFLAGS += $(DFLAGS) endif all: $(PREP) proto $(ALL) %.o: %.c $(HDEP) $(CC) $(CFLAGS) -c -o $*.o $< %.i: %.c cpp $(DFLAGS) -P $*.c > $*.i %.s: %.c $(CC) $(CFLAGS) -S -o $*.s $< # build a library (type (2) build) $(LIBNAME):: $(OLIST) ar rv $@ $(OLIST) .PHONY: proto proto:: $(PROTOGEN) -i -v -O$(GENTOP) $(SDIR)/*.c $(CPROTO) .PHONY: clean clean:: rm -f $(CLEAN) .PHONY: help help:: egrep '^#' Makefile caslock:: $(OLIST) $(LIBLIST) $(STDLIB) $(CC) $(CFLAGS) -o caslock $(OLIST) 
$(LIBLIST) $(STDLIB)进行编译会产生不正确的结果。但是，clang可以使用内联工作正常。因此，默认情况下CAS函数不内联。为了保持一致性，我没有使用gcc / clang的不同默认值，即使我可以。

这里是由clang构建的内联相关函数的反汇编（失败）：

gcc

这里是由00000000004009c0 <incby_tidgate>: 4009c0: 31 c0 xor %eax,%eax 4009c2: f0 0f b1 3d 3a 1a 20 lock cmpxchg %edi,0x201a3a(%rip) # 602404 <tidgate> 4009c9: 00 4009ca: 0f 94 c2 sete %dl 4009cd: 84 d2 test %dl,%dl 4009cf: 75 23 jne 4009f4 <L01> 4009d1: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 4009d8:L00 64 48 8b 14 25 f8 ff mov %fs:0xfffffffffffffff8,%rdx 4009df: ff ff 4009e1: 83 42 04 01 addl $0x1,0x4(%rdx) 4009e5: f0 0f b1 3d 17 1a 20 lock cmpxchg %edi,0x201a17(%rip) # 602404 <tidgate> 4009ec: 00 4009ed: 0f 94 c2 sete %dl 4009f0: 84 d2 test %dl,%dl 4009f2: 74 e4 je 4009d8 <L00> 4009f4:L01 48 83 05 dc 17 20 00 addq $0x1,0x2017dc(%rip) # 6021d8 <ary+0xf8> 4009fb: 01 4009fc: 48 83 05 94 19 20 00 addq $0x1,0x201994(%rip) # 602398 <ary+0x2b8> 400a03: 01 400a04: c7 05 f6 19 20 00 00 movl $0x0,0x2019f6(%rip) # 602404 <tidgate> 400a0b: 00 00 00 400a0e: c3 retq构建的内联相关函数的反汇编（成功）：

clang

使用CAS以原子方式递增两个整数

3 个答案: