Question

我最近写了一个有限的无锁队列，正在为它做一些测试。在测试中，一些线程产生素数（从某个数字开始，向上计数生成器线程对的数量的6倍，使用Deterministic Miller Rabin测试检查每个数字，并将质数插入队列中）和一些线程消耗素数（通过从队列中删除元素并检查它们是否为素数）。生产者线程成对出现，每对中有一个产生等于1 mod 6的素数，另一个产生等于5 mod 6的素数（所有数字等于0,2,3或4 mod 6除了2和3），主线程产生2和3.有一个全局计数器，表明没有生成多少线程。每次生产者线程或主要完成生成素数时，它会原子地递减此计数器。消费者线程在不为0时循环。

为了确定素数是否实际通过队列，我计算每个线程产生和消耗的素数的第0到第3个时刻，并检查生产者线程的时刻总和等于消费者线程的时刻。第n个时刻只是第n个幂的总和，所以这意味着素数的数量，它们的总和，它们的正方形的总和，以及它们的立方体的总和，都是匹配的。如果序列是彼此的排列，则所有时刻都将匹配，因此虽然我需要检查前n个以确保长度为n的序列实际上是排列，但前4个匹配意味着序列不匹配的可能性非常小。

我的无锁队列实际上有效，但由于某种原因，消费者线程全部停止，而队列中仍有元素。我不明白为什么因为生成器线程在将所有素数插入队列之后才减少生成计数器，并且生成计数器在所有生成线程递减之后只能等于0。因此，每当生成计数器为0时，所有元素都已插入队列中。但是如果消费者试图删除一个元素，它应该成功，因为如果queue.full（队列中的元素数量）为0，则删除失败。因此当生成计数器为0时，消费者应该能够成功消费直到queue.full为0，不应检查生成计数器并返回，直到队列耗尽为止。如果删除失败，它们只检查生成计数器（如果消费者比生产者更快并且清空队列）。

但是，当我使用while循环除了生成计数器之外还删除check queue.full时，消费者不会提前返回。也就是说，当我改变

时

__atomic_load_n(&producing, __ATOMIC_SEQ_CST)

到

__atomic_load_n(&producing, __ATOMIC_SEQ_CST) || __atomic_load_n(&Q.full, __ATOMIC_SEQ_CST)

它只是有效。请注意，我的代码使用了合理数量的gcc扩展，例如属性，__atomic builtins，__ auto_type，语句表达式，128位整数，__ builtin_ctzll和'\ e'，C99功能（如指定的初始值设定项和复合文字）以及pthreads。我也使用顺序一致的内存顺序和强大的比较和交换到处，即使较弱的版本应该工作，因为我不想要问题，因为我有这个问题。这是标题queue.h：

#ifndef __QUEUE_H__
#define __QUEUE_H__

#include <stddef.h>
#include <inttypes.h>

typedef struct __attribute__((__designated_init__)){//using positional initializers for a struct is terrible
    void *buf;
    uint8_t *flags;//insert started, insert complete, remove started
    size_t cap, full;
    uint64_t a, b;
} queue_t;

typedef struct __attribute__((__designated_init__)){
    size_t size;
} queue_ft;//this struct serves as a class for queue objects: any data specific to the object goes in the queue_t struct and any shared data goes here

int queue_insert(queue_t*, const queue_ft*, void *elem);

int queue_remove(queue_t*, const queue_ft*, void *out);

int queue_init(queue_t*, const queue_ft*, size_t reserve);

void queue_destroy(queue_t*, const queue_ft*);

#endif

这是库源queue.c：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include "queue.h"

int queue_insert(queue_t *self, const queue_ft *ft, void *elem){
    uint64_t i;
    while(1){
        uint8_t flag = 0;
        if(__atomic_load_n(&self->full, __ATOMIC_SEQ_CST) == self->cap){
            return 0;
        }
        i = __atomic_load_n(&self->b, __ATOMIC_SEQ_CST);
        if(__atomic_compare_exchange_n(self->flags + i, &flag, 0x80, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)){//set the insert started flag if all flags are clear
            break;
        }
    }
    __atomic_fetch_add(&self->full, 1, __ATOMIC_SEQ_CST);
    uint64_t b = i;
    while(!__atomic_compare_exchange_n(&self->b, &b, (b + 1)%self->cap, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));//increase the b endpoint of the queue with wraparaound
    memcpy(self->buf + i*ft->size, elem, ft->size);//actually insert the item.  accesses to the buffer mirror accesses to the flags so this is safe
    __atomic_thread_fence(memory_order_seq_cst);
    __atomic_store_n(self->flags + i, 0xc0, __ATOMIC_SEQ_CST);//set the insert completed flag
    return 1;
}

int queue_remove(queue_t *self, const queue_ft *ft, void *out){
    uint64_t i;
    while(1){
        uint8_t flag = 0xc0;
        if(!__atomic_load_n(&self->full, __ATOMIC_SEQ_CST)){
            return 0;
        }
        i = __atomic_load_n(&self->a, __ATOMIC_SEQ_CST);
        if(__atomic_compare_exchange_n(self->flags + i, &flag, 0xe0, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)){//set the remove started flag if insert started and insert completed are set but the other flags are clear
            break;
        }
    }
    __atomic_fetch_sub(&self->full, 1, __ATOMIC_SEQ_CST);
    uint64_t a = i;
    while(!__atomic_compare_exchange_n(&self->a, &a, (a + 1)%self->cap, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));//increase the a endpoint of the queue with wraparound
    memcpy(out, self->buf + i*ft->size, ft->size);//actually remove the item.
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
    __atomic_store_n(self->flags + i, 0x00, __ATOMIC_SEQ_CST);//clear all the flags to mark the remove as completed
    return 1;
}

int queue_init(queue_t *self, const queue_ft *ft, size_t reserve){
    void *buf = malloc(reserve*ft->size);
    if(!buf){
        return 0;
    }
    uint8_t *flags = calloc(reserve, sizeof(uint8_t));
    if(!flags){
        free(buf);
        return 0;
    }
    *self = (queue_t){
        .buf=buf,
        .flags=flags,
        .cap=reserve,
        .full=0,
        .a=0,.b=0
    };
    return 1;
}

void queue_destroy(queue_t *self, const queue_ft *ft){
    free(self->buf);
    free(self->flags);
}

这是测试程序源test_queue_pc.c：

#define _POSIX_C_SOURCE 201612UL

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <pthread.h>
#include <math.h>
#include <time.h>
#include "queue.h"

//Generate primes up to this number.  Note 78498 is the number of primes below 1000000; this is hard coded because the queue does not support growing yet.
#define MAX 1000000
#define QUEUE_SIZE 78498
#define NUM_PRODUCER_PAIRS 3
#define NUM_CONSUMERS 2
//Every producer and consumer thread calculates the 0th through 3rd moments of the sequence of primes it sees, as well as testing them for primality.
//The nth moment is the sum of the nth powers, thus, the order does not matter and if the primes are the same in both the producers and the consumers
//then the sums of the moments will also be the same.  I check that the 0th through 3rd moments match which means it is nearly certain the primes go through
//the queue.
#define NUM_MOMENTS 4

//Deterministic Miller Rabin witnesses (see https://en.wikipedia.org/wiki/Miller–Rabin_primality_test)
#define DMR_PRIMES (uint64_t[]){2, 13, 23, 1662803}
#define DMR_PRIMES_C 4

//Macro to split an integer into three parts.  The first part has the 2**0, 2**3, 2**6, ..., 2**60 bits of the original and 0 elsewhere.
//The second part has the 2**1, 2**4, 2**7, ..., 2**61 bits of the original and 0 elsewhere.  The last part has the 2**2, ..., 2**62 bits.
//The 2**63 bit is lost.  The masks represent the sums of geometric sequences.  The original number can be obtained by bitwise or or xor on the parts.
//I spread the uint64_t's (which are unsigned long longs) over 3 uint64_t's so that they take up 24 bytes and memcpy'ing them happens in multiple steps.
//This spreading is only done on primes that have been produced before they are put into the queue.  The consumers then recombine and verify them.
#define SPREAD_EMPLACE(n) ({__auto_type _n = (n); &(spread_integer){(_n)&(((1ULL<<60)-1)/7), (_n)&(((1ULL<<61)-2)/7), (_n)&(((1ULL<<62)-4)/7)};})

typedef struct{
    uint64_t x, y, z;
} spread_integer;

queue_ft spread_integer_ft = {.size= sizeof(spread_integer)};

queue_t Q;
//Start producing count at 1 + (NUM_PRODUCING_THREADS << 1) because main generates 2 and 3 and reduce it by 1 every time a producer thread finishes
int producing = 1 + (NUM_PRODUCER_PAIRS << 1);

//Uses the binary algorithm for modular exponentiation (https://en.wikipedia.org/wiki/Exponentiation_by_squaring)
//It is a helper function for isPrime
uint64_t powmod(unsigned __int128 b, uint64_t e, uint64_t n){
    unsigned __int128 r = 1;
    b %= n;
    while(e){
        if(e&1){
            r = r*b%n;
        }
        e >>= 1;
        b = b*b%n;
    }
    return (uint64_t)r;
}

//uses deterministic Miller Rabin primality test
int isPrime(uint64_t n){
    uint64_t s, d;//s, d | 2^s*d = n - 1
    if(n%2 == 0){
        return n == 2;
    }
    --n;
    s = __builtin_ctzll(n);
    d = n>>s;
    ++n;
    for(uint64_t i = 0, a, x; i < DMR_PRIMES_C; ++i){
        a = DMR_PRIMES[i];
        if(a >= n){
            break;
        }
        x = powmod(a, d, n);
        if(x == 1 || x == n - 1){
            goto CONTINUE_WITNESSLOOP;
        }
        for(a = 0; a < s - 1; ++a){
            x = powmod(x, 2, n);
            if(x == 1){
                return 0;
            }
            if(x == n - 1){
                goto CONTINUE_WITNESSLOOP;
            }
        }
        return 0;
        CONTINUE_WITNESSLOOP:;
    }
    return 1;
}

void *produce(void *_moments){
    uint64_t *moments = _moments, n = *moments;//the output argument for the 0th moment serves as the input argument for the number to start checking for primes at
    *moments = 0;
    for(; n < MAX; n += 6*NUM_PRODUCER_PAIRS){//the producers are paired so one of every pair generates primes equal to 1 mod 6 and the other equal to 5 mod 6.  main generates 2 and 3 the only exceptions
        if(isPrime(n)){
            for(uint64_t m = 1, i = 0; i < NUM_MOMENTS; m *= n, ++i){
                moments[i] += m;
            }
            if(!queue_insert(&Q, &spread_integer_ft, SPREAD_EMPLACE(n))){
                fprintf(stderr, "\e[1;31mERROR: Could not insert into queue.\e[0m\n");
                exit(EXIT_FAILURE);
            }
        }
    }
    __atomic_fetch_sub(&producing, 1, __ATOMIC_SEQ_CST);//this thread is done generating primes; reduce producing counter by 1
    return moments;
}

void *consume(void *_moments){
    uint64_t *moments = _moments;
    while(__atomic_load_n(&producing, __ATOMIC_SEQ_CST) || __atomic_load_n(&Q.full, __ATOMIC_SEQ_CST)){//busy loop while some threads are producing
        spread_integer xyz;
        if(queue_remove(&Q, &spread_integer_ft, &xyz)){
            uint64_t n = xyz.x | xyz.y | xyz.z;
            if(isPrime(n)){
                for(uint64_t m = 1, i = 0; i < NUM_MOMENTS; m *= n, ++i){
                    moments[i] += m;
                }
            }else{
                fprintf(stderr, "\e[1;31mERROR: Generated a prime that fails deterministic Miller Rabin.\e[0m\n");
                exit(EXIT_FAILURE);
            }
        }
    }
    return moments;
}

int main(void){
    if(!queue_init(&Q, &spread_integer_ft, QUEUE_SIZE)){
        fprintf(stderr, "\e[1;31mERROR: Could not initialize queue.\e[0m\n");
        exit(EXIT_FAILURE);
    }
    pthread_t producers[NUM_PRODUCER_PAIRS << 1], consumers[NUM_CONSUMERS];
    uint64_t moments[(NUM_PRODUCER_PAIRS << 1) + 1 + NUM_CONSUMERS + 1][NUM_MOMENTS] = {};//the 2 extras are because main produces the primes 2 and 3 and consumes primes the consumers leave behind
    for(size_t i = 0; i < NUM_CONSUMERS; ++i){//create consumers first to increase likelihood of causing bugs
        if(pthread_create(consumers + i, NULL, consume, moments[(NUM_PRODUCER_PAIRS << 1) + 1 + i])){
            fprintf(stderr, "\e[1;31mERROR: Could not create consumer thread.\e[0m\n");
            exit(EXIT_FAILURE);
        }
    }
    for(size_t i = 0; i < NUM_PRODUCER_PAIRS; ++i){
        moments[i << 1][0] = 5 + 6*i;
        if(pthread_create(producers + (i << 1), NULL, produce, moments[i << 1])){
            fprintf(stderr, "\e[1;31mERROR: Could not create producer thread.\e[0m\n");
            exit(EXIT_FAILURE);
        }
        moments[(i << 1) + 1][0] = 7 + 6*i;
        if(pthread_create(producers + (i << 1) + 1, NULL, produce, moments[(i << 1) + 1])){
            fprintf(stderr, "\e[1;31mERROR: Could not create producer thread.\e[0m\n");
            exit(EXIT_FAILURE);
        }
    }
    for(uint64_t n = 2; n < 4; ++n){
        for(uint64_t m = 1, i = 0; i < NUM_MOMENTS; m *= n, ++i){
            moments[NUM_PRODUCER_PAIRS << 1][i] += m;
        }
        if(!queue_insert(&Q, &spread_integer_ft, SPREAD_EMPLACE(n))){
            fprintf(stderr, "\e[1;31mERROR: Could not insert into queue.\e[0m\n");
            exit(EXIT_FAILURE);
        }
    }
    __atomic_fetch_sub(&producing, 1, __ATOMIC_SEQ_CST);
    uint64_t c = 0;
    for(size_t i = 0; i < NUM_CONSUMERS; ++i){//join consumers first to bait bugs.  Note consumers should not finish until the producing counter reaches 0
        void *_c;
        if(pthread_join(consumers[i], &_c)){
            fprintf(stderr, "\e[1;31mERROR: Could not join consumer thread.\e[0m\n");
            exit(EXIT_FAILURE);
        }
        c += (uintptr_t)_c;
    }
    for(size_t i = 0; i < NUM_PRODUCER_PAIRS << 1; ++i){
        if(pthread_join(producers[i], NULL)){
            fprintf(stderr, "\e[1;31mERROR: Could not join producer thread.\e[0m\n");
            exit(EXIT_FAILURE);
        }
    }
    //this really should not be happening because the consumer threads only return after the producing counter reaches 0,
    //which only happens after all of the producer threads are done inserting items into the queue.
    if(Q.full){
        fprintf(stdout, "\e[1;31mWTF: Q.full != 0\nproducing == %d\e[0m\n", producing);
    }
    while(Q.full){
        spread_integer xyz;
        if(!queue_remove(&Q, &spread_integer_ft, &xyz)){
            fprintf(stderr, "\e[1;31mERROR: Could not remove from non empty queue.\e[0m\n");
            exit(EXIT_FAILURE);
        }
        uint64_t n = xyz.x | xyz.y | xyz.z;
        if(isPrime(n)){
            for(uint64_t m = 1, i = 0; i < NUM_MOMENTS; m *= n, ++i){
                moments[(NUM_PRODUCER_PAIRS << 1) + 1 + NUM_CONSUMERS][i] += m;
            }
        }else{
            fprintf(stderr, "\e[1;31mERROR: Generated a prime that fails deterministic Miller Rabin.\e[0m\n");
            exit(EXIT_FAILURE);
        }
    }
    queue_destroy(&Q, &spread_integer_ft);
    for(uint64_t i = 0, p, c, j; i < NUM_MOMENTS; ++i){
        for(j = p = 0; j < (NUM_PRODUCER_PAIRS << 1) + 1; ++j){
            p += moments[j][i];
        }
        for(c = 0; j < (NUM_PRODUCER_PAIRS << 1) + 1 + NUM_CONSUMERS + 1; ++j){
            c += moments[j][i];
        }
        printf("Moment %"PRIu64" %"PRIu64" -> %"PRIu64"\n", i, p, c);
    }
}

然后我用

编译

gcc -o test_queue_pc queue.c test_queue_pc.c -Wall -std=c99 -g -O0 -pthread -fuse-ld=gold -flto -lm

为什么消费者线程在队列为空之前返回，即使他们等待生产者完成，当他们只在producing上循环时，但在他们循环producing || Q.full时做正确的事情？

Answer 1

为什么消费者线程在队列为空之前返回，即使他们等待生成器完成，当他们只是在生成时循环，但是当他们循环生成时，做正确的事情。 Q.full？

因为没有更多的生产者意味着没有新的条目被添加到队列中; 不表示队列已经为空。

考虑生产者比消费者更快的情况。他们将他们的东西添加到队列中，然后退出。此时，队列中有项目，但活动生成器计数为零。如果消费者只检查是否有活跃的生产者，他们会错过队列中已有的物品。

重要的是要注意检查

if ((active producers) || (items in queue))

是C99中的正确答案。（||运算符在计算左侧后有一个序列点。也就是说，在左侧之前从不评估右侧。）

如果您只检查活动的生产者，则会错过生产者比消费者更快的情况，并在队列中仍有物品时退出。

如果您只检查队列中的项目，则会错过生产者仍在向队列添加内容的情况。

如果先检查队列是否为空，则打开一个竞赛窗口。在消费者检查队列是否为空之后，但在消费者检查是否存在活动生产者之前，生产者可以将一个或多个项目添加到队列并退出。

您需要先检查是否有活跃的生产者。如果存在活动生成器，并且队列现在为空，则消费者必须等待新项目到达队列（直到活动生成器计数降至零，或者新项目到达队列中。）如果没有活动生产者，消费者必须检查队列中是否有物品。没有活动的生成器意味着队列中不会出现新项目，但这并不意味着队列已经为空。

为什么我的消费者线程在我的生产者线程完成之前就停止了？

1 个答案: