Question

我试图实现一个使用线性循环缓冲区来存储数据的无锁队列。与通用无锁队列相比，我有以下放松条件：

我知道队列中最坏情况下需要存储的元素数量。队列是在固定元素集上运行的系统的一部分。代码永远不会尝试在队列中存储超过此固定集合元素总数的元素。
不存在多生产者/多消费者的情形。该队列只会用于多生产者/单消费者或单生产者/多消费者的场景。

从概念上讲，队列实现如下

标准二次幂环形缓冲区。底层数据结构是使用power-of-two trick的标准环形缓冲区。读取和写入索引只会递增。当使用简单的位掩码索引到数组时，它们被钳制到底层数组的大小。读指针在pop()中以原子方式递增，写指针在push()中以原子方式递增。
大小变量控制对pop()的访问。另外有一个“大小”变量跟踪队列中的元素数量，这样就不需要对读写索引做算术运算。大小变量在整个写入操作完成之后（即数据已写入后备存储且写入游标已递增之后）才以原子方式递增。我使用compare-and-swap (CAS)操作在pop()中以原子方式递减大小，并且仅在大小非零时才继续。这样，pop()应当保证返回有效数据。

我的队列实现如下。请注意其中的调试代码：只要pop()尝试读取此前未被push()写入过的内存，它就会中止执行。这应该永远不会发生，因为——至少在概念上——pop()只有在队列中有元素时才会继续（不应该出现下溢）。

#include <atomic>
#include <cstdint>
#include <csignal> // XXX for debugging

/**
 * Fixed-capacity queue backed by a power-of-two ring buffer of atomics.
 *
 * The read and write cursors only ever grow; they are reduced to a slot
 * index with a bit mask. A separate atomic element counter decides whether
 * pop() may proceed, so no arithmetic on the two cursors is needed.
 *
 * Every individual access to shared state is an atomic operation, but
 * push() and pop() each consist of several such operations in sequence.
 *
 * The lines marked XXX are debugging aids: the buffer is over-allocated and
 * pre-filled with a marker value so that pop() can detect a read from a
 * slot that still holds the marker.
 */
template <typename T>
class Queue {
public:
    /** Minimal replacement for std::optional: a value plus a validity flag. */
    struct Optional {
        bool good; // True if `value` holds an element taken from the queue
        T value;   // Left default-initialized when `good` is false

        Optional() : good(false) {}
        Optional(T value) : good(true), value(std::move(value)) {}

        explicit operator bool() const { return good; }
    };

    /**
     * Allocates the ring buffer.
     *
     * @param max_size number of elements the caller intends to store; the
     *                 actual allocation is rounded up to a power of two.
     */
    Queue(uint32_t max_size)
        : // XXX Allocate 1 Mi additional slots for debugging purposes
          m_data_size(upper_power_of_two(1024 * 1024 + max_size)),
          m_data(new std::atomic<T>[m_data_size]),
          m_mask(m_data_size - 1),
          m_rd_ptr(0),
          m_wr_ptr(0),
          m_size(0) {
        // XXX Debug code begin
        // Stamp every slot with a marker so reads of unwritten slots show up
        std::atomic<T> *const end = m_data + m_data_size;
        for (std::atomic<T> *slot = m_data; slot != end; ++slot) {
            slot->store(0xDEADBEAF);
        }
        // XXX Debug code end
    }

    ~Queue() { delete[] m_data; }

    /**
     * Tries to take one element out of the queue.
     *
     * @return an Optional holding the element, or an empty Optional if the
     *         element counter was observed to be zero.
     */
    Optional pop() {
        // Reserve one element by decrementing the counter, unless it is zero.
        // A failed compare-exchange refreshes `observed` with the current
        // counter value, so the zero check is repeated on every retry.
        uint32_t observed = m_size.load();
        for (;;) {
            if (observed == 0) {
                return Optional(); // Nothing to pop
            }
            if (m_size.compare_exchange_weak(observed, observed - 1)) {
                break;
            }
        }

        // Claim the next read position, then fetch what is stored there
        const uint32_t slot = m_rd_ptr.fetch_add(1) & m_mask;
        T res = m_data[slot].load();

        // XXX Debug code begin
        if (res == T(0xDEADBEAF)) {
            std::raise(SIGTRAP); // Slot still contains the fill marker
        }
        // XXX Debug code end
        return res;
    }

    /**
     * Appends one element: claims the next write position, stores the
     * element there, and only afterwards increments the element counter.
     *
     * @param t element to store
     */
    void push(T t) {
        const uint32_t slot = m_wr_ptr.fetch_add(1) & m_mask;
        m_data[slot].store(t);
        m_size.fetch_add(1);
    }

    /** @return true if the element counter currently reads zero. */
    bool empty() const { return m_size.load() == 0; }

private:
    /**
     * Rounds v up to the next power of two (bit-smearing trick from
     * https://graphics.stanford.edu/~seander/bithacks.html).
     */
    static uint32_t upper_power_of_two(uint32_t v) {
        --v;
        // Copy the highest set bit into every lower bit position
        for (uint32_t shift = 1; shift <= 16; shift <<= 1) {
            v |= v >> shift;
        }
        return v + 1;
    }

    uint32_t m_data_size;           // Number of slots allocated
    std::atomic<T> *m_data;         // Slot storage; length is a power of two
    uint32_t m_mask;                // Maps a cursor value to a slot index
    std::atomic<uint32_t> m_rd_ptr; // Read cursor, only ever incremented
    std::atomic<uint32_t> m_wr_ptr; // Write cursor, only ever incremented
    std::atomic<uint32_t> m_size;   // Number of elements in the queue
};

然而，下溢确实会发生，并且可以在多线程压力测试中轻松触发。在此测试中，我维护了两个队列q1和q2。主线程将固定数量的元素放入q1。两个工作线程在紧密循环中从q1读取并推送到q2。主线程从q2读取数据并将其送回q1。

如果只有一个工作线程（单生产者/单消费者），或者所有工作线程都与主线程在同一个CPU上，这样就可以正常工作。但是，只要有两个工作线程被显式调度到与主线程不同的CPU上，它就会失败。

以下代码实现了此测试

#include <pthread.h>
#include <thread>
#include <vector>

/**
 * Worker thread body for the stress test.
 *
 * Makes a fixed number of pop attempts on queue_rd and forwards every
 * element it obtains to queue_wr. Attempts that find nothing still count
 * towards the total.
 *
 * @param done_count incremented once, after the last attempt
 * @param queue_rd   queue to take elements from
 * @param queue_wr   queue to forward elements to
 */
static void queue_stress_test_main(std::atomic<uint32_t> &done_count,
                                   Queue<int> &queue_rd, Queue<int> &queue_wr) {
    constexpr size_t attempts = 1UL << 24;

    for (size_t attempt = 0; attempt != attempts; ++attempt) {
        auto item = queue_rd.pop();
        if (!item) {
            continue; // Nothing available this time
        }
        queue_wr.push(item.value);
    }

    // Tell the main thread that this worker is finished
    done_count.fetch_add(1);
}

/**
 * Restricts a thread to a single CPU.
 *
 * @param thread handle of the thread to pin
 * @param cpu    index of the only CPU the thread may run on
 * @throws const char* if pthread_setaffinity_np() reports an error
 */
static void set_thread_affinity(pthread_t thread, int cpu) {
    // Build a mask in which only the requested CPU is set
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);

    const int rc = pthread_setaffinity_np(thread, sizeof(mask), &mask);
    if (rc == 0) {
        return;
    }
    throw "Error while calling pthread_setaffinity_np";
}

/**
 * Stress test driver.
 *
 * Starts the worker threads, which move elements from queue1 to queue2,
 * while this thread seeds queue1 and keeps moving elements from queue2 back
 * into queue1 until all workers are done and queue2 is empty.
 */
int main() {
    static constexpr uint32_t n_threads{2U}; // Number of worker threads
    //static constexpr uint32_t n_threads{1U}; // < Works fine
    static constexpr uint32_t max_size{16U}; // Elements in the queue
    std::atomic<uint32_t> done_count{0};     // Number of finished threads
    Queue<int> queue1(max_size);
    Queue<int> queue2(max_size);

    // Start the workers and pin each of them to CPU 0; this thread is
    // pinned to CPU 1 below, so it never shares a CPU with a worker.
    std::vector<std::thread> threads;
    while (threads.size() < n_threads) {
        threads.emplace_back(queue_stress_test_main, std::ref(done_count),
                             std::ref(queue1), std::ref(queue2));
        set_thread_affinity(threads.back().native_handle(), 0);
    }
    set_thread_affinity(pthread_self(), 1);
    //set_thread_affinity(pthread_self(), 0); // < Works fine

    // Pump data from queue2 into queue1
    uint32_t seeded = 0; // How many initial values have been pushed so far
    for (;;) {
        // Stop once every worker has finished and queue2 has been drained
        if (done_count >= n_threads && queue2.empty()) {
            break;
        }

        // Initially fill queue1 with all values from 0..max_size-1,
        // one value per loop iteration
        if (seeded < max_size) {
            queue1.push(seeded);
            ++seeded;
        }

        // Move one element (if any) from queue2 back into queue1
        auto recycled = queue2.pop();
        if (recycled) {
            queue1.push(recycled.value);
        }
    }

    // Wait for all threads to finish
    for (std::thread &worker : threads) {
        worker.join();
    }
}

此程序在大多数情况下都会触发队列代码中的陷阱，这意味着pop()尝试读取push()从未写入过的内存——尽管pop()只有在push()的调用次数至少与pop()一样多时才应该成功。

您可以在Linux上使用GCC/clang通过以下命令编译并运行上述程序：

c++ -std=c++11 queue.cpp -o queue -lpthread && ./queue

要么连接上面两个代码块，要么下载完整的程序here。

请注意，在无锁数据结构方面，我是一个完整的新手。我完全清楚C ++有很多经过实战考验的无锁队列实现。但是，我只是无法弄清楚为什么上面的代码无法正常工作。

Answer 1

你有两个错误，其中一个可能导致你观察到的失败。

让我们看看你的push()代码，只不过改写成每条语句只执行一个操作：

// Queue::push() written out with exactly one operation per statement.
// Each statement carries a numbered marker (1-5) so that interleavings of
// several threads can be described step by step.
void push(T t)
{
    // Atomically take the current write cursor value and advance the cursor.
    auto const claimed_index = m_wr_ptr++;               /* 1 */
    // Reduce the cursor value to an index into the ring buffer.
    auto const claimed_offset = claimed_index & m_mask; /* 2 */
    // Locate the slot that belongs to the claimed index.
    auto& claimed_data = m_data[claimed_offset];         /* 3 */
    // Write the element into the slot.
    claimed_data.store(t);                               /* 4 */
    // Increment the element counter.
    m_size++;                                            /* 5 */
}

现在，对于具有两个生产者的队列，在操作1和4之间存在一个易受竞争条件影响的窗口：

之前：

m_rd_ptr == 1
m_wr_ptr == 1
m_size == 0

生产者A：

/* 1 */ claimed_index = 1; m_wr_ptr = 2;
/* 2 */ claimed_offset = 1;

调度器在此处让生产者A进入休眠

生产者B：

/* 1 */ claimed_index = 2; m_wr_ptr = 3;
/* 2 */ claimed_offset = 2;
/* 3 */ claimed_data = m_data[2];
/* 4 */ claimed_data.store(t);
/* 5 */ m_size = 1;

之后：

m_size == 1
m_rd_ptr == 1
m_wr_ptr == 3
m_data[1] == 0xDEADBEAF
m_data[2] == value_produced_by_B

消费者现在开始运行，看到m_size > 0，于是从m_data[1]读取，同时将m_rd_ptr从1增加到2。但m_data[1]尚未被生产者A写入，而生产者B写入的是m_data[2]。

第二个错误是pop()中的互补情形：消费者线程在m_rd_ptr++操作和.load()调用之间被中断。这可能导致值被乱序读取，甚至可能滞后到队列已经完整绕了一圈并覆盖了原始值。

单条源语句中的两个操作各自是原子的，并不意味着整条语句是原子的。

C ++无锁队列实现中的虚假下溢

1 个答案: