Question

这是我编写的并发队列，我打算在自己正在编写的线程池中使用它。我想知道是否还有可以改进性能的地方。如果你感兴趣，atomic_counter 的代码也贴在下方！

#ifndef NS_CONCURRENT_QUEUE_HPP_INCLUDED
#define NS_CONCURRENT_QUEUE_HPP_INCLUDED

#include <ns/atomic_counter.hpp>
#include <boost/noncopyable.hpp>
#include <boost/smart_ptr/detail/spinlock.hpp>
#include <cassert>
#include <cstddef>

namespace ns {
    /// Unbounded FIFO queue stored as a singly linked list of heap nodes.
    ///
    /// push(), try_pop() and clear() each construct a scoped_lock_type on
    /// m_mutex before touching the list. size() and empty() only read the
    /// atomic element counter and do not take the lock.
    ///
    /// @tparam T                 element type; copy-constructed into a node on
    ///                           push and copy-assigned out on try_pop
    /// @tparam mutex_type        lock type, boost::detail::spinlock by default
    /// @tparam scoped_lock_type  guard type constructed from the mutex,
    ///                           mutex_type::scoped_lock by default
    template<typename T,
             typename mutex_type = boost::detail::spinlock,
             typename scoped_lock_type = typename mutex_type::scoped_lock>
    class concurrent_queue : boost::noncopyable {
        /// One list cell: the stored value plus the link to the next cell.
        struct node {
            node * link;        ///< next node towards the back, 0 for the last node
            T const value;      ///< immutable copy of the pushed element
            explicit node(T const & source) : link(0), value(source) { }
        };
        node * m_front;             ///< oldest node, popped first
        node * m_back;              ///< newest node, where push appends
        atomic_counter m_counter;   ///< number of nodes currently linked
        mutex_type m_mutex;         ///< guards m_front, m_back and the links
    public:
        // types
        typedef T value_type;

        // construction
        /// Creates an empty queue. Every member is initialized explicitly so
        /// that m_back never holds an indeterminate pointer; m_mutex is
        /// value-initialized.
        concurrent_queue() : m_front(0), m_back(0), m_counter(0), m_mutex() { }
        /// Frees every node still in the queue by calling clear().
        ~concurrent_queue() { clear(); }

        // capacity
        /// Number of elements, read from the atomic counter without locking.
        std::size_t size() const { return m_counter; }
        /// True when the atomic counter reads zero; does not take the lock.
        bool empty() const { return (m_counter == 0); }

        // modifiers
        void push(T const & source);
        bool try_pop(T & destination);
        void clear();
    };

    /// Appends a copy of source to the back of the queue.
    ///
    /// The node is allocated before the lock object on m_mutex is created,
    /// so only the list splice and the counter increment run under it.
    ///
    /// @param source  value copied into the newly allocated node
    template<typename T, typename mutex_type, typename scoped_lock_type>
    void concurrent_queue<T, mutex_type, scoped_lock_type>::push(T const & source) {
        node * const fresh = new node(source);
        scoped_lock_type guard(m_mutex);
        if (!empty())
            m_back->link = fresh;   // extend the existing chain
        else
            m_front = fresh;        // first element is both front and back
        m_back = fresh;
        ++m_counter;
    }

    /// Removes the oldest element, if any, and copies it into destination.
    ///
    /// The node is unlinked inside the scope of the lock object on m_mutex;
    /// the copy into destination and the deallocation happen after that
    /// scope has ended.
    ///
    /// @param destination  receives the popped value by copy assignment
    /// @return true if an element was removed, false if the queue was empty
    /// @throws whatever T's copy assignment throws. The node is freed before
    ///         the exception propagates, but the element has already been
    ///         removed from the queue.
    template<typename T, typename mutex_type, typename scoped_lock_type>
    bool concurrent_queue<T, mutex_type, scoped_lock_type>::try_pop(T & destination) {
        node const * hold;
        {
            scoped_lock_type lock(m_mutex);
            if (empty())
                return false;
            hold = m_front;
            if (m_front == m_back)
                m_front = m_back = 0;       // popped the only node
            else
                m_front = m_front->link;
            --m_counter;
        }
        // The node is no longer reachable from the list, so this function is
        // its sole owner: release it even when the assignment throws.
        try {
            destination = hold->value;
        } catch (...) {
            delete hold;
            throw;
        }
        delete hold;
        return true;
    }

    /// Removes and frees every element in the queue.
    ///
    /// The whole chain is detached inside the scope of the lock object on
    /// m_mutex, and the nodes are deleted one by one after that scope ends.
    template<typename T, typename mutex_type, typename scoped_lock_type>
    void concurrent_queue<T, mutex_type, scoped_lock_type>::clear() {
        node * chain = 0;
        {
            scoped_lock_type guard(m_mutex);
            chain = m_front;
            m_front = 0;
            m_back = 0;
            m_counter = 0;
        }
        // An empty queue yields a null chain and the loop body never runs.
        while (chain != 0) {
            node * const doomed = chain;
            chain = chain->link;
            delete doomed;
        }
    }
}

#endif

atomic_counter.hpp

#ifndef NS_ATOMIC_COUNTER_HPP_INCLUDED
#define NS_ATOMIC_COUNTER_HPP_INCLUDED

#include <boost/interprocess/detail/atomic.hpp>
#include <boost/noncopyable.hpp>

namespace ns {
    /// 32-bit counter whose reads, writes, increments and decrements are
    /// forwarded to the boost::interprocess::detail atomic_*32 functions.
    class atomic_counter : boost::noncopyable {
        volatile boost::uint32_t m_count;

        /// Address of the counter in the pointer form the atomic_*32 calls
        /// take. The const_cast lets const members use it as well.
        volatile boost::uint32_t * cell() const {
            return const_cast<volatile boost::uint32_t *>(&m_count);
        }
    public:
        /// Starts the counter at value, or at zero when no argument is given.
        explicit atomic_counter(boost::uint32_t value = 0) : m_count(value) { }

        /// Current value, obtained through atomic_read32.
        operator boost::uint32_t() const {
            return boost::interprocess::detail::atomic_read32(cell());
        }

        /// Overwrites the counter through atomic_write32. Returns void, so
        /// assignments cannot be chained.
        void operator=(boost::uint32_t value) {
            boost::interprocess::detail::atomic_write32(cell(), value);
        }

        /// Prefix increment through atomic_inc32; yields no value.
        void operator++() {
            boost::interprocess::detail::atomic_inc32(cell());
        }

        /// Prefix decrement through atomic_dec32; yields no value.
        void operator--() {
            boost::interprocess::detail::atomic_dec32(cell());
        }
    };
}

#endif

Answer 1

我认为在这种情况下，由于要为每个新节点调用new，链表会带来性能问题。这不仅仅是因为调用动态内存分配器本身很慢，更是因为频繁调用它会引入大量的并发开销：在多线程环境中，自由存储区（free store）必须保持一致。

我会使用一个vector，当它太小而无法容纳队列时就对其扩容，并且永远不会把它缩小。

我会维护front和back两个索引，使这个vector成为一个环形缓冲区。这要求在扩容时移动元素，但这应该是相当罕见的事件，并且可以通过在构造时给出建议的vector大小在一定程度上缓解。

或者，您可以保留链表结构，但永远不销毁节点，只需将其放入一个空闲节点队列即可。不幸的是，空闲节点队列本身也需要加锁才能正确管理，而且我不确定这样是否真的比一直调用delete和new更好。

使用vector还可以获得更好的引用局部性。但我不确定这会如何与必须在CPU之间来回传递的缓存行相互作用。

其他人建议使用::std::deque，我认为这不是一个坏主意，但我猜想基于vector的环形缓冲区是更好的选择。

Answer 2

Herb Sutter提出了一个无锁队列的实现，肯定会超越你的队列：）

主要思想是使用环形缓冲区，在队列运行期间完全避免动态内存分配。这意味着队列可能会满（因此放入元素时可能需要等待），这在您的场景中可能是不可接受的。

正如Omnifarious指出的那样，最好不要使用链表（出于缓存局部性的考虑），除非你用内存池来分配节点。我会尝试使用std::deque作为后端：它对内存更友好，并且只要你只在前端和后端进行弹出和推入（队列通常正是这种情况），就保证不会发生任何重新分配。

批评我的并发队列

2 个答案: