Question

#ifndef THREADPOOL_H
#define THREADPOOL_H
#include <iostream>
#include <deque>
#include <functional>
#include <thread>
#include <condition_variable>
#include <mutex>
#include <atomic>
#include <vector>

//thread pool
class ThreadPool
{
public:
    ThreadPool(unsigned int n = std::thread::hardware_concurrency())
        : busy()
        , processed()
        , stop()
    {
        for (unsigned int i=0; i<n; ++i)
            workers.emplace_back(std::bind(&ThreadPool::thread_proc, this));
    }

    template<class F> void enqueue(F&& f)
    {
        std::unique_lock<std::mutex> lock(queue_mutex);
        tasks.emplace_back(std::forward<F>(f));
        cv_task.notify_one();
    }

    void waitFinished()
    {
        std::unique_lock<std::mutex> lock(queue_mutex);
        cv_finished.wait(lock, [this](){ return tasks.empty() && (busy == 0); });
    }

    ~ThreadPool()
    {
        // set stop-condition
        std::unique_lock<std::mutex> latch(queue_mutex);
        stop = true;
        cv_task.notify_all();
        latch.unlock();

        // all threads terminate, then we're done.
        for (auto& t : workers)
            t.join();
    }

    unsigned int getProcessed() const { return processed; }

private:
    std::vector< std::thread > workers;
    std::deque< std::function<void()> > tasks;
    std::mutex queue_mutex;
    std::condition_variable cv_task;
    std::condition_variable cv_finished;
    unsigned int busy;
    std::atomic_uint processed;
    bool stop;

    void thread_proc()
    {
        while (true)
        {
            std::unique_lock<std::mutex> latch(queue_mutex);
            cv_task.wait(latch, [this](){ return stop || !tasks.empty(); });
            if (!tasks.empty())
            {
                // got work. set busy.
                ++busy;

                // pull from queue
                auto fn = tasks.front();
                tasks.pop_front();

                // release lock. run async
                latch.unlock();

                // run function outside context
                fn();
                ++processed;

                latch.lock();
                --busy;
                cv_finished.notify_one();
            }
            else if (stop)
            {
                break;
            }
        }
    }
};
#endif // THREADPOOL_H

我有使用闩锁的上述线程池实现。但是，每次通过 enqueue 调用添加任务时，开销都很大，大约需要100微秒。

如何提高线程池的性能？

Answer 1

您的代码看起来不错。您在问题中上面关于使用发行版优化进行编译的评论可能是正确的，您需要做的所有事情。

免责声明：在尝试提高代码性能之前，请始终先使用适当的工具来测量代码，以识别瓶颈所在。否则，您可能无法获得想要的改进。

但是我看到的是几个潜在的微观优化。

在您的thread_proc函数中更改

    while (true)
    {
        std::unique_lock<std::mutex> latch(queue_mutex);
        cv_task.wait(latch, [this](){ return stop || !tasks.empty(); });
        if (!tasks.empty())

对此：

    std::unique_lock<std::mutex> latch(queue_mutex);
    while (!stop)
    {
        cv_task.wait(latch, [this](){ return stop || !tasks.empty(); });
        while (!tasks.empty() && !stop)

然后删除else if (stop)块和函数的结尾。

其主要影响是，它避免了queue_mutex在latch的每次迭代中超出范围而导致的while上的额外“解锁”和“锁定”环。将if (!tasks.empty())更改为while (!tasks.empty())可能还可以通过让当前执行的具有量子能力的线程保持锁定并尝试出队下一个工作项来节省一两个周期。

<意见> 最后一件事。我一直认为notify应该在锁之外。这样，当刚更新队列的线程唤醒另一个线程时，就没有锁争用。但是我从来没有实际测量过这个假设，所以要加一点盐：

template<class F> void enqueue(F&& f)
{
    queue_mutex.lock();
        tasks.emplace_back(std::forward<F>(f));
    queue_mutex.unlock();
    cv_task.notify_one();
}

唤醒线程非常耗时

1 个答案: