QtConcurrent :: map显示没有任何好处

时间:2017-05-24 14:13:40

标签: c++ multithreading qt qtconcurrent

我想使用QVector函数操纵QtConcurrent::map。我的所有示例程序都是将QVector中的所有值增加1。

QVector<double> arr(10000000, 0);
QElapsedTimer timer;
qDebug() << QThreadPool::globalInstance()->maxThreadCount() << "Threads";

int end;
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
timer.start();
for(int i = 0; i < 100; ++i) {
    std::transform(arr.begin(), arr.end(), arr.begin(), [](double x){ return ++x; });
}
end = timer.elapsed();
qDebug() << end;
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
timer.start();
for(int i = 0; i < 100; ++i) {
    std::for_each(arr.begin(), arr.end(), [](double &x){ ++x; });
}
end = timer.elapsed();
qDebug() << end;
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
timer.start();
for(int i = 0; i < 100; ++i) {
    QFuture<void> qf = QtConcurrent::map(arr.begin(), arr.end(), [](double &x){ ++x; });
    qf.waitForFinished();
}
end = timer.elapsed();
qDebug() << end;

然而程序输出

4 Threads
905 // std::transform
886 // std::for_each
876 // QtConcurrent::map

因此多线程版本几乎没有速度优势。我确认实际上有4个线程正在运行。我用-O2优化。更常见的QThreadPool方法更适合这种情况吗?

修改

我使用QtConcurrent::run()尝试了一种不同的方法。以下是程序代码的相关部分:

void add1(QVector<double>::iterator first, QVector<double>::iterator last) {
    for(; first != last; ++first) {
        *first += 1;
    }
}

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
std::for_each(arr.begin(), arr.end(), [](double &x){ ++x; });
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
QFuture<void> qf[numThreads];
for(int j = 0; j < numThreads; ++j) {
    qf[j] = QtConcurrent::run(add1, arr.begin()+j*n/numThreads, arr.begin()+(j+1)*n/numThreads-1);
}
for(int j = 0; j < numThreads; ++j) {
    qf[j].waitForFinished();
}

所以我手动将任务分配到不同的线程上。但我仍然没有得到性能提升:

181 ms // std::for_each
163 ms // QtConcurrent::run

这里还有什么问题?

1 个答案:

答案 0 :(得分:4)

与简单地使用C ++线程原语和自己动手的线程池相比,QtConcurrent似乎有很高的开销。

let server = app.listen(process.env.PORT) 
var io = require('socket.io').listen(server) 
global.io = io

Live example

这会产生:

template<class T>
struct threaded_queue {
  using lock = std::unique_lock<std::mutex>;
  void push_back( T t ) {
    {
      lock l(m);
      data.push_back(std::move(t));
    }
    cv.notify_one();
  }
  boost::optional<T> pop_front() {
    lock l(m);
    cv.wait(l, [this]{ return abort || !data.empty(); } );
    if (abort) return {};
    auto r = std::move(data.back());
    data.pop_back();
    return std::move(r);
  }
  void terminate() {
    {
      lock l(m);
      abort = true;
      data.clear();
    }
    cv.notify_all();
  }
  ~threaded_queue()
  {
    terminate();
  }
private:
  std::mutex m;
  std::deque<T> data;
  std::condition_variable cv;
  bool abort = false;
};
struct thread_pool {
  thread_pool( std::size_t n = 1 ) { start_thread(n); }
  thread_pool( thread_pool&& ) = delete;
  thread_pool& operator=( thread_pool&& ) = delete;
  ~thread_pool() = default; // or `{ terminate(); }` if you want to abandon some tasks
  template<class F, class R=std::result_of_t<F&()>>
  std::future<R> queue_task( F task ) {
    std::packaged_task<R()> p(std::move(task));
    auto r = p.get_future();
    tasks.push_back( std::move(p) );
    return r;
  }
  template<class F, class R=std::result_of_t<F&()>>
  std::future<R> run_task( F task ) {
    if (threads_active() >= total_threads()) {
      start_thread();
    }
    return queue_task( std::move(task) );
  }
  void terminate() {
    tasks.terminate();
  }
  std::size_t threads_active() const {
    return active;
  }
  std::size_t total_threads() const {
    return threads.size();
  }
  void clear_threads() {
    terminate();
    threads.clear();
  }
  void start_thread( std::size_t n = 1 ) {
    while(n-->0) {
      threads.push_back(
        std::async( std::launch::async,
          [this]{
            while(auto task = tasks.pop_front()) {
              ++active;
              try{
                (*task)();
              } catch(...) {
                --active;
                throw;
              }
              --active;
            }
          }
        )
      );
    }
  }
private:
  std::vector<std::future<void>> threads;
  threaded_queue<std::packaged_task<void()>> tasks;
  std::atomic<std::size_t> active = {};
};

struct my_timer_t {
    std::chrono::high_resolution_clock::time_point first;
    std::chrono::high_resolution_clock::duration duration;

    void start() {
        first = std::chrono::high_resolution_clock::now();
    }
    std::chrono::high_resolution_clock::duration finish() {
        return duration = std::chrono::high_resolution_clock::now()-first;
    }
    unsigned long long ms() const {
        return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
    }
};
int main() {
    std::vector<double> arr(1000000, 0);
    my_timer_t timer;

    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
    timer.start();
    for(int i = 0; i < 100; ++i) {
        std::transform(arr.begin(), arr.end(), arr.begin(), [](double x){ return ++x; });
    }
    timer.finish();
    auto time_transform = timer.ms();
    std::cout << time_transform << "<- std::transform (" << arr[rand()%arr.size()] << ")\n";
    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
    timer.start();
    for(int i = 0; i < 100; ++i) {
        std::for_each(arr.begin(), arr.end(), [](double &x){ ++x; });
    }
    timer.finish();
    auto time_for_each = timer.ms();
    std::cout << time_for_each << "<- std::for_each (" << arr[rand()%arr.size()] << ")\n";
    /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
    enum { num_threads = 8 };
    thread_pool pool(num_threads);
    timer.start();
    for(int i = 0; i < 100; ++i) {
        std::array< std::future<void>, num_threads > tasks;
        for (int t = 0; t < num_threads; ++t) {
            tasks[t] = pool.run_task([&,t]{
                std::for_each( arr.begin()+(arr.size()/num_threads)*t, arr.begin()+(arr.size()/num_threads)*(t+1), [](double& x){++x;} );
            });
        }
        // std::cout << "loop! -- " << pool.threads_active() << "/" << pool.total_threads() << std::endl;
        for (int t = 0; t < num_threads; ++t)
            tasks[t].wait();
    }
    timer.finish();
    auto time_pool = timer.ms();
    std::cout << time_pool << "<- thread_pool (" << arr[rand()%arr.size()] << ")\n";
}

使用简单的C ++ 11线程池以8种方式分割任务时的显着加速。 (以4种方式分割任务时大约是105)。

现在我确实使用了比你的小10倍的测试集,因为当我的程序花费很长时间运行时,在线系统会超时。

与您的线程池系统进行通信会有开销,但我的天真线程池不应该超出这样的真实库。

现在,一个严重的问题是你可能是内存IO限制;如果你们都必须等待字节,那么更多线程更快地访问字节将无济于事。