I'm trying to implement a call-by-future mechanism in C++. This is only test code (written somewhat hastily), but I intend to use something similar for transparent parallelism in a language runtime I'm working on.
I've stripped the code down to make it smaller, though it is still fairly large:
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <vector>
#include <queue>
#include <future>
#include <thread>
#include <functional>
#include <type_traits>
#include <utility>
using namespace std;
using namespace std::chrono;
//------------------------------------------------------------------------------
// Simple locked printer
static std::recursive_mutex print_lock;
inline void print_() {
return;
};
template<typename T, typename... Args>
inline void print_(T t, Args... args) {
print_lock.lock();
std::cout << t;
print_(args...);
print_lock.unlock();
};
//------------------------------------------------------------------------------
template<typename R>
class PooledTask {
public:
explicit PooledTask(function<R()>);
// Possibly execute the task and return the value
R &operator () () {
// If we can get the lock, we're not executing
if(lock.try_lock()) {
// We may already have executed it
if(done)
goto end;
// Otherwise, execute it now
try {
result = move(task());
} catch(...) {
// If an exception is thrown, save it for later
eptr = current_exception();
failed = true;
};
done = true;
goto end;
} else {
// Wait until the task is completed
lock.lock();
end: {
lock.unlock();
// Maybe we got an exception!
if(failed)
rethrow_exception(eptr);
// Otherwise, just return the result
return result;
};
};
};
private:
exception_ptr eptr;
function<R()> task;
bool done;
bool failed;
mutex lock;
R result;
};
extern class TaskPool pool;
class TaskPool {
public:
TaskPool() noexcept: TaskPool(thread::hardware_concurrency() - 1) {
return;
};
TaskPool(const TaskPool &) = delete;
TaskPool(TaskPool &&) = delete;
template<typename T>
void push(PooledTask<T> *task) noexcept {
lock_guard<mutex> guard(lock);
builders.push([=] {
try {
(*task)();
} catch(...) {
// Ignore it here! The task will save it. :)
};
});
};
~TaskPool() {
// TODO: wait for all tasks to finish...
};
private:
queue<thread *> threads;
queue<function<void()>> builders;
mutex lock;
TaskPool(signed N) noexcept {
while(N --> 0)
threads.push(new thread([this, N] {
for(;;) {
pop_task();
};
}));
};
void pop_task() noexcept {
lock.lock();
if(builders.size()) {
auto task = builders.front();
builders.pop();
lock.unlock();
task();
} else
lock.unlock();
};
} pool;
template<typename R>
PooledTask<R>::PooledTask(function<R()> fun):
task(fun),
done(false),
failed(false)
{
pool.push(this);
};
// Should probably return a std::shared_ptr here...
template<typename F, typename... Args>
auto byfuture(F fun, Args&&... args) noexcept ->
PooledTask<decltype(fun(args...))> *
{
using R = decltype(fun(args...));
auto pooled = new PooledTask<R> {
bind(fun, forward<Args>(args)...)
};
return pooled;
};
//------------------------------------------------------------------------------
#include <map>
// Get the current thread id as a simple number
static int myid() noexcept {
static unsigned N = 0;
static map<thread::id, unsigned> hash;
static mutex lock;
lock_guard<mutex> guard(lock);
auto current = this_thread::get_id();
if(!hash[current])
hash[current] = ++N;
return hash[current];
};
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// The fibonacci test implementation
int future_fib(int x, int parent) {
if(x < 3)
return 1;
print_("future_fib(", x, ")", " on thread ", myid(), \
", asked by thread ", parent, "\n");
auto f1 = byfuture(future_fib, x - 1, myid());
auto f2 = byfuture(future_fib, x - 2, myid());
auto res = (*f1)() + (*f2)();
delete f1;
delete f2;
return res;
};
//------------------------------------------------------------------------------
int main() {
// Force main thread to get id 1
myid();
// Get task
auto f = byfuture(future_fib, 8, myid());
// Make sure it starts on the task pool
this_thread::sleep_for(seconds(1));
// Blocks
(*f)();
// Simply wait to be sure all threads are clean
this_thread::sleep_for(seconds(2));
//
return EXIT_SUCCESS;
};
The output of this program looks like this (I have a quad-core, so there are 3 threads in the pool):
future_fib(8) on thread 2, asked by thread 1
future_fib(7) on thread 3, asked by thread 2
future_fib(6) on thread 4, asked by thread 2
future_fib(6) on thread 3, asked by thread 3
future_fib(5) on thread 4, asked by thread 4
future_fib(5) on thread 3, asked by thread 3
future_fib(4) on thread 4, asked by thread 4
future_fib(4) on thread 3, asked by thread 3
future_fib(3) on thread 4, asked by thread 4
future_fib(3) on thread 3, asked by thread 3
future_fib(3) on thread 4, asked by thread 4
future_fib(3) on thread 3, asked by thread 3
future_fib(4) on thread 4, asked by thread 4
future_fib(4) on thread 3, asked by thread 3
future_fib(3) on thread 4, asked by thread 4
future_fib(3) on thread 3, asked by thread 3
future_fib(5) on thread 3, asked by thread 3
future_fib(4) on thread 3, asked by thread 3
future_fib(3) on thread 3, asked by thread 3
future_fib(3) on thread 3, asked by thread 3
Compared with a plain fibonacci function, this implementation is extremely slow.

So here is the problem: when the pool runs fib(8), it creates two tasks that will run on the next available threads, but when it reaches `auto res = (*f1)() + (*f2)();`, both tasks are already running, so it blocks on f1 (which is running on thread 3).

What I need in order to gain speed here is for thread 2, instead of blocking on f1, to take over whatever thread 3 is doing, leaving thread 3 ready to accept another task, so that no thread sits sleeping while there is computation to be done.

This post, http://bartoszmilewski.com/2011/10/10/async-tasks-in-c11-not-quite-there-yet/, suggests that what I want to do is needed, but it doesn't go into specifics.

My questions are: how could I do that? Is there some alternative way to achieve what I want?
Answer 0 (score: 1)
I think you may have a chance with the resumable functions currently proposed for C++ standardization. The proposal has not been approved yet, but the Visual Studio 15 CTP implements it, so you could try building a prototype (if you can use the MSVC compiler).

Gor Nishanov (one of the authors of the latest proposal paper) describes a very similar example, computing fibonacci with a "parent-stealing scheduler", starting at 23:47 in his CppCon talk: https://www.youtube.com/watch?v=KUhSjfSbINE

Note, however, that I could not find any source/sample of the `spawnable<T>` implementation, so you may need to contact the proposal authors for details.
Answer 1 (score: 0)
Look at how your code is full of things that take longer than computing fib(8) itself.

For example, the switch into kernel space to look up a thread id probably takes longer, on most versions of Windows, than the actual work being done here.

Parallelization is not about having a bunch of threads competing over shared memory. That is the biggest mistake you can make.

When you parallelize a task, you break the output into discrete chunks, so that the parallel threads each write to their own memory, avoiding the memory and cache contention that bogs an application down.

When you have 3 threads touching 3 separate memory locations, there is no need for a lock or any other synchronization primitive (which, on most Windows versions, also requires a kernel-mode switch).

So the only thing you really need to know is when the threads have all finished. That can be done with one of the many interlocked-exchange methods, or with OS-driven event handles.
If you want to be a serious developer, remove the thread ids, remove the locking code, and start thinking about how to solve the problem without them.

Think of 2 cars on a 2-lane highway. One moves faster than the other, and you never know which car is ahead of the other. Ask yourself: is there any arrangement of these cars in the 2 lanes that requires knowing which one is ahead or which is faster? You should conclude that as long as each car stays in its own lane, there is never a problem. That is parallelization at its simplest.

Now consider that you might spawn these jobs on different machines on different continents. Would it still make sense to try to exchange information about threads and memory? No, it would not. You simply break the problem into discrete functional chunks that have absolutely nothing to do with each other, forget about over-control, and let the magic of the information age happen.

I hope this helps.