Question

我目前正在进行科学模拟（引力体）。我首先使用朴素的单线程算法编写了该代码，并且该算法对于少量粒子可以令人满意地执行。然后，我对该算法进行了多线程处理（令人尴尬地是并行的），该程序花了大约3倍的时间。以下是具有相似属性并输出到/ tmp中的文件的琐碎算法的最小，完整，可验证的示例（该示例旨在在Linux上运行，但C ++也是标准的）。警告，如果您决定运行此代码，它将生成152.62MB的文件。输出数据是为了防止编译器从程序中优化计算。

#include <iostream>
#include <functional>
#include <thread>
#include <vector>
#include <atomic>
#include <random>
#include <fstream>
#include <chrono>

constexpr unsigned ITERATION_COUNT = 2000;
constexpr unsigned NUMBER_COUNT = 10000;

void runThreaded(unsigned count, unsigned batchSize, std::function<void(unsigned)> callback){
    unsigned threadCount = std::thread::hardware_concurrency();
    std::vector<std::thread> threads;
    threads.reserve(threadCount);

    std::atomic<unsigned> currentIndex(0);

    for(unsigned i=0;i<threadCount;++i){
        threads.emplace_back([&currentIndex, batchSize, count, callback]{
            unsigned startAt = currentIndex.fetch_add(batchSize);

            if(startAt >= count){
                return;
            }else{
                for(unsigned i=0;i<count;++i){
                    unsigned index = startAt+i;
                    if(index >= count){
                        return;
                    }
                    callback(index);
                }
            }
        });
    }

    for(std::thread &thread : threads){
        thread.join();
    }
}

void threadedTest(){
    std::mt19937_64 rnd(0);
    std::vector<double> numbers;

    numbers.reserve(NUMBER_COUNT);
    for(unsigned i=0;i<NUMBER_COUNT;++i){
        numbers.push_back(rnd());
    }

    std::vector<double> newNumbers = numbers;

    std::ofstream fout("/tmp/test-data.bin");

    for(unsigned i=0;i<ITERATION_COUNT;++i) {
        std::cout << "Iteration: " << i << "/" << ITERATION_COUNT << std::endl;
        runThreaded(NUMBER_COUNT, 100, [&numbers, &newNumbers](unsigned x){
            double total = 0;
            for(unsigned y=0;y<NUMBER_COUNT;++y){
                total += numbers[y]*(y-x)*(y-x);
            }
            newNumbers[x] = total;
        });
        fout.write(reinterpret_cast<char*>(newNumbers.data()), newNumbers.size()*sizeof(double));
        std::swap(numbers, newNumbers);
    }
}

void unThreadedTest(){
    std::mt19937_64 rnd(0);
    std::vector<double> numbers;

    numbers.reserve(NUMBER_COUNT);
    for(unsigned i=0;i<NUMBER_COUNT;++i){
        numbers.push_back(rnd());
    }

    std::vector<double> newNumbers = numbers;

    std::ofstream fout("/tmp/test-data.bin");

    for(unsigned i=0;i<ITERATION_COUNT;++i){
        std::cout << "Iteration: " << i << "/" << ITERATION_COUNT << std::endl;
        for(unsigned x=0;x<NUMBER_COUNT;++x){
            double total = 0;
            for(unsigned y=0;y<NUMBER_COUNT;++y){
                total += numbers[y]*(y-x)*(y-x);
            }
            newNumbers[x] = total;
        }
        fout.write(reinterpret_cast<char*>(newNumbers.data()), newNumbers.size()*sizeof(double));
        std::swap(numbers, newNumbers);
    }
}

int main(int argc, char *argv[]) {
    if(argv[1][0] == 't'){
        threadedTest();
    }else{
        unThreadedTest();
    }
    return 0;
}

运行此命令（在Linux上与clang 7.0.1编译）时，从Linux time命令获得以下信息。这些之间的区别类似于我在真实程序中看到的。标记为“真实”的条目与此问题相关，因为这是程序运行所需的时钟时间。

单线程：

real    6m27.261s
user    6m27.081s
sys     0m0.051s

多线程：

real    14m32.856s
user    216m58.063s
sys     0m4.492s

正因为如此，我想知道是什么导致了这种巨大的速度下降，我希望它会显着加快速度（大约有8倍，因为我有8核16线程CPU）。我不会在GPU上实现此功能，因为下一步是对算法进行一些更改，以将其从O（n²）更改为O（nlogn），但这对GPU也不友好。与所包含的示例相比，更改后的算法与我当前实现的O（n²）算法的差异较小。最后，我想观察一下，在线程和非线程运行中，运行每个迭代的主观时间（由出现的迭代行之间的时间来判断）会显着变化。

Answer 1

很难遵循此代码，但是我认为您正在大规模复制工作，因为每个线程几乎完成了所有工作，只是在开始时就跳过了一小部分。

我假设runThreaded的内部循环应该是：

unsigned startAt = currentIndex.fetch_add(batchSize);

while (startAt < count) {
  if (startAt >= count) {
    return;
  } else {
    for(unsigned i=0;i<batchSize;++i){
      unsigned index = startAt+i;

      if(index >= count){
        return;
      }

      callback(index);
    }
  }

  startAt = currentIndex.fetch_add(batchSize);
}

i < batchSize是此处的关键。您应该只执行批次指示的工作，而不是count次，这是整个列表减去初始偏移量的时间。

此更新使代码运行明显更快。我不确定它是否能完成所有必需的工作，因为很难确定这是否真的在发生，输出非常小。

Answer 2

为便于在多个CPU上并行化，我建议使用tbb::parallel_for。它使用正确数量的CPU并为您分配范围，完全消除了错误实现的风险。或者，有一个parallel for_each in C++17。换句话说，这个问题有很多好的解决方案。

向量化代码是一个难题，clang++-6或g++-8都不会自动向量化基线代码。因此，下面的SIMD版本使用了出色的Vc: portable, zero-overhead C++ types for explicitly data-parallel programming库。

下面是一个可比较的工作基准：

基准版本。
SIMD版本。
SIMD +多线程版本。

#include <Vc/Vc>
#include <tbb/parallel_for.h>

#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <random>
#include <vector>

constexpr int ITERATION_COUNT = 20;
constexpr int NUMBER_COUNT = 20000;

double baseline() {
    double result = 0;

    std::vector<double> newNumbers(NUMBER_COUNT);
    std::vector<double> numbers(NUMBER_COUNT);
    std::mt19937 rnd(0);
    for(auto& n : numbers)
        n = rnd();

    for(int i = 0; i < ITERATION_COUNT; ++i) {
        for(int x = 0; x < NUMBER_COUNT; ++x) {
            double total = 0;
            for(int y = 0; y < NUMBER_COUNT; ++y) {
                auto d = (y - x);
                total += numbers[y] * (d * d);
            }
            newNumbers[x] = total;
        }
        result += std::accumulate(newNumbers.begin(), newNumbers.end(), 0.);
        swap(numbers, newNumbers);
    }

    return result;
}

double simd() {
    double result = 0;

    constexpr int SIMD_NUMBER_COUNT = NUMBER_COUNT / Vc::double_v::Size;
    using vector_double_v = std::vector<Vc::double_v, Vc::Allocator<Vc::double_v>>;
    vector_double_v newNumbers(SIMD_NUMBER_COUNT);
    vector_double_v numbers(SIMD_NUMBER_COUNT);
    std::mt19937 rnd(0);
    for(auto& n : numbers) {
        alignas(Vc::VectorAlignment) double t[Vc::double_v::Size];
        for(double& v : t)
            v = rnd();
        n.load(t, Vc::Aligned);
    }

    Vc::double_v const incv(Vc::double_v::Size);
    for(int i = 0; i < ITERATION_COUNT; ++i) {
        Vc::double_v x(Vc::IndexesFromZero);
        for(auto& new_n : newNumbers) {
            Vc::double_v totals;
            int y = 0;
            for(auto const& n : numbers) {
                for(unsigned j = 0; j < Vc::double_v::Size; ++j) {
                    auto d = y - x;
                    totals += n[j] * (d * d);
                    ++y;
                }
            }
            new_n = totals;
            x += incv;
        }
        result += std::accumulate(newNumbers.begin(), newNumbers.end(), Vc::double_v{}).sum();
        swap(numbers, newNumbers);
    }

    return result;
}

double simd_mt() {
    double result = 0;

    constexpr int SIMD_NUMBER_COUNT = NUMBER_COUNT / Vc::double_v::Size;
    using vector_double_v = std::vector<Vc::double_v, Vc::Allocator<Vc::double_v>>;
    vector_double_v newNumbers(SIMD_NUMBER_COUNT);
    vector_double_v numbers(SIMD_NUMBER_COUNT);
    std::mt19937 rnd(0);
    for(auto& n : numbers) {
        alignas(Vc::VectorAlignment) double t[Vc::double_v::Size];
        for(double& v : t)
            v = rnd();
        n.load(t, Vc::Aligned);
    }

    Vc::double_v const v0123(Vc::IndexesFromZero);
    for(int i = 0; i < ITERATION_COUNT; ++i) {
        constexpr int SIMD_STEP = 4;
        tbb::parallel_for(0, SIMD_NUMBER_COUNT, SIMD_STEP, [&](int ix) {
            Vc::double_v xs[SIMD_STEP];
            for(int is = 0; is < SIMD_STEP; ++is)
                xs[is] = v0123 + (ix + is) * Vc::double_v::Size;
            Vc::double_v totals[SIMD_STEP];
            int y = 0;
            for(auto const& n : numbers) {
                for(unsigned j = 0; j < Vc::double_v::Size; ++j) {
                    for(int is = 0; is < SIMD_STEP; ++is) {
                        auto d = y - xs[is];
                        totals[is] += n[j] * (d * d);
                    }
                    ++y;
                }
            }
            std::copy_n(totals, SIMD_STEP, &newNumbers[ix]);
        });
        result += std::accumulate(newNumbers.begin(), newNumbers.end(), Vc::double_v{}).sum();
        swap(numbers, newNumbers);
    }

    return result;
}

struct Stopwatch {
    using Clock = std::chrono::high_resolution_clock;
    using Seconds = std::chrono::duration<double>;
    Clock::time_point start_ = Clock::now();

    Seconds elapsed() const {
        return std::chrono::duration_cast<Seconds>(Clock::now() - start_);
    }
};


std::ostream& operator<<(std::ostream& s, Stopwatch::Seconds const& a) {
    auto precision = s.precision(9);
    s << std::fixed << a.count() << std::resetiosflags(std::ios_base::floatfield) << 's';
    s.precision(precision);
    return s;
}

void benchmark() {
    Stopwatch::Seconds baseline_time;
    {
        Stopwatch s;
        double result = baseline();
        baseline_time = s.elapsed();
        std::cout << "baseline: " << result << ", " << baseline_time << '\n';
    }

    {
        Stopwatch s;
        double result = simd();
        auto time = s.elapsed();
        std::cout << "    simd: " << result << ", " << time << ", " << (baseline_time / time) << "x speedup\n";
    }

    {
        Stopwatch s;
        double result = simd_mt();
        auto time = s.elapsed();
        std::cout << " simd_mt: " << result << ", " << time << ", " << (baseline_time / time) << "x speedup\n";
    }
}

int main() {
    benchmark();
    benchmark();
    benchmark();
}

时间：

baseline: 2.76582e+257, 6.399848397s
    simd: 2.76582e+257, 1.600373449s, 3.99897x speedup
 simd_mt: 2.76582e+257, 0.168638435s, 37.9501x speedup

注意：

我的机器支持AVX，但不支持AVX-512，因此使用SIMD时的速度大约提高了4倍。
simd_mt版本在我的计算机上使用8个线程和较大的SIMD步骤。理论上，加速比是128倍，实际是38倍。
clang++-6无法自动向量化基准代码，g++-8也不能。
g++-8为SIMD版本生成的代码比clang++-6生成的代码要快得多。

Answer 3

您的心肯定在正确的位置，而不是一两个错误。

par_for是一个复杂的问题，具体取决于循环的有效负载。有没有一个适合所有人的解决方案。有效载荷可以是在几乎无限的互斥锁块中添加了两个-例如通过执行内存分配。

作为工作项模式的原子变量一直对我有效，但是请记住，原子变量在X86上（〜400个周期）的成本很高，甚至如果它们位于我发现要执行的未执行分支中，则会造成很高的成本。

以下的一些排列通常是好的。选择正确的chunks_per_thread（与您的batchSize一样）至关重要。如果你不相信你的用户，您可以测试执行循环的几次迭代以猜测最佳分块级别。

#include <atomic>
#include <future>
#include <thread>
#include <vector>
#include <stdio.h>

template<typename Func>
void par_for(int start, int end, int step, int chunks_per_thread, Func func) {
  using namespace std;
  using namespace chrono;

  atomic<int> work_item{start};
  vector<future<void>> futures(std::thread::hardware_concurrency());

  for (auto &fut : futures) {
    fut = async(std::launch::async, [&work_item, end, step, chunks_per_thread, &func]() {
      for(;;) {
        int wi = work_item.fetch_add(step * chunks_per_thread);
        if (wi > end) break;
        int wi_max = std::min(end, wi+step * chunks_per_thread);
        while (wi < wi_max) {
          func(wi);
          wi += step;
        }
      }
    });
  }

  for (auto &fut : futures) {
    fut.wait();
  }
}

int main() {
  using namespace std;
  using namespace chrono;
  for (int k = 0; k != 2; ++k) {
    auto t0 = high_resolution_clock::now();
    constexpr int loops = 100000000;
    if (k == 0) {
      for (int i = 0; i != loops; ++i ) {
        if (i % 10000000 == 0) printf("%d\n", i);
      }
    } else {
      par_for(0, loops, 1, 100000, [](int i) {
        if (i % 10000000 == 0) printf("%d\n", i);
      });
    }
    auto t1 = high_resolution_clock::now();
    duration<double, milli> ns = t1 - t0;
    printf("k=%d %fms total\n", k, ns.count());
  }
}

结果

...
k=0 174.925903ms total
...
k=1 27.924738ms total

加速6倍。

我避免使用“尴尬地平行”一词，因为几乎从来没有这样。从1级缓存（ns延迟）到全球范围内的群集（ms延迟）的旅程中使用的资源越多，您付出的代价就成倍地增加。但我希望此代码段可以作为答案。

为什么在CPU上执行线程浮点计算会使它们花费更长的时间？

3 个答案: