我尝试使用gcc / ubuntu在我的8内核计算机上用以下程序测试并行累积和串行累积之间的区别。无论我打开还是关闭优化编译器选项,似乎串行版本总是更快。
class ParalellAccumulator
{
public:
template<typename Iterator, typename T>
struct accumulate_block
{
void operator()(Iterator first, Iterator last, T& result)
{
result = std::accumulate(first, last, result);
}
};
template<typename Iterator, typename T>
static T parallel_accumulate(Iterator first, Iterator last, T init)
{
unsigned long const length = std::distance(first, last);
if (!length)
return init;
unsigned long const min_per_thread = 25;
unsigned long const max_threads = (length + min_per_thread - 1)
/ min_per_thread;
unsigned long const hardware_threads =
std::thread::hardware_concurrency();
unsigned long const num_threads = std::min(
hardware_threads != 0 ? hardware_threads : 2, max_threads);
unsigned long const block_size = length / num_threads;
std::vector<T> results(num_threads);
std::vector<std::thread> threads(num_threads - 1);
Iterator block_start = first;
for (unsigned long i = 0; i < (num_threads - 1); ++i)
{
Iterator block_end = block_start;
std::advance(block_end, block_size);
threads[i] = std::thread(accumulate_block<Iterator, T>(),
block_start, block_end, std::ref(results[i]));
block_start = block_end;
}
accumulate_block<Iterator, T>()(block_start, last,
results[num_threads - 1]);
std::for_each(threads.begin(), threads.end(),
std::mem_fn(&std::thread::join));
return std::accumulate(results.begin(), results.end(), init);
}
static void test()
{
std::vector<double> rawData(100000000);
std::cout << "Accumulating " << rawData.size() << std::endl;
for (size_t i = 0; i < rawData.size(); i++)
{
rawData[i] = (double) rand() / RAND_MAX;
//std::cout << rawData[i] << std::endl;
}
clock_t start = clock();
double result = 0.0;
result = std::accumulate(begin(rawData), end(rawData), result);
long diff = clock() - start;
std::cout << "Serial result: " << result << " in "
<< (double) diff / CLOCKS_PER_SEC << " seconds" << std::endl;
start = clock();
result = 0.0;
result = parallel_accumulate(begin(rawData), end(rawData), result);
diff = clock() - start;
std::cout << "Parallel result: " << result << " in "
<< (double) diff / CLOCKS_PER_SEC << " seconds" << std::endl;
}
};
启用优化选项后,结果为:
Accumulating 100000000
Serial result: 5.00011e+07 in 0.09 seconds
Parallel result: 5.00011e+07 in 0.25 seconds
关闭优化选项后,结果为:
Accumulating 100000000
Serial result: 5.00011e+07 in 0.85 seconds
Parallel result: 5.00011e+07 in 1.51 seconds
可能出现什么问题?