我有这样一个简单的程序:它用8个线程把vector中的每个元素加1(我的PC有8个核心)。但多线程版本只比单线程版本快了约2.8倍。是我哪里做错了,还是多线程本来就没有那么快?
Cost is 599 milliseconds
800000000
Cost is 1697 milliseconds
800000000
#include <chrono>
#include <cstddef>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>
/// Multi-threaded benchmark: increments every element of a zero-initialised
/// vector of n * k ints, using n threads that each own one contiguous chunk
/// of k elements.
///
/// Prints the wall-clock time of the threaded phase (thread creation, work,
/// and join) to stdout, then prints the sum of all elements to stderr so the
/// result can be checked.
///
/// @param n number of worker threads (and chunks)
/// @param k number of elements handled by each thread
void test(int n, int k)
{
    // Widen before multiplying: n * k and i * k can exceed the range of int
    // for large workloads, and signed overflow is undefined behaviour.
    const std::size_t chunk_count = static_cast<std::size_t>(n);
    const std::size_t chunk_size = static_cast<std::size_t>(k);

    std::vector<int> data(chunk_count * chunk_size, 0);
    std::vector<std::thread> threads(chunk_count);

    // Adds 1 to every element in the half-open range [first, last).
    auto increment_range = [](int *first, int *last) {
        for (int *p = first; p != last; ++p) {
            *p = *p + 1;
        }
    };

    const auto started = std::chrono::steady_clock::now();
    for (std::size_t i = 0; i < chunk_count; ++i) {
        int *const first = data.data() + i * chunk_size;
        // std::thread forwards its arguments to the callable; no std::bind needed.
        threads[i] = std::thread(increment_range, first, first + chunk_size);
    }
    for (std::thread &worker : threads) {
        worker.join();
    }
    const auto finished = std::chrono::steady_clock::now();

    const auto elapsed =
        std::chrono::duration_cast<std::chrono::milliseconds>(finished - started);
    std::cout << "Cost is " << elapsed.count() << " milliseconds" << std::endl;

    // Accumulate in 64 bits so the checksum cannot overflow for large n * k.
    long long sum = 0;
    for (const int value : data) {
        sum += value;
    }
    std::cerr << sum << std::endl;
}
/// Single-threaded baseline: increments every element of a zero-initialised
/// vector of n * k ints by processing n contiguous chunks of k elements one
/// after another on the calling thread.
///
/// Prints the wall-clock time of the increment phase to stdout, then prints
/// the sum of all elements to stderr so the result can be checked.
///
/// @param n number of chunks
/// @param k number of elements per chunk
void stupid_test(int n, int k)
{
    // Widen before multiplying: n * k and i * k can exceed the range of int
    // for large workloads, and signed overflow is undefined behaviour.
    const std::size_t chunk_count = static_cast<std::size_t>(n);
    const std::size_t chunk_size = static_cast<std::size_t>(k);

    std::vector<int> data(chunk_count * chunk_size, 0);

    // Adds 1 to every element in the half-open range [first, last).
    auto increment_range = [](int *first, int *last) {
        for (int *p = first; p != last; ++p) {
            *p = *p + 1;
        }
    };

    const auto started = std::chrono::steady_clock::now();
    int *chunk_begin = data.data();
    for (std::size_t i = 0; i < chunk_count; ++i) {
        // Process the chunks sequentially, one call per chunk.
        increment_range(chunk_begin, chunk_begin + chunk_size);
        chunk_begin += chunk_size;
    }
    const auto finished = std::chrono::steady_clock::now();

    const auto elapsed =
        std::chrono::duration_cast<std::chrono::milliseconds>(finished - started);
    std::cout << "Cost is " << elapsed.count() << " milliseconds" << std::endl;

    // Accumulate in 64 bits so the checksum cannot overflow for large n * k.
    long long sum = 0;
    for (const int value : data) {
        sum += value;
    }
    std::cerr << sum << std::endl;
}
// Runs the threaded benchmark followed by the single-threaded baseline on the
// same workload: 8 chunks of 100 million ints each.
int main()
{
    const int chunk_count = 8;
    const int chunk_size = 100000000;

    test(chunk_count, chunk_size);
    stupid_test(chunk_count, chunk_size);
}
答案 0 :(得分:1)
您所说的"8核心"很可能是4个支持超线程的物理核心,也就是8个逻辑执行单元。这种情况下,最大加速比约为5.2倍(4个核心 × 超线程最多约30%的提升 = 4 × 1.3)。如果你只运行4个线程,速度可能也不会有太大变化。
由于部分CPU资源(例如缓存)和内存由所有核心共享,这里的瓶颈是8个执行线程同时进行线性内存访问时的内存访问速度。即使只用2个线程,也不会带来2倍的加速。