我正在测试几种基本互斥锁用法的性能,想弄清楚哪种方案最高效。操作很简单:一个包含5个整数的向量被4个线程不断修改,一个线程做递增,一个线程做递减,另外两个线程做同样的操作,但以相反的顺序遍历向量。
我在解释结果方面遇到了一些麻烦:
将整个工作负载串行化、在单线程中运行:3.3秒
为向量的每个槽各使用一个锁(即共5个锁):2.1秒
只使用一个大锁来保护对向量的写访问:1.6秒
完全不使用锁(这自然会产生错误的结果):0.33秒
为每个槽单独加锁,不是应该比只使用一个大锁更快吗?
根据要求:
#include <iostream>
#include <thread>
#include <string>
#include <chrono>
#include <mutex>
#include <vector>
#include <algorithm>
// Guard ties the lifetime of a running std::thread to a scope: when the Guard is
// destroyed it joins the thread it refers to, so callers do not have to remember
// to join on every exit path.
// The Guard only stores a reference, so the std::thread must outlive the Guard.
class Guard {
public:
    explicit Guard(std::thread& target) : thread_(target) {}

    // join() blocks until the thread has finished, so no timers or polling are needed.
    ~Guard() {
        if (!thread_.joinable()) {
            return;
        }
        thread_.join();
    }

    // Non-copyable: two Guards referring to the same thread make no sense.
    Guard(const Guard&) = delete;
    Guard& operator=(const Guard&) = delete;

private:
    std::thread& thread_;
};
// Prints `input`, then `x` (in the stream's current base), then the calling
// thread's id, followed by a newline and a flush.
// std::hex is inserted before the thread id and is left set on std::cout afterwards.
void idfunc(int x, std::string input) {
    const std::thread::id self = std::this_thread::get_id();
    std::cout << input << x;
    std::cout << std::hex << self;
    std::cout << std::endl;
}
// Number of loop iterations (one increment or decrement each) a worker performs per run.
constexpr int CONSTOP = 1000000;
// Number of slots the workers cycle through; also the number of per-slot mutexes in `mm`.
constexpr int SIZEBUFFER = 5;

// Completion flags, one per worker; each worker sets its flag when it has finished.
// Every flag is initialised explicitly (the old `bool s1, s2, s3, s4 = false;` only
// spelled out the initialiser for s4).
// NOTE(review): these are plain bools written by worker threads. Reading one from
// another thread before the writer has been joined would be a data race; switch to
// std::atomic<bool> if they are ever polled while the workers are still running.
bool s1 = false;
bool s2 = false;
bool s3 = false;
bool s4 = false;

// Elapsed time of each worker in seconds. Kept as double because the workers store
// std::chrono::duration<double>::count() here; float would silently narrow it.
double d1 = 0.0;
double d2 = 0.0;
double d3 = 0.0;
double d4 = 0.0;

// One mutex per buffer slot, used by the "individual locks" workers.
std::vector<std::mutex> mm(SIZEBUFFER);
// Single mutex guarding the whole buffer, used by the "one lock" workers.
std::mutex singlelock;
// "One lock" worker 1: runs CONSTOP iterations with i ascending and increments
// input[i % SIZEBUFFER] on each one while holding the global `singlelock`.
// When done, stores the elapsed time in seconds in d1 and then sets s1.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void fs_up(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        // RAII guard: the mutex is released at the end of every iteration.
        std::lock_guard<std::mutex> hold(singlelock);
        ++input[i % SIZEBUFFER];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d1 = elapsed.count();
    s1 = true;  // set after the duration has been stored
}
// "One lock" worker 2: runs CONSTOP iterations with i ascending and decrements
// input[i % SIZEBUFFER] on each one while holding the global `singlelock`.
// When done, stores the elapsed time in seconds in d2 and then sets s2.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void fs_down(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        // RAII guard: the mutex is released at the end of every iteration.
        std::lock_guard<std::mutex> hold(singlelock);
        --input[i % SIZEBUFFER];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d2 = elapsed.count();
    s2 = true;  // set after the duration has been stored
}
// "One lock" worker 3: runs CONSTOP iterations with i descending from CONSTOP - 1
// to 0 and decrements input[i % SIZEBUFFER] on each one while holding `singlelock`.
// When done, stores the elapsed time in seconds in d3 and then sets s3.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void fs_downright(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        // RAII guard: the mutex is released at the end of every iteration.
        std::lock_guard<std::mutex> hold(singlelock);
        --input[i % SIZEBUFFER];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d3 = elapsed.count();
    s3 = true;  // set after the duration has been stored
}
// "One lock" worker 4: runs CONSTOP iterations with i descending from CONSTOP - 1
// to 0 and increments input[i % SIZEBUFFER] on each one while holding `singlelock`.
// When done, stores the elapsed time in seconds in d4 and then sets s4.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void fs_upright(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        // RAII guard: the mutex is released at the end of every iteration.
        std::lock_guard<std::mutex> hold(singlelock);
        ++input[i % SIZEBUFFER];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d4 = elapsed.count();
    s4 = true;  // set after the duration has been stored
}
// "No sync" worker 1: runs CONSTOP iterations with i ascending and increments
// input[i % SIZEBUFFER] on each one without taking any lock.
// When done, stores the elapsed time in seconds in d1 and then sets s1.
// NOTE: the missing synchronisation is deliberate (this is the lock-free baseline);
// running it concurrently with other writers to `input` is a data race.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_upno(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        ++input[i % SIZEBUFFER];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d1 = elapsed.count();
    s1 = true;  // set after the duration has been stored
}
// "No sync" worker 2: runs CONSTOP iterations with i ascending and decrements
// input[i % SIZEBUFFER] on each one without taking any lock.
// When done, stores the elapsed time in seconds in d2 and then sets s2.
// NOTE: the missing synchronisation is deliberate (this is the lock-free baseline);
// running it concurrently with other writers to `input` is a data race.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_downno(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        --input[i % SIZEBUFFER];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d2 = elapsed.count();
    s2 = true;  // set after the duration has been stored
}
// "No sync" worker 3: runs CONSTOP iterations with i descending from CONSTOP - 1
// to 0 and decrements input[i % SIZEBUFFER] on each one without taking any lock.
// When done, stores the elapsed time in seconds in d3 and then sets s3.
// NOTE: the missing synchronisation is deliberate (this is the lock-free baseline);
// running it concurrently with other writers to `input` is a data race.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_downrightno(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        --input[i % SIZEBUFFER];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d3 = elapsed.count();
    s3 = true;  // set after the duration has been stored
}
// "No sync" worker 4: runs CONSTOP iterations with i descending from CONSTOP - 1
// to 0 and increments input[i % SIZEBUFFER] on each one without taking any lock.
// When done, stores the elapsed time in seconds in d4 and then sets s4.
// NOTE: the missing synchronisation is deliberate (this is the lock-free baseline);
// running it concurrently with other writers to `input` is a data race.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_uprightno(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        ++input[i % SIZEBUFFER];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d4 = elapsed.count();
    s4 = true;  // set after the duration has been stored
}
// "Individual locks" worker 1: runs CONSTOP iterations with i ascending and
// increments input[i % SIZEBUFFER] on each one while holding that slot's mutex in `mm`.
// When done, stores the elapsed time in seconds in d1 and then sets s1.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_up(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        const int slot = i % SIZEBUFFER;
        // RAII guard: the slot's mutex is released at the end of every iteration.
        std::lock_guard<std::mutex> hold(mm[slot]);
        ++input[slot];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d1 = elapsed.count();
    s1 = true;  // set after the duration has been stored
}
// "Individual locks" worker 2: runs CONSTOP iterations with i ascending and
// decrements input[i % SIZEBUFFER] on each one while holding that slot's mutex in `mm`.
// When done, stores the elapsed time in seconds in d2 and then sets s2.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_down(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        const int slot = i % SIZEBUFFER;
        // RAII guard: the slot's mutex is released at the end of every iteration.
        std::lock_guard<std::mutex> hold(mm[slot]);
        --input[slot];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d2 = elapsed.count();
    s2 = true;  // set after the duration has been stored
}
// "Individual locks" worker 3: runs CONSTOP iterations with i descending from
// CONSTOP - 1 to 0 and decrements input[i % SIZEBUFFER] on each one while holding
// that slot's mutex in `mm`.
// When done, stores the elapsed time in seconds in d3 and then sets s3.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_downright(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        const int slot = i % SIZEBUFFER;
        // RAII guard: the slot's mutex is released at the end of every iteration.
        std::lock_guard<std::mutex> hold(mm[slot]);
        --input[slot];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d3 = elapsed.count();
    s3 = true;  // set after the duration has been stored
}
// "Individual locks" worker 4: runs CONSTOP iterations with i descending from
// CONSTOP - 1 to 0 and increments input[i % SIZEBUFFER] on each one while holding
// that slot's mutex in `mm`.
// When done, stores the elapsed time in seconds in d4 and then sets s4.
// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_upright(std::vector<int>& input) {
    // steady_clock is monotonic, so the measured interval cannot be distorted by
    // adjustments of the system wall clock.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        const int slot = i % SIZEBUFFER;
        // RAII guard: the slot's mutex is released at the end of every iteration.
        std::lock_guard<std::mutex> hold(mm[slot]);
        ++input[slot];
    }
    const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
    d4 = elapsed.count();
    s4 = true;  // set after the duration has been stored
}
int main()
{
std::vector<int> buffer(SIZEBUFFER, 0);
auto start = std::chrono::system_clock::now();
f_up(buffer);
f_down(buffer);
f_downright(buffer);
f_upright(buffer);
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> diff = end - start;
std::cout << "Benchmark is: " << diff.count() << std::endl;
int num = std::thread::hardware_concurrency();
/*for (int i = 0; i < num; i++) {
std::thread t(idfunc, 0, "ThreadID is: ");
Guard g(t);
//code safe from here on out
}*/
std::thread t1(f_up, std::ref(buffer));
Guard* g1 = new Guard(t1);
std::thread t2(f_down, std::ref(buffer));
Guard* g2 = new Guard(t2);
std::thread t3(f_downright, std::ref(buffer));
Guard* g3 = new Guard(t3);
std::thread t4(f_upright, std::ref(buffer));
Guard* g4 = new Guard(t4);
while (true) {
break; // to reuse if main thread is supposed to do something besides waiting
std::this_thread::sleep_for(std::chrono::milliseconds(1));
if (s1 && s2 && s3 && s4) {
break;
}
}
delete g1;
delete g2;
delete g3;
delete g4;
std::cout << "Individual Locks Execution lasted: " << std::max({ d1,d2,d3,d4 }) << "(" << d1 << " " << d2 << " " << d3 << " " << d4 << ")" << std::endl;
std::this_thread::sleep_for(std::chrono::seconds(1));
for (auto cell : buffer) {
std::cout << std::dec << cell << std::endl;
}
std::thread t11(fs_up, std::ref(buffer));
Guard* g11 = new Guard(t11);
std::thread t12(fs_down, std::ref(buffer));
Guard* g12 = new Guard(t12);
std::thread t13(fs_downright, std::ref(buffer));
Guard* g13 = new Guard(t13);
std::thread t14(fs_upright, std::ref(buffer));
Guard* g14 = new Guard(t14);
while (true) {
break; // to reuse if main thread is supposed to do something besides waiting
std::this_thread::sleep_for(std::chrono::milliseconds(1));
if (s1 && s2 && s3 && s4) {
break;
}
}
delete g11;
delete g12;
delete g13;
delete g14;
std::cout << "One Lock Execution lasted: " << std::max({ d1,d2,d3,d4 }) << "(" << d1 << " " << d2 << " " << d3 << " " << d4 << ")" << std::endl;
std::this_thread::sleep_for(std::chrono::seconds(1));
for (auto cell : buffer) {
std::cout << std::dec << cell << std::endl;
}
std::thread tn1(f_upno, std::ref(buffer));
Guard* gn1 = new Guard(tn1);
std::thread tn2(f_downno, std::ref(buffer));
Guard* gn2 = new Guard(tn2);
std::thread tn3(f_downrightno, std::ref(buffer));
Guard* gn3 = new Guard(tn3);
std::thread tn4(f_uprightno, std::ref(buffer));
Guard* gn4 = new Guard(tn4);
while (true) {
break; // to reuse if main thread is supposed to do something besides waiting
std::this_thread::sleep_for(std::chrono::milliseconds(1));
if (s1 && s2 && s3 && s4) {
break;
}
}
delete gn1;
delete gn2;
delete gn3;
delete gn4;
std::cout << "No Sync Execution lasted: " << std::max({ d1,d2,d3,d4 }) << "(" << d1 << " " << d2 << " " << d3 << " " << d4 << ")" << std::endl;
std::this_thread::sleep_for(std::chrono::seconds(1));
for (auto cell : buffer) {
std::cout << std::dec << cell << std::endl;
}
std::this_thread::sleep_for(std::chrono::seconds(5));
}
基本上我为每个按顺序运行的测试创建了函数。删除所有线程并为每个阶段重新创建。我正在使用VS2017提供的Microsoft C / C ++编译器,因此至少支持c ++ 11。我没有更改任何默认编译行,但我注意到优化已关闭。无论如何所有的编译器选项:
/permissive- /GS /analyze- /W3 /Zc:wchar_t /ZI /Gm /Od /sdl /Fd"Debug\vc141.pdb" /Zc:inline /fp:precise /D "_MBCS" /errorReport:prompt /WX- /Zc:forScope /RTC1 /Gd /Oy- /MDd /Fa"Debug\" /EHsc /nologo /Fo"Debug\" /Fp"Debug\ThreadingTester.pch" /diagnostics:classic
答案 0 :(得分:4)
锁定和解锁互斥锁的开销很可能远大于对一个 int 执行简单算术运算的成本,这使得单个锁的情形测量到的几乎全是互斥锁本身的开销。换一个角度看:采用这种方法时,同步的成本超过了并行化所节省的时间。
由于在单个锁的情形下,你几乎没有任何时间花在锁之外,所以除了一个工作线程之外,其余线程都在等待锁;这意味着结果大约等于单线程方式的耗时之和,再加上互斥锁的开销。
在测量性能时,很难设计出有代表性的模型。什么才算“最佳解决方案”取决于许多因素:在一种情况下最好的方案,在另一种看似相似的情况下可能并不理想。最好直接对实际应用进行测量。
答案 1 :(得分:1)
除了接受的答案,请查看您的评论
无线程 0.16 / 5个锁 0.18 / 1个锁 0.14 / 无锁 0.04
再结合你的代码来看,“无线程”和“1个锁”这两种情形预计会给出相同的结果。这个0.2的差异与你使用5个锁时的差异相同。
鉴于值为ms,这可能与您计算机上发生的其他事情有关,毕竟还有更多线程要求操作系统获取CPU时间和内存带宽。
0.2ms的差异也可能与CPU缓存以及我一时想不到的其他一系列因素有关。但我想强调的是,这个测试的误差是未知的;和所有统计数据一样,测量误差本身是一件很重要的事情。