选择锁粒度

时间:2018-01-30 15:10:10

标签: c++ multithreading performance locking mutex

我正在测试几种基本互斥锁用法的性能,想弄清楚哪种方案最高效。操作很简单:一个含 5 个整数的向量不断被 4 个线程改写,一个线程做加法,一个线程做减法,另外两个线程做同样的操作,但遍历顺序相反。

我在解释结果方面遇到了一些麻烦:

整个工作负载在单线程上串行执行:3.3 秒

为向量的每个槽各使用一个锁(共 5 个锁):2.1 秒

只使用一个大锁来保护对向量的所有写入:1.6 秒

完全不使用锁(这自然会产生错误的结果):0.33 秒

为每个槽单独加锁,不应该比只用一个大锁更快吗?

根据要求:

#include <iostream>
#include <thread>
#include <string>
#include <chrono>
#include <mutex>
#include <vector>
#include <algorithm>

// Guard ties a join to scope exit: when a Guard is destroyed it joins the
// std::thread it refers to (if that thread is still joinable), so every exit
// path of the enclosing scope waits for the thread without an explicit join().
// The Guard only holds a reference; the std::thread must outlive it.
class Guard {
public:
    explicit Guard(std::thread& worker) : thread_(worker) {}

    Guard(const Guard&) = delete;             // copy construction is disabled
    Guard& operator=(const Guard&) = delete;  // copy assignment is disabled

    ~Guard() {
        if (!thread_.joinable()) {
            return;                           // already joined or never started
        }
        thread_.join();                       // blocks until the thread has finished
    }

private:
    std::thread& thread_;
};

// Writes `input`, then `x`, then the calling thread's id to std::cout on one
// line and flushes. std::hex is inserted after `x`, so `x` is formatted with
// whatever base std::cout had on entry, and the stream is left in hex mode
// afterwards.
void idfunc(int x, std::string input) {
    std::ostream& out = std::cout;
    out << input;
    out << x;
    out << std::hex;
    out << std::this_thread::get_id();
    out << std::endl;
}

// Number of loop iterations (one increment or decrement each) per worker.
constexpr int CONSTOP = 1000000;
// Number of slots in the shared buffer, and of per-slot mutexes in `mm`.
constexpr int SIZEBUFFER = 5;

// Completion flags, one per worker role; each worker sets its flag to true
// when it is done. These are plain (non-atomic) variables, so another thread
// may only read them safely after the writing thread has been joined.
bool s1 = false;
bool s2 = false;
bool s3 = false;
bool s4 = false;

// Elapsed time in seconds reported by each worker role. Same rule as the
// flags above: read them only after the writing thread has been joined.
float d1 = 0.0f;
float d2 = 0.0f;
float d3 = 0.0f;
float d4 = 0.0f;

// One mutex per buffer slot, used by the fine-grained f_* workers.
std::vector<std::mutex> mm(SIZEBUFFER);
// Single mutex covering the whole buffer, used by the coarse-grained fs_* workers.
std::mutex singlelock;

/// Single-lock worker: visits the slots in ascending index order and
/// increments one slot per iteration, holding the global `singlelock` around
/// every write.
///
/// Stores its elapsed time (seconds) in `d1`, then sets `s1`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void fs_up(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        // RAII guard: the mutex is released at the end of each iteration.
        std::lock_guard<std::mutex> hold(singlelock);
        ++input[i % SIZEBUFFER];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d1 = static_cast<float>(elapsed.count());
    s1 = true;  // raised last, after the timing has been stored
}


/// Single-lock worker: visits the slots in ascending index order and
/// decrements one slot per iteration, holding the global `singlelock` around
/// every write.
///
/// Stores its elapsed time (seconds) in `d2`, then sets `s2`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void fs_down(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        // RAII guard: the mutex is released at the end of each iteration.
        std::lock_guard<std::mutex> hold(singlelock);
        --input[i % SIZEBUFFER];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d2 = static_cast<float>(elapsed.count());
    s2 = true;  // raised last, after the timing has been stored
}

/// Single-lock worker: counts the loop index down from CONSTOP - 1 to 0 and
/// decrements one slot per iteration, holding the global `singlelock` around
/// every write.
///
/// Stores its elapsed time (seconds) in `d3`, then sets `s3`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void fs_downright(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        // RAII guard: the mutex is released at the end of each iteration.
        std::lock_guard<std::mutex> hold(singlelock);
        --input[i % SIZEBUFFER];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d3 = static_cast<float>(elapsed.count());
    s3 = true;  // raised last, after the timing has been stored
}

/// Single-lock worker: counts the loop index down from CONSTOP - 1 to 0 and
/// increments one slot per iteration, holding the global `singlelock` around
/// every write.
///
/// Stores its elapsed time (seconds) in `d4`, then sets `s4`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void fs_upright(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        // RAII guard: the mutex is released at the end of each iteration.
        std::lock_guard<std::mutex> hold(singlelock);
        ++input[i % SIZEBUFFER];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d4 = static_cast<float>(elapsed.count());
    s4 = true;  // raised last, after the timing has been stored
}

/// No-lock worker: visits the slots in ascending index order and increments
/// one slot per iteration without taking any mutex.
///
/// The writes are unsynchronized on purpose (this is the "no locking"
/// measurement), so running it concurrently with other writers to the same
/// buffer is a data race and the final slot values are not meaningful.
///
/// Stores its elapsed time (seconds) in `d1`, then sets `s1`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_upno(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        ++input[i % SIZEBUFFER];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d1 = static_cast<float>(elapsed.count());
    s1 = true;  // raised last, after the timing has been stored
}


/// No-lock worker: visits the slots in ascending index order and decrements
/// one slot per iteration without taking any mutex.
///
/// The writes are unsynchronized on purpose (this is the "no locking"
/// measurement), so running it concurrently with other writers to the same
/// buffer is a data race and the final slot values are not meaningful.
///
/// Stores its elapsed time (seconds) in `d2`, then sets `s2`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_downno(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        --input[i % SIZEBUFFER];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d2 = static_cast<float>(elapsed.count());
    s2 = true;  // raised last, after the timing has been stored
}

/// No-lock worker: counts the loop index down from CONSTOP - 1 to 0 and
/// decrements one slot per iteration without taking any mutex.
///
/// The writes are unsynchronized on purpose (this is the "no locking"
/// measurement), so running it concurrently with other writers to the same
/// buffer is a data race and the final slot values are not meaningful.
///
/// Stores its elapsed time (seconds) in `d3`, then sets `s3`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_downrightno(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        --input[i % SIZEBUFFER];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d3 = static_cast<float>(elapsed.count());
    s3 = true;  // raised last, after the timing has been stored
}

/// No-lock worker: counts the loop index down from CONSTOP - 1 to 0 and
/// increments one slot per iteration without taking any mutex.
///
/// The writes are unsynchronized on purpose (this is the "no locking"
/// measurement), so running it concurrently with other writers to the same
/// buffer is a data race and the final slot values are not meaningful.
///
/// Stores its elapsed time (seconds) in `d4`, then sets `s4`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_uprightno(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        ++input[i % SIZEBUFFER];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d4 = static_cast<float>(elapsed.count());
    s4 = true;  // raised last, after the timing has been stored
}



/// Per-slot-lock worker: visits the slots in ascending index order and
/// increments one slot per iteration, holding only that slot's mutex from
/// `mm` around the write.
///
/// Stores its elapsed time (seconds) in `d1`, then sets `s1`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_up(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        // Computed once and used for both the mutex and the cell it protects.
        const int slot = i % SIZEBUFFER;
        // RAII guard: the slot mutex is released at the end of each iteration.
        std::lock_guard<std::mutex> hold(mm[slot]);
        ++input[slot];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d1 = static_cast<float>(elapsed.count());
    s1 = true;  // raised last, after the timing has been stored
}


/// Per-slot-lock worker: visits the slots in ascending index order and
/// decrements one slot per iteration, holding only that slot's mutex from
/// `mm` around the write.
///
/// Stores its elapsed time (seconds) in `d2`, then sets `s2`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_down(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < CONSTOP; ++i) {
        // Computed once and used for both the mutex and the cell it protects.
        const int slot = i % SIZEBUFFER;
        // RAII guard: the slot mutex is released at the end of each iteration.
        std::lock_guard<std::mutex> hold(mm[slot]);
        --input[slot];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d2 = static_cast<float>(elapsed.count());
    s2 = true;  // raised last, after the timing has been stored
}

/// Per-slot-lock worker: counts the loop index down from CONSTOP - 1 to 0 and
/// decrements one slot per iteration, holding only that slot's mutex from
/// `mm` around the write.
///
/// Stores its elapsed time (seconds) in `d3`, then sets `s3`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_downright(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        // Computed once and used for both the mutex and the cell it protects.
        const int slot = i % SIZEBUFFER;
        // RAII guard: the slot mutex is released at the end of each iteration.
        std::lock_guard<std::mutex> hold(mm[slot]);
        --input[slot];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d3 = static_cast<float>(elapsed.count());
    s3 = true;  // raised last, after the timing has been stored
}

/// Per-slot-lock worker: counts the loop index down from CONSTOP - 1 to 0 and
/// increments one slot per iteration, holding only that slot's mutex from
/// `mm` around the write.
///
/// Stores its elapsed time (seconds) in `d4`, then sets `s4`.
///
/// @param input shared buffer; must hold at least SIZEBUFFER elements.
void f_upright(std::vector<int>& input) {
    // steady_clock is monotonic, so the interval cannot be distorted by
    // wall-clock adjustments the way a system_clock interval can.
    const auto start = std::chrono::steady_clock::now();
    for (int i = CONSTOP - 1; i >= 0; --i) {
        // Computed once and used for both the mutex and the cell it protects.
        const int slot = i % SIZEBUFFER;
        // RAII guard: the slot mutex is released at the end of each iteration.
        std::lock_guard<std::mutex> hold(mm[slot]);
        ++input[slot];
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed = end - start;
    d4 = static_cast<float>(elapsed.count());
    s4 = true;  // raised last, after the timing has been stored
}

int main()
{
    std::vector<int> buffer(SIZEBUFFER, 0);
    auto start = std::chrono::system_clock::now();

    f_up(buffer);
    f_down(buffer);
    f_downright(buffer);
    f_upright(buffer);



    auto end = std::chrono::system_clock::now();
    std::chrono::duration<double> diff = end - start;
    std::cout << "Benchmark is: " << diff.count() << std::endl;

    int num = std::thread::hardware_concurrency();

    /*for (int i = 0; i < num; i++) {
        std::thread t(idfunc, 0, "ThreadID is: ");
        Guard g(t);
        //code safe from here on out
    }*/
    std::thread t1(f_up, std::ref(buffer));
    Guard* g1  = new Guard(t1);
    std::thread t2(f_down, std::ref(buffer));
    Guard* g2 = new Guard(t2);
    std::thread t3(f_downright, std::ref(buffer));
    Guard* g3 = new Guard(t3);
    std::thread t4(f_upright, std::ref(buffer));
    Guard* g4 = new Guard(t4);


    while (true) {  
        break;          // to reuse if main thread is supposed to do something besides waiting
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        if (s1 && s2 && s3 && s4) {  
            break;
        }
    }


    delete g1;
    delete g2;
    delete g3;
    delete g4;

    std::cout << "Individual Locks Execution lasted: " << std::max({ d1,d2,d3,d4 }) << "(" << d1 << " " << d2 << " " << d3 << " " << d4 << ")" << std::endl;
    std::this_thread::sleep_for(std::chrono::seconds(1));
    for (auto cell : buffer) {
        std::cout << std::dec << cell << std::endl;
    }



    std::thread t11(fs_up, std::ref(buffer));
    Guard* g11 = new Guard(t11);
    std::thread t12(fs_down, std::ref(buffer));
    Guard* g12 = new Guard(t12);
    std::thread t13(fs_downright, std::ref(buffer));
    Guard* g13 = new Guard(t13);
    std::thread t14(fs_upright, std::ref(buffer));
    Guard* g14 = new Guard(t14);



    while (true) {
        break;          // to reuse if main thread is supposed to do something besides waiting
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        if (s1 && s2 && s3 && s4) {
            break;
        }
    }


    delete g11;
    delete g12;
    delete g13;
    delete g14;

    std::cout << "One Lock Execution lasted: " << std::max({ d1,d2,d3,d4 }) << "(" << d1 << " " << d2 << " " << d3 << " " << d4 << ")" << std::endl;

    std::this_thread::sleep_for(std::chrono::seconds(1));

    for (auto cell : buffer) {
        std::cout << std::dec << cell << std::endl;
    }


    std::thread tn1(f_upno, std::ref(buffer));
    Guard* gn1 = new Guard(tn1);
    std::thread tn2(f_downno, std::ref(buffer));
    Guard* gn2 = new Guard(tn2);
    std::thread tn3(f_downrightno, std::ref(buffer));
    Guard* gn3 = new Guard(tn3);
    std::thread tn4(f_uprightno, std::ref(buffer));
    Guard* gn4 = new Guard(tn4);



    while (true) {
        break;          // to reuse if main thread is supposed to do something besides waiting
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        if (s1 && s2 && s3 && s4) {
            break;
        }
    }


    delete gn1;
    delete gn2;
    delete gn3;
    delete gn4;

    std::cout << "No Sync Execution lasted: " << std::max({ d1,d2,d3,d4 }) << "(" << d1 << " " << d2 << " " << d3 << " " << d4 << ")" << std::endl;
    std::this_thread::sleep_for(std::chrono::seconds(1));
    for (auto cell : buffer) {
        std::cout << std::dec << cell << std::endl;
    }


    std::this_thread::sleep_for(std::chrono::seconds(5));

}

基本上,我为每一项测试都编写了对应的函数,各项测试按顺序依次运行。每个阶段结束后所有线程都会被销毁,并在下一阶段重新创建。我使用的是 VS2017 自带的 Microsoft C/C++ 编译器,因此至少支持 C++11。我没有更改任何默认编译选项,但我注意到优化是关闭的。总之,下面是全部编译器选项:

/permissive- /GS /analyze- /W3 /Zc:wchar_t /ZI /Gm /Od /sdl /Fd"Debug\vc141.pdb" /Zc:inline /fp:precise /D "_MBCS" /errorReport:prompt /WX- /Zc:forScope /RTC1 /Gd /Oy- /MDd /Fa"Debug\" /EHsc /nologo /Fo"Debug\" /Fp"Debug\ThreadingTester.pch" /diagnostics:classic

2 个答案:

答案 0 :(得分:4)

锁定和解锁互斥锁的开销很可能远远大于对 int 执行一次简单算术运算的成本,这使得加锁的测试几乎只是在测量互斥锁本身的开销。换个角度看:采用这种做法时,同步的成本大于并行化所节省的时间。

由于在只用一个锁的情形下,几乎没有任何时间花在锁之外,所以除了一个工作线程之外,其余线程都在等待锁,这意味着结果大约等于单线程做法的耗时总和,再加上互斥锁的开销。

在测量性能时,很难设计出有代表性的模型。什么才算“最佳解决方案”取决于许多因素:在一种情况下最好的方案,在另一种看似相似的情况下可能并不理想。最好是对实际的应用程序进行测量。

答案 1 :(得分:1)

除了接受的答案,请查看您的评论

  

无线程 0.16 / 5 个锁 0.18 / 1 个锁 0.14 / 无锁 0.04

再结合你的代码来看,“无线程”和“1 个锁”预计会给出相同的结果。这个 0.2 的差异与你的 5 个锁情形下的差异相同。

鉴于值为ms,这可能与您计算机上发生的其他事情有关,毕竟还有更多线程要求操作系统获取CPU时间和内存带宽。

0.2ms 的差异也可能与 CPU 缓存以及其他一系列我一时想不到的因素有关,但我想强调的是:这项测试的误差是未知的;和所有统计数据一样,衡量误差是很重要的一件事。