Why is a golang channel much faster than Intel TBB's concurrent_queue when tested with 8 producers and 1 consumer?

Asked: 2017-05-24 02:55:12

Tags: c++ performance go concurrency tbb

I wrote a test comparing the performance of a golang channel against the C++ TBB concurrent queue: 8 writers and 1 reader, each on its own thread. The results show that the golang version is much faster than the C++ one, both in latency and in overall send/receive throughput. Is that real, or is there a mistake in my code?

golang result (times in microseconds):

latency max:1505, avg:1073
send begin:1495593677683232, recv end:1495593677901854, time:218622

package main

import (
    "flag"
    "fmt"
    "runtime"
    "sync"
    "time"
)

var (
    producer = flag.Int("producer", 8, "producer")
    consumer = flag.Int("consumer", 1, "consumer")
    start_signal sync.WaitGroup // released by the reader so all senders start together
)

const (
    TEST_NUM = 1000000
)

type Item struct {
    id       int
    sendtime int64 // microseconds since the epoch
    recvtime int64 // microseconds since the epoch
}

// one Item per message, indexed by the integer sent over the channel
var g_vec [TEST_NUM]Item

// sender stamps each item's send time and pushes its index into the channel.
func sender(out chan int, begin int, end int) {
    start_signal.Wait()
    runtime.LockOSThread() // pin this goroutine to an OS thread, like the C++ threads
    println("i am in sender", begin, end)
    for i := begin; i < end; i++ {
        item := &g_vec[i]
        item.id = i
        item.sendtime = time.Now().UnixNano() / 1000 // microseconds
        out <- i
    }
    println("sender finish")
}

// reader drains the channel, stamps receive times, then reports the stats.
func reader(out chan int, total int) {
    //runtime.LockOSThread()
    start_signal.Done() // release all senders at once
    for i := 0; i < total; i++ {
        tmp := <-out
        item := &g_vec[tmp]
        item.recvtime = time.Now().UnixNano() / 1000 // microseconds
    }
    var lsum int64 = 0
    var lavg int64 = 0
    var lmax int64 = 0
    var lstart int64 = 0
    var lend int64 = 0
    for _, item:= range g_vec {
        if lstart > item.sendtime || lstart == 0 {
            lstart = item.sendtime
        }
        if lend < item.recvtime {
            lend = item.recvtime
        }

        ltmp := item.recvtime - item.sendtime
        lsum += ltmp
        if ltmp > lmax {
            lmax = ltmp
        }
    }
    lavg = lsum / TEST_NUM
    fmt.Printf("latency max:%v,avg:%v\n", lmax, lavg)
    fmt.Printf("send begin:%v,recv end:%v, time:%v", lstart, lend, lend-lstart)
}

func main() {
    flag.Parse() // parse -producer/-consumer before they are used
    runtime.GOMAXPROCS(10)
    out := make(chan int, 5000) // buffered channel, the counterpart of the bounded queue

    start_signal.Add(1)
    for i := 0; i < *producer; i++ {
        go sender(out, i*TEST_NUM/(*producer), (i+1)*TEST_NUM/(*producer))
    }
    reader(out, TEST_NUM) // the single consumer runs on the main goroutine
}

C++ result (times in microseconds; only the main part of the code is shown):

max:558301, min:3, avg:403741
start:1495594232068580, end:1495594233497618, length:1429038

concurrent_bounded_queue<int> g_queue;
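
The snippet opens with the queue declaration above; the remaining scaffolding (Item, g_pvec, barrier, SENDER_NUM, DEFINE_CODE_TEST) was not posted. Here is a minimal sketch of what the code appears to assume, inferred purely from usage:

#include <tbb/concurrent_queue.h>
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

using namespace tbb; // so the concurrent_bounded_queue declaration above compiles

constexpr int TEST_NUM   = 1000000; // matches the Go version
constexpr int SENDER_NUM = 8;       // 8 producer threads

struct Item
{
    int     id = 0;
    int64_t sendTime = 0; // microseconds since the epoch
    int64_t recvTime = 0;
};

std::vector<Item>* g_pvec = nullptr; // allocated in the test body
std::promise<void> barrier;          // set by the reader to release the senders

// Hypothetical stand-in for the post's test-registration macro.
#define DEFINE_CODE_TEST(name) void name()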

// sender stamps each item's send time and pushes its index into the queue.
static void sender(int start, int end)
{
    for (int i=start; i < end; i++)
    {
        using namespace std::chrono;
        auto now = system_clock::now();
        auto now_ms = time_point_cast<microseconds>(now);
        auto value = now_ms.time_since_epoch();
        int64_t duration = value.count(); // microseconds since the epoch

        Item &item = g_pvec->at(i);
        item.id = i;
        item.sendTime = duration;
        //std::cout << "sending " << i << "\n";
        g_queue.push(i);
    }
}


// reader releases the senders, drains the queue, then computes the stats.
static void reader(int num)
{
    barrier.set_value(); // unblock all sender threads at once
    for (int i=0;i<num;i++)
    {
        int v;
        g_queue.pop(v);
        Item &el = g_pvec->at(v);

        using namespace std::chrono;
        auto now = system_clock::now();
        auto now_ms = time_point_cast<microseconds>(now);
        auto value = now_ms.time_since_epoch();
        int64_t duration = value.count();

        el.recvTime = duration;
        //std::cout << "recv " << item.id << ":" << duration << "\n";
    }
    // calculate the result.
    int64_t lmax = 0;
    int64_t lmin = 100000000;
    int64_t lavg = 0;
    int64_t lsum = 0;
    int64_t lbegin = 0;
    int64_t lend = 0;
    for (auto &item : *g_pvec)
    {
        if (item.sendTime<lbegin || lbegin==0)
        {
            lbegin = item.sendTime;
        }
        if (item.recvTime>lend )
        {
            lend = item.recvTime;
        }

        lsum += item.recvTime - item.sendTime;
        lmax = std::max(item.recvTime - item.sendTime, lmax);
        lmin = std::min(item.recvTime - item.sendTime, lmin);
    }
    lavg = lsum / num;
    std::cout << "max:" << lmax << ",min:" << lmin << ",avg:" << lavg << "\n";
    std::cout << "start:" << lbegin << ",end:" << lend << ",length:" << lend-lbegin << "\n";
}

DEFINE_CODE_TEST(plain_queue_test)
{
    g_pvec = new std::vector<Item>();
    g_pvec->resize(TEST_NUM);    

    auto sf = barrier.get_future().share();

    std::vector<std::thread> vt;
    for (int i = 0; i < SENDER_NUM; i++)
    {
        vt.emplace_back([sf, i]{    
            sf.wait();
            sender(i*TEST_NUM / SENDER_NUM, (i + 1)*TEST_NUM / SENDER_NUM);
        });
    }


    std::cout << "create reader\n";
    std::thread rt(reader, TEST_NUM);
    for (auto& t : vt)
    {
        t.join();
    }    
    rt.join();
}
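
Assuming the hypothetical DEFINE_CODE_TEST macro sketched earlier, the whole test could be driven by a trivial main:

int main()
{
    plain_queue_test();
    return 0;
}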

(screenshot: golang CPU usage)

(screenshot: C++ CPU usage)

From the VTune CPU graphs (red means CPU spinning/overhead, green means idle), I feel that the golang channel has a more efficient mutex implementation (e.g., is a syscall needed to put a goroutine to sleep, vs. the C++ mutex?).
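
For intuition about that question, here is a minimal, self-contained C++ sketch (my own illustration, not from the post) of the two waiting strategies the VTune colors correspond to:

#include <atomic>
#include <condition_variable>
#include <mutex>

std::atomic<bool> ready{false};
std::mutex m;
std::condition_variable cv;

// Spin-waiting: the thread stays on a core and polls the flag.
// VTune attributes this as spin/overhead time (red).
void spin_wait()
{
    while (!ready.load(std::memory_order_acquire))
    {
        // busy loop
    }
}

// Blocking wait: the thread is parked in the kernel until notified.
// VTune shows this as idle time (green); parking and waking cost syscalls.
void blocking_wait()
{
    std::unique_lock<std::mutex> lk(m);
    cv.wait(lk, [] { return ready.load(std::memory_order_acquire); });
}

void release()
{
    {
        std::lock_guard<std::mutex> lk(m);
        ready.store(true, std::memory_order_release);
    }
    cv.notify_all(); // wake every blocked waiter
}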


1 Answer:

Answer (score: 4)

From the VTune traces I can conclude that the TBB queue does not sleep: it spends a lot of time spinning, while the Go version shows light green regions where threads sleep on OS synchronization. Why is that better? Usually it indicates oversubscription on the machine, so that communicating through the OS pays off.

So, are you oversubscribing it? If yes, I'd say this is rather expected behavior, and it fits the philosophy of the corresponding libraries. TBB is designed for compute parallelism; it does not handle IO tasks well when fighting oversubscription. Go is designed exactly for IO tasks, so its built-in concurrency uses a FIFO scheduling policy, which is unfriendly to parallel number crunching. Oversubscription is rather recommended for IO tasks, while it hurts, and can even kill, computational parallelism.
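
As a quick sanity check for the oversubscription question, one can compare the benchmark's thread count against the machine's hardware thread count. A minimal sketch (the 8 + 1 figure follows the test above):

#include <iostream>
#include <thread>

int main()
{
    // Hardware threads available (may report 0 if unknown).
    unsigned hw = std::thread::hardware_concurrency();
    // The benchmark above runs 8 sender threads plus 1 reader thread.
    unsigned used = 8 + 1;
    std::cout << "hardware threads: " << hw
              << ", benchmark threads: " << used << "\n";
    if (hw != 0 && used > hw)
        std::cout << "oversubscribed: sleeping in the OS (Go's behavior) tends to pay off\n";
    else
        std::cout << "not oversubscribed: spinning (TBB's behavior) costs little, "
                     "since no other thread needs the core\n";
}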