
时间:2013-08-16 13:15:21

标签: c++ multithreading performance


CWinThread *pMyThread = AfxBeginThread(CMyThreadFunc,&MyData,THREAD_PRIORITY_NORMAL);

通常,我会在64位架构的8核机器上使用32个线程。目前所讨论的处理过程耗时不到1秒,并且每次刷新显示时都会启动。如果启动和结束一个线程的开销小于1毫秒,那么收益就值得付出这番努力。我自己在对此进行性能分析时遇到了一些困难。

A related question here(一个相关问题)有所帮助,但对于我想了解的内容说得有些含糊。欢迎任何反馈意见。

3 个答案:

答案 0 :(得分:16)


#include <windows.h>
#include <iostream>
#include <time.h>
#include <vector>

// Number of worker threads the benchmark creates and waits for.
const int num_threads = 32;

// Iteration count of the loop each worker thread runs in ThreadProc; also the
// per-thread divisor used when computing the cost of a single thread switch.
const int switches_per_thread = 100000;

// Worker thread entry point for the benchmark.
//
// `start` points to a LARGE_INTEGER owned by the creating thread. The worker
// stores its first performance-counter reading there, so the creator can tell
// how long the thread took to begin running. It then gives up the processor
// `switches_per_thread` times so the creator can estimate the cost of a
// thread switch from the total run time.
//
// Always returns 0.
DWORD __stdcall ThreadProc(void *start) {
    QueryPerformanceCounter(static_cast<LARGE_INTEGER *>(start));
    for (int i = 0; i < switches_per_thread; i++)
        Sleep(0);   // relinquish the remainder of the time slice
    return 0;
}

int main(void) {
    HANDLE threads[num_threads];
    DWORD junk;

    std::vector<LARGE_INTEGER> start_times(num_threads);


    clock_t create_start = clock();
    for (int i=0;i<num_threads; i++)
        threads[i] = CreateThread(NULL, 
                            (void *)&start_times[i], 
    clock_t create_end = clock();

    clock_t wait_start = clock();
    WaitForMultipleObjects(num_threads, threads, TRUE, INFINITE);
    clock_t wait_end = clock();

    double create_millis = 1000.0 * (create_end - create_start) / CLOCKS_PER_SEC / num_threads;
    std::cout << "Milliseconds to create thread: " << create_millis << "\n";
    double wait_clocks = (wait_end - wait_start);
    double switches = switches_per_thread*num_threads;
    double us_per_switch = wait_clocks/CLOCKS_PER_SEC*1000000/switches;
    std::cout << "Microseconds per thread switch: " << us_per_switch;


    for (auto s : start_times) 
        std::cout << 1000.0 * (s.QuadPart - l.QuadPart) / f.QuadPart <<" ms\n";

    return 0;


Milliseconds to create thread: 0.015625
Microseconds per thread switch: 0.0479687


0.0632517 ms
0.117348 ms
0.143703 ms
0.18282 ms
0.209174 ms
0.232478 ms
0.263826 ms
0.315149 ms
0.324026 ms
0.331516 ms
0.3956 ms
0.408639 ms
0.4214 ms


当我第一次写这段代码时,我使用的单位更有意义——在33 MHz的486上,这些结果不会是像这样微小的小数。 :-)我想等哪天我雄心勃勃的时候,应该把它重写成用std::async创建线程、用std::chrono计时,但是......

答案 1 :(得分:3)


  1. 如果您要处理的工作项很多(或者工作项不算多,但您必须不时地重复整个过程),请确保使用某种线程池。这样您就不必一直重新创建线程,原来的问题也就不再重要了:线程只会被创建一次。我直接使用QueueUserWorkItem API(因为我的应用程序不使用MFC),即便如此也并不太麻烦。但是在MFC中,您可能有更高级别的设施来利用线程池。 (http://support.microsoft.com/kb/197728)
  2. 尝试为每个工作项选择最佳的工作量。当然这取决于您的软件的功能:它需要实时运行,还是在后台进行数值计算?如果不是实时的,那么每个工作项的工作量太少会损害性能,因为这会提高跨线程分配工作的开销所占的比例。
  3. 由于硬件配置可能差别很大,如果最终用户可能使用各种各样的计算机,您可以在软件启动期间异步运行一些校准例程,以估计某些操作需要多长时间。校准结果可以作为输入,用于为之后的实际计算选择更合适的工作项大小。

答案 2 :(得分:0)


// Tested on Windows 10 v1903 with E5-1660 v3 @ 3.00GHz, 8 Core(s), 16 Logical Processor(s)
// Times are (min, average, max) in milliseconds.

threads: 100, iterations: 1, testStop: true
Start(0.1083, 5.3665, 13.7103) - Stop(0.0341, 1.5122, 11.0660)

threads: 32, iterations: 3, testStop: true
Start(0.1349, 1.6423, 3.5561) - Stop(0.0396, 0.2877, 3.5195)
Start(0.1093, 1.4992, 3.3982) - Stop(0.0351, 0.2734, 2.0384)
Start(0.1159, 1.5345, 3.5754) - Stop(0.0378, 0.4938, 3.2216)

threads: 4, iterations: 3, testStop: true
Start(0.2066, 0.3553, 0.4598) - Stop(0.0410, 0.1534, 0.4630)
Start(0.2769, 0.3740, 0.4994) - Stop(0.0414, 0.1028, 0.2581)
Start(0.2342, 0.3602, 0.5650) - Stop(0.0497, 0.2199, 0.3620)

threads: 4, iterations: 3, testStop: false
Start(0.1698, 0.2492, 0.3713)
Start(0.1473, 0.2477, 0.4103)
Start(0.1756, 0.2909, 0.4295)

threads: 1, iterations: 10, testStop: false
Start(0.1910, 0.1910, 0.1910)
Start(0.1685, 0.1685, 0.1685)
Start(0.1564, 0.1564, 0.1564)
Start(0.1504, 0.1504, 0.1504)
Start(0.1389, 0.1389, 0.1389)
Start(0.1234, 0.1234, 0.1234)
Start(0.1550, 0.1550, 0.1550)
Start(0.2800, 0.2800, 0.2800)
Start(0.1587, 0.1587, 0.1587)
Start(0.1877, 0.1877, 0.1877)


#include <windows.h>
#include <iostream>
#include <vector>
#include <chrono>
#include <iomanip>

using namespace std::chrono;

struct Test
    HANDLE Thread = { 0 };
    time_point<steady_clock> Creation;
    time_point<steady_clock> Started;
    time_point<steady_clock> Stopped;

// Worker thread entry point: stamps the moment the thread starts executing.
//
// lpParamater points to the Test record of this thread; only its Started
// field is written. Always returns 0, so the thread ends immediately.
DWORD __stdcall ThreadProc(void* lpParamater) {
    auto test = static_cast<Test*>(lpParamater);
    test->Started = steady_clock::now();
    return 0;
}

// Poller thread entry point: records when each worker thread terminates.
//
// lpParamater points to the std::vector<Test> shared with the creator. The
// function keeps sweeping the vector until every entry's thread handle has
// been seen signaled. For each finished thread it stamps Stopped, closes the
// handle and resets it to NULL so the entry is counted exactly once.
//
// Always returns 0.
DWORD __stdcall TestThreadsEnded(void* lpParamater) {
    auto& tests = *static_cast<std::vector<Test>*>(lpParamater);

    std::size_t finished = 0;
    while (finished < tests.size())
    {
        for (auto& test : tests)
        {
            // A zero timeout turns the wait into a non-blocking
            // "has this thread exited yet?" probe.
            if (test.Thread != NULL && WaitForSingleObject(test.Thread, 0) == WAIT_OBJECT_0)
            {
                test.Stopped = steady_clock::now();
                CloseHandle(test.Thread);   // the handle is no longer needed
                test.Thread = NULL;
                ++finished;                 // without this the outer loop never ends
            }
        }
    }

    return 0;
}

// Returns the time elapsed from `start` to `stop` as fractional milliseconds.
// The result is negative when `stop` precedes `start`.
std::chrono::duration<double, std::milli> diff(std::chrono::time_point<std::chrono::steady_clock> start,
                                               std::chrono::time_point<std::chrono::steady_clock> stop)
{
    return stop - start;
}

// Summary of a set of measurements (same unit as the input samples).
struct Stats
{
    double min;      // smallest sample
    double average;  // arithmetic mean of the samples
    double max;      // largest sample
};

// Computes minimum, mean and maximum of `durations`.
//
// The extremes are seeded from the first sample rather than from fixed
// constants, so samples above 1000 or below 0 are reported correctly.
// An empty input yields {0, 0, 0} instead of dividing by zero.
Stats stats(const std::vector<double>& durations)
{
    if (durations.empty())
    {
        return Stats{ 0.0, 0.0, 0.0 };
    }

    Stats result = { durations.front(), 0.0, durations.front() };

    for (const double duration : durations)
    {
        if (duration < result.min) result.min = duration;
        if (duration > result.max) result.max = duration;
        result.average += duration;   // holds the running sum until the division below
    }

    result.average /= static_cast<double>(durations.size());

    return result;
}

void TestScheduler(const int threadCount, const int iterations, const bool testStop)
    std::cout << "\nthreads: " << threadCount << ", iterations: " << iterations << ", testStop: " << (testStop ? "true" : "false") << "\n";

    for (auto i = 0; i < iterations; i++)
        std::vector<Test> tests(threadCount);
        HANDLE testThreadsEnded = NULL;

        if (testStop)
            testThreadsEnded = CreateThread(NULL, 0, TestThreadsEnded, (void*)& tests, 0, NULL);

        for (auto& test : tests)
            test.Creation = steady_clock::now();
            test.Thread = CreateThread(NULL, 0, ThreadProc, (void*)& test, 0, NULL);

        if (testStop)
            WaitForSingleObject(testThreadsEnded, INFINITE);
            std::vector<HANDLE> threads;
            for (auto& test : tests) threads.push_back(test.Thread);
            WaitForMultipleObjects((DWORD)threads.size(), threads.data(), TRUE, INFINITE);

        std::vector<double> startDurations;
        std::vector<double> stopDurations;
        for (auto& test : tests)
            startDurations.push_back(diff(test.Creation, test.Started).count());
            stopDurations.push_back(diff(test.Started, test.Stopped).count());

        auto startStats = stats(startDurations);
        auto stopStats = stats(stopDurations);

        std::cout << std::fixed << std::setprecision(4);
        std::cout << "Start(" << startStats.min << ", " << startStats.average << ", " << startStats.max << ")";
        if (testStop)
            std::cout << " - ";
            std::cout << "Stop(" << stopStats.min << ", " << stopStats.average << ", " << stopStats.max << ")";
        std::cout << "\n";

int main(void)
    TestScheduler(100, 1, true);
    TestScheduler(32, 3, true);
    TestScheduler(4, 3, true);
    TestScheduler(4, 3, false);
    TestScheduler(1, 10, false);
    return 0;