我已将一个复杂的数组处理任务拆分到多个线程中,以利用多核处理,并且获得了很多好处。目前,我在任务开始时创建线程,然后在完成工作后等待它们终止。我通常创建的线程数量是核心数量的四倍,因为每个线程花费的时间可能不同,额外的线程可以确保所有核心在大部分时间内都保持忙碌。我想知道,如果在程序启动时就创建好线程,让它们保持空闲直到需要时再使用,会带来多大的性能优势。更简单地说:除了线程内部的实际处理之外,启动和结束一个新线程本身需要多长时间?我目前使用如下方式启动线程:
CWinThread *pMyThread = AfxBeginThread(CMyThreadFunc,&MyData,THREAD_PRIORITY_NORMAL);
通常,我会在 64 位架构的 8 个内核上使用 32 个线程。目前所讨论的处理过程耗时不到 1 秒,并且每次刷新显示时都会启动。如果启动和结束一个线程的耗时不到 1 毫秒,那么收益就不值得为此付出努力。我在对这个问题进行性能分析时遇到了一些困难。
这里有一个相关问题(A related question here)有所帮助,但对于我想了解的内容来说有点模糊。欢迎任何反馈意见。
答案 0 :(得分:16)
#include <windows.h>
#include <iostream>
#include <time.h>
#include <vector>
// Number of worker threads the benchmark creates.
constexpr int num_threads = 32;
// Number of Sleep(0) calls each worker thread issues before it exits.
constexpr int switches_per_thread = 100000;
// Worker thread entry point.
// `start` points at a LARGE_INTEGER that receives the performance-counter
// value read as the first action of the thread; the thread then calls
// Sleep(0) `switches_per_thread` times and exits with code 0.
DWORD __stdcall ThreadProc(void *start) {
    LARGE_INTEGER *const stamp = static_cast<LARGE_INTEGER *>(start);
    QueryPerformanceCounter(stamp);

    int remaining = switches_per_thread;
    while (remaining-- > 0) {
        Sleep(0);
    }
    return 0;
}
// Benchmark driver.
// Creates `num_threads` threads running ThreadProc, then reports:
//   * the average clock() time spent per CreateThread call, in milliseconds;
//   * the time spent waiting for all threads divided by the total number of
//     Sleep(0) calls they make, in microseconds;
//   * for every thread, the delay in milliseconds between a reference
//     performance-counter reading taken before creation and the reading the
//     thread took as its first action.
// Returns 0 on success, 1 if a thread could not be created.
int main(void) {
    HANDLE threads[num_threads];
    DWORD junk;
    std::vector<LARGE_INTEGER> start_times(num_threads);

    // Reference point against which each thread's start stamp is compared.
    LARGE_INTEGER l;
    QueryPerformanceCounter(&l);

    int created = 0;
    clock_t create_start = clock();
    for (; created < num_threads; created++) {
        threads[created] = CreateThread(NULL,
                                        0,
                                        ThreadProc,
                                        &start_times[created],
                                        0,
                                        &junk);
        if (threads[created] == NULL)
            break; // stop creating; the threads already running are still joined below
    }
    clock_t create_end = clock();

    clock_t wait_start = clock();
    if (created > 0)
        WaitForMultipleObjects(static_cast<DWORD>(created), threads, TRUE, INFINITE);
    clock_t wait_end = clock();

    // The threads have exited; release their kernel handles.
    for (int i = 0; i < created; i++)
        CloseHandle(threads[i]);

    if (created < num_threads) {
        std::cerr << "CreateThread failed after creating " << created << " thread(s)\n";
        return 1;
    }

    double create_millis = 1000.0 * (create_end - create_start) / CLOCKS_PER_SEC / num_threads;
    std::cout << "Milliseconds to create thread: " << create_millis << "\n";

    double wait_clocks = (wait_end - wait_start);
    // Multiply in double so larger constants cannot overflow int.
    double switches = static_cast<double>(switches_per_thread) * num_threads;
    double us_per_switch = wait_clocks / CLOCKS_PER_SEC * 1000000 / switches;
    // Terminate the line so the first start time is not glued onto this value.
    std::cout << "Microseconds per thread switch: " << us_per_switch << "\n";

    LARGE_INTEGER f;
    QueryPerformanceFrequency(&f);
    for (const auto &s : start_times)
        std::cout << 1000.0 * (s.QuadPart - l.QuadPart) / f.QuadPart << " ms\n";

    return 0;
}
示例结果:
Milliseconds to create thread: 0.015625
Microseconds per thread switch: 0.0479687
前几个线程启动时间如下所示:
0.0632517 ms
0.117348 ms
0.143703 ms
0.18282 ms
0.209174 ms
0.232478 ms
0.263826 ms
0.315149 ms
0.324026 ms
0.331516 ms
0.3956 ms
0.408639 ms
0.4214 ms
请注意,虽然这些值恰好是单调递增的,但这一点并没有保证(尽管总体上确实存在这种趋势)。
当我最初写这段代码时,所用的单位更有意义——在 33 MHz 的 486 上,这些结果并不是像现在这样微小的小数。:-) 我想哪天有干劲的时候,应该把它改写为使用 std::async
创建线程、使用 std::chrono
来计时,但是……
答案 1 :(得分:3)
一些建议:
答案 2 :(得分:0)
我对现代 Windows 调度程序感到好奇,因此编写了另一个测试应用程序。我尽力测量了线程停止时间,方法是可选地启动一个观察线程,让它自旋(忙等待)轮询各个线程是否已结束。
// Tested on Windows 10 v1903 with E5-1660 v3 @ 3.00GHz, 8 Core(s), 16 Logical Processor(s)
// Times are (min, average, max) in milliseconds.
threads: 100, iterations: 1, testStop: true
Start(0.1083, 5.3665, 13.7103) - Stop(0.0341, 1.5122, 11.0660)
threads: 32, iterations: 3, testStop: true
Start(0.1349, 1.6423, 3.5561) - Stop(0.0396, 0.2877, 3.5195)
Start(0.1093, 1.4992, 3.3982) - Stop(0.0351, 0.2734, 2.0384)
Start(0.1159, 1.5345, 3.5754) - Stop(0.0378, 0.4938, 3.2216)
threads: 4, iterations: 3, testStop: true
Start(0.2066, 0.3553, 0.4598) - Stop(0.0410, 0.1534, 0.4630)
Start(0.2769, 0.3740, 0.4994) - Stop(0.0414, 0.1028, 0.2581)
Start(0.2342, 0.3602, 0.5650) - Stop(0.0497, 0.2199, 0.3620)
threads: 4, iterations: 3, testStop: false
Start(0.1698, 0.2492, 0.3713)
Start(0.1473, 0.2477, 0.4103)
Start(0.1756, 0.2909, 0.4295)
threads: 1, iterations: 10, testStop: false
Start(0.1910, 0.1910, 0.1910)
Start(0.1685, 0.1685, 0.1685)
Start(0.1564, 0.1564, 0.1564)
Start(0.1504, 0.1504, 0.1504)
Start(0.1389, 0.1389, 0.1389)
Start(0.1234, 0.1234, 0.1234)
Start(0.1550, 0.1550, 0.1550)
Start(0.2800, 0.2800, 0.2800)
Start(0.1587, 0.1587, 0.1587)
Start(0.1877, 0.1877, 0.1877)
来源:
#include <windows.h>
#include <iostream>
#include <vector>
#include <chrono>
#include <iomanip>
using namespace std::chrono;
// Per-thread measurement record.
struct Test
{
    // Handle of the measured thread; null until the thread has been created.
    HANDLE Thread = NULL;
    // Taken immediately before the CreateThread call for this thread.
    std::chrono::steady_clock::time_point Creation;
    // Taken by the thread itself as its first action.
    std::chrono::steady_clock::time_point Started;
    // Taken by the observer when it sees the thread handle signaled.
    std::chrono::steady_clock::time_point Stopped;
};
// Measured thread entry point.
// `lpParamater` points at the Test record of this thread; the only work done
// is stamping `Started` with the current steady-clock time before exiting
// with code 0.
DWORD __stdcall ThreadProc(void* lpParamater) {
    static_cast<Test*>(lpParamater)->Started = std::chrono::steady_clock::now();
    return 0;
}
// Observer thread entry point.
// `lpParamater` points at the std::vector<Test> being measured. The observer
// spins over the records, polling every non-null thread handle with a zero
// timeout. When a handle is signaled it stamps `Stopped` and clears the
// record's handle so the thread is counted only once. It returns 0 once every
// record has been seen finished.
//
// The finished handles are collected and closed only after polling is done:
// simply overwriting them with NULL would leak one kernel handle per thread,
// while closing them inside the polling loop would add system-call latency
// to the stop measurements of the remaining threads.
DWORD __stdcall TestThreadsEnded(void* lpParamater) {
    auto& tests = *static_cast<std::vector<Test>*>(lpParamater);

    std::vector<HANDLE> finishedHandles;
    // Reserve up front so push_back never allocates while timing.
    finishedHandles.reserve(tests.size());

    while (finishedHandles.size() < tests.size())
    {
        for (auto& test : tests)
        {
            if (test.Thread != NULL && WaitForSingleObject(test.Thread, 0) == WAIT_OBJECT_0)
            {
                test.Stopped = std::chrono::steady_clock::now();
                finishedHandles.push_back(test.Thread);
                test.Thread = NULL;
            }
        }
    }

    for (HANDLE handle : finishedHandles)
    {
        CloseHandle(handle);
    }
    return 0;
}
// Returns the elapsed time from `start` to `stop` as fractional milliseconds.
// The result is negative when `stop` precedes `start`.
duration<double, std::milli> diff(time_point<steady_clock> start, time_point<steady_clock> stop)
{
    const duration<double, std::milli> elapsed(stop - start);
    return elapsed;
}
// Summary of a set of samples: smallest value, arithmetic mean, largest value.
struct Stats
{
    double min;
    double average;
    double max;
};

// Computes min / average / max over `durations`.
// Min and max are seeded from the first sample rather than from fixed
// constants, so the result is correct for any value range (samples that are
// all above 1000 or all below 0 included). An empty input yields {0, 0, 0}
// instead of dividing zero by zero.
Stats stats(const std::vector<double>& durations)
{
    if (durations.empty())
    {
        return Stats{ 0.0, 0.0, 0.0 };
    }

    Stats result = { durations.front(), 0.0, durations.front() };
    for (const double duration : durations)
    {
        if (duration < result.min) result.min = duration;
        if (duration > result.max) result.max = duration;
        result.average += duration; // running sum; divided by the count below
    }
    result.average /= static_cast<double>(durations.size());
    return result;
}
// Runs the thread start/stop benchmark and prints one result line per iteration.
//
// threadCount - number of measured threads created per iteration.
// iterations  - number of times the measurement is repeated.
// testStop    - when true, an observer thread (TestThreadsEnded) is started
//               first and records when each measured thread is seen finished,
//               and a "Stop(...)" triple is printed after the "Start(...)" one.
//
// "Start" is the time from just before CreateThread to the stamp the thread
// takes as its first action. "Stop" is the time from that stamp to the moment
// the observer sees the thread handle signaled. Both are printed as
// (min, average, max) in milliseconds.
//
// NOTE(review): a failed CreateThread for a measured thread is not handled;
// with testStop the observer would then never see that record finish.
void TestScheduler(const int threadCount, const int iterations, const bool testStop)
{
    std::cout << "\nthreads: " << threadCount << ", iterations: " << iterations << ", testStop: " << (testStop ? "true" : "false") << "\n";

    for (auto i = 0; i < iterations; i++)
    {
        std::vector<Test> tests(threadCount);

        HANDLE testThreadsEnded = NULL;
        if (testStop)
        {
            testThreadsEnded = CreateThread(NULL, 0, TestThreadsEnded, &tests, 0, NULL);
            if (testThreadsEnded == NULL)
            {
                // Without the observer no stop time can be recorded; nothing
                // has been started yet, so it is safe to give up here.
                std::cerr << "CreateThread failed for the observer thread (error " << GetLastError() << ")\n";
                return;
            }
        }

        for (auto& test : tests)
        {
            test.Creation = std::chrono::steady_clock::now();
            test.Thread = CreateThread(NULL, 0, ThreadProc, &test, 0, NULL);
        }

        if (testStop)
        {
            WaitForSingleObject(testThreadsEnded, INFINITE);
            CloseHandle(testThreadsEnded);
        }
        else
        {
            std::vector<HANDLE> threads;
            threads.reserve(tests.size());
            for (auto& test : tests) threads.push_back(test.Thread);

            // WaitForMultipleObjects accepts at most MAXIMUM_WAIT_OBJECTS
            // handles per call, so wait in chunks. Waiting for "all" of each
            // chunk in turn is equivalent to waiting for all handles.
            const std::size_t maxPerWait = MAXIMUM_WAIT_OBJECTS;
            for (std::size_t offset = 0; offset < threads.size(); offset += maxPerWait)
            {
                const std::size_t remaining = threads.size() - offset;
                const DWORD count = static_cast<DWORD>(remaining < maxPerWait ? remaining : maxPerWait);
                WaitForMultipleObjects(count, threads.data() + offset, TRUE, INFINITE);
            }
        }

        // Release every thread handle this function still holds. Records
        // whose handle the observer already cleared are skipped.
        for (auto& test : tests)
        {
            if (test.Thread != NULL)
            {
                CloseHandle(test.Thread);
                test.Thread = NULL;
            }
        }

        std::vector<double> startDurations;
        std::vector<double> stopDurations;
        startDurations.reserve(tests.size());
        if (testStop) stopDurations.reserve(tests.size());
        for (auto& test : tests)
        {
            startDurations.push_back(diff(test.Creation, test.Started).count());
            // Stopped is only stamped when the observer ran.
            if (testStop) stopDurations.push_back(diff(test.Started, test.Stopped).count());
        }

        const auto startStats = stats(startDurations);

        std::cout << std::fixed << std::setprecision(4);
        std::cout << "Start(" << startStats.min << ", " << startStats.average << ", " << startStats.max << ")";
        if (testStop)
        {
            const auto stopStats = stats(stopDurations);
            std::cout << " - ";
            std::cout << "Stop(" << stopStats.min << ", " << stopStats.average << ", " << stopStats.max << ")";
        }
        std::cout << "\n";
    }
}
// Entry point: runs the scheduler benchmark for a fixed list of
// configurations, in order, and exits with code 0.
int main(void)
{
    struct Scenario
    {
        int threadCount;
        int iterations;
        bool testStop;
    };

    const Scenario scenarios[] = {
        { 100, 1, true },
        { 32, 3, true },
        { 4, 3, true },
        { 4, 3, false },
        { 1, 10, false },
    };

    for (const Scenario& scenario : scenarios)
    {
        TestScheduler(scenario.threadCount, scenario.iterations, scenario.testStop);
    }
    return 0;
}