我将一个多线程Linux应用程序移植到了Windows,并在运行Windows 10 Pro的服务器上对其进行测试。与在同一台双引导硬件上运行的Linux版本相比,Windows版本的性能令人失望。我将代码简化为一个表现出相同症状的小型多线程示例。我希望SO社区能就Windows与Linux在此应用程序上为何存在性能差异提供一些见解,并给出解决此问题的建议。
我正在测试的计算机具有双Intel Xeon Gold 6136 CPU(24/48物理/逻辑内核)@ 3.0 GHz(Turbo-boost到3.6 GHz),具有128 GB内存。该计算机设置为双启动CentOS或Windows10。没有Windows Hypervisor运行(禁用Hyper-V)。 NUMA已禁用。在我执行的测试中,每个线程应能够在单独的内核上运行;没有其他消耗处理器的应用程序在运行。
应用程序执行复杂的转换,将大约15 MB的输入数据集转换为大约50 MB的输出数据。我编写了简化的多线程测试(仅用于计算,仅用于数据移动等)来缩小问题的范围。仅计算的测试没有性能差异,但是数据复制方案确实如此。可重复的场景只是让每个线程将数据从其15 MB的输入缓冲区复制到其50 MB的输出缓冲区。输入缓冲区中的每个“ int”被连续写入输出缓冲区3次。下面显示了使用N个线程进行100次迭代的几乎相同的Linux和Windows代码的结果:
Windows (or cygwin) Linux (native)
Threads Time (msec) Time (msec)
1 4200 3000
2 4020 2300
3 4815 2300
4 6700 2300
5 8900 2300
6 14000 2300
7 16500 2300
8 21000 2300
12 39000 2500
16 75000 3000
24 155000 4000
以上时间是工作线程中的处理时间。结果不包括分配内存或启动线程的任何时间。看来线程可以在Linux下独立运行,但不能在Windows 10下运行。
我用于Windows测试的完整C代码在这里:
//
// Thread test program
//
// To compile for Windows:
// vcvars64.bat
// cl /Ox -o windowsThreadTest windowsThreadTest.c
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <windows.h>
#include <process.h>
#define __func__ __FUNCTION__
//
// Global data
//
HANDLE *threadHandleArray = NULL;
DWORD *threadIdArray = NULL;
//
// Time keeping
//
double *PCFreq = NULL;
__int64 *CounterStart = NULL;
void StartCounter(int whichProcessor)
{
LARGE_INTEGER li;
DWORD_PTR old_mask;
if ( !PCFreq )
{
printf("No freq array\n");
return;
}
if(!QueryPerformanceFrequency(&li))
{
printf("QueryPerformanceFrequency failed!\n");
return;
}
PCFreq[whichProcessor] = ((double)(li.QuadPart))/1000.0;
QueryPerformanceCounter(&li);
CounterStart[whichProcessor] = li.QuadPart;
}
double GetCounter()
{
LARGE_INTEGER li;
DWORD_PTR old_mask;
DWORD whichProcessor;
whichProcessor = GetCurrentProcessorNumber();
if ( CounterStart && CounterStart[whichProcessor] != 0 )
{
QueryPerformanceCounter(&li);
return ((double)(li.QuadPart-CounterStart[whichProcessor]))/PCFreq[whichProcessor];
}
else
return 0.0;
}
typedef struct
{
int retVal;
int instance;
long myTid;
int verbose;
double startTime;
double elapsedTime;
double totalElapsedTime;
struct {
unsigned intsToCopy;
int *inData;
int *outData;
} rwInfo;
} info_t;
int rwtest( unsigned intsToCopy, int *inData, int *outData)
{
unsigned i, j;
//
// Test is simple. For every entry in input array, write 3 entries to output
//
for ( j = i = 0; i < intsToCopy; i++ )
{
outData[j] = inData[i];
outData[j+1] = inData[i];
outData[j+2] = inData[i];
j += 3;
}
return 0;
}
DWORD WINAPI workerProc(LPVOID *workerInfoPtr)
{
info_t *infoPtr = (info_t *)workerInfoPtr;
infoPtr->myTid = GetCurrentThreadId();
double endTime;
BOOL result;
SetThreadPriority(threadHandleArray[infoPtr->instance], THREAD_PRIORITY_HIGHEST);
// record start time
infoPtr->startTime = GetCounter();
// Run the test
infoPtr->retVal = rwtest( infoPtr->rwInfo.intsToCopy, infoPtr->rwInfo.inData, infoPtr->rwInfo.outData );
// end time
endTime = GetCounter();
infoPtr->elapsedTime = endTime - infoPtr->startTime;
if ( infoPtr->verbose )
printf("(%04x): done\n", infoPtr->myTid);
return 0;
}
//
// Main Test Program
//
int main(int argc, char **argv)
{
int i, j, verbose=0, loopLimit;
unsigned size;
unsigned int numThreads;
info_t *w_info = NULL;
int numVirtualCores;
SYSTEM_INFO sysinfo;
GetSystemInfo(&sysinfo);
if ( argc != 4 )
{
printf("windowsThreadTest <numLoops> <numThreads> <Input size in MB>\n");
return -1;
}
numVirtualCores = sysinfo.dwNumberOfProcessors;
printf("%s: There are %d processors\n", __func__, numVirtualCores);
// Setup Timing
PCFreq = (double *)malloc(numVirtualCores * sizeof(double));
CounterStart = (__int64 *)malloc(numVirtualCores * sizeof(__int64));
if (!PCFreq || !CounterStart)
goto free_and_exit;
for ( i = 0; i < numVirtualCores; i++)
StartCounter(i);
//
// Process input args
//
loopLimit = atoi( argv[1] );
numThreads = atoi( argv[2] );
size = atoi( argv[3] ) * 1024 * 1024;
//
// Setup data array for each thread
//
w_info = (info_t *)malloc( numThreads * sizeof(info_t) );
if ( !w_info )
{
printf("Couldn't allocate w_info of size %zd, numThreads=%d\n", sizeof(info_t), numThreads);
goto free_and_exit;
}
memset( w_info, 0, numThreads * sizeof(info_t) );
//
// Thread Handle Array
//
threadHandleArray = (HANDLE *)malloc( numThreads * sizeof(HANDLE) );
if ( !threadHandleArray )
{
printf("Couldn't allocate handleArray\n");
goto free_and_exit;
}
//
// Thread ID Array
//
threadIdArray = (DWORD *)malloc( numThreads * sizeof(DWORD) );
if ( !threadIdArray )
{
printf("Couldn't allocate IdArray\n");
goto free_and_exit;
}
//
// Run the test
//
printf("Read/write testing... threads %d loops %lu input size %u \n", numThreads, loopLimit, size);
for ( j = 0; j < loopLimit; j++ )
{
//
// Set up the data for the threads
//
for ( i = 0; i < numThreads; i++ )
{
int idx;
int *inData;
int *outData;
unsigned inSize;
unsigned outSize;
inSize = size; // in MB
outSize = size * 3; // in MB
//
// Allocate input buffer
//
inData = (int *) malloc( inSize );
if ( !inData )
{
printf("Error allocating inData of size %zd\n", inSize * sizeof(char));
goto free_and_exit;
}
else
{
if ( verbose )
printf("Allocated inData of size %zd\n", inSize * sizeof(char));
}
//
// Allocate output buffer 3x the size of the input buf
//
outData = (int *) malloc( outSize * 3 );
if ( !outData )
{
printf("Error allocating outData of size %zd\n", outSize * sizeof(char));
goto free_and_exit;
}
else
{
if ( verbose )
printf("Allocated outData of size %zd\n", outSize * sizeof(char));
}
//
// Put some data into input buffer
//
w_info[i].rwInfo.intsToCopy = inSize/sizeof(int);
for ( idx = 0; idx < w_info[i].rwInfo.intsToCopy; idx++)
inData[idx] = idx;
w_info[i].rwInfo.inData = inData;
w_info[i].rwInfo.outData = outData;
w_info[i].verbose = verbose;
w_info[i].instance = i;
w_info[i].retVal = -1;
}
//
// Start the threads
//
for ( i = 0; i < numThreads; i++ )
{
threadHandleArray[i] = CreateThread( NULL, 0, workerProc, &w_info[i], 0, &threadIdArray[i] );
if ( threadHandleArray[i] == NULL )
{
fprintf(stderr, "Error creating thread %d\n", i);
return 1;
}
}
//
// Wait until all threads have terminated.
//
WaitForMultipleObjects( numThreads, threadHandleArray, TRUE, INFINITE );
//
// Check the return values
//
for ( i = 0; i < numThreads; i++ )
{
if ( w_info[i].retVal < 0 )
{
printf("Error return from thread %d\n", i);
goto free_and_exit;
}
if ( verbose )
printf("Thread %d, tid %x %f msec\n", i, (unsigned)w_info[i].myTid, w_info[i].elapsedTime);
w_info[i].totalElapsedTime += w_info[i].elapsedTime;
}
//
// Free up the data from this iteration
//
for ( i = 0; i < numThreads; i++ )
{
free( w_info[i].rwInfo.inData );
free( w_info[i].rwInfo.outData );
CloseHandle( threadHandleArray[i] );
}
}
//
// All done, print out cumulative time spent in worker routine
//
for ( i = 0; i < numThreads; i++ )
{
printf("Thread %d, loops %d %f msec\n", i, j, w_info[i].totalElapsedTime);
}
free_and_exit:
if ( threadHandleArray )
free( threadHandleArray );
if ( threadIdArray )
free( threadIdArray );
if ( PCFreq )
free( PCFreq );
if ( CounterStart )
free( CounterStart );
if ( w_info )
free( w_info );
return 0;
}
上面的代码很容易更改为利用pthread,并使用命令行'gcc -O3 -o pthreadTestLinux pthreadTest.c'进行编译,以获得上述的Linux结果(如有必要,我可以发布)。如果在cygwin环境中使用gcc在Windows上编译,则结果将反映使用Windows示例代码的结果。
我已经尝试了各种BIOS设置,提高了线程优先级,预分配了线程池等,而性能没有变化。我认为这不是 false-sharing 的情况,因为Linux版本使用几乎相同的代码显示出根本不同的性能。我想知道我的编译方式中是否包含某些内容。我正在使用64位工具链。
有什么想法吗?
答案 0 :(得分:2)
我已经在多核/多处理器机器上看到Cygwin应用程序的类似问题。据我所知,这在Cygwin中仍然是一个未解决的问题。
我注意到(可以尝试)的一件事是,将进程固定到单个CPU可能会大大提高其性能(但显然也会限制利用多核和多线程并行性的能力)。您可以使用Windows任务管理器将进程关联性设置为仅一个CPU /核心,从而将进程固定到单个CPU。
如果这样做可以显着提高单个线程的性能,那么您将看到我注意到的相同问题。而且,我认为那不是您的代码有问题,而是Cygwin的问题。
答案 1 :(得分:0)
我很想知道Windows的性能与golang中多线程内存转换问题的Linux性能相比如何,因此我将代码移植到尽可能接近原始代码的位置,然后做了一些相同的事情在类似的硬件平台上进行性能测试。
与发布的问题中看到的结果不同,随着同时进行的操作数量的增加,golang代码没有爆炸。相应的性能图表为:
Num Threads Time in Process
1 4000
2 4100
4 4200
6 3600
12 3600
16 3800
24 3700
这些结果比您在Linux上运行的C代码中显示的结果要慢得多。
不确定这是否有帮助,但看起来Windows 10确实存在一个普遍问题:在执行某些内存操作时多线程性能会下降;而且该问题似乎专门影响C代码的性能——正如问题中所述,无论是用cl还是gcc(cygwin)编译,生成的代码都表现出这种情况。
golang代码为:
package main
import "fmt"
import "os"
import "time"
import "strconv"
func rwtest(intsToCopy int, inData *[]int, outData *[]int) {
var i int
var j int
j = 0
for i=0 ; i<intsToCopy ; i++ {
(*outData)[j + 0] = (*inData)[i]
(*outData)[j + 1] = (*inData)[i]
(*outData)[j + 2] = (*inData)[i]
j += 3
}
}
func workerProc(threadNum int, reportChan chan int, numLoops int, dataSize int) {
var i int
var inData []int
var outData []int
var cumulativeTime time.Duration
cumulativeTime = 0
for i=0 ; i<numLoops ; i++ {
inData = make([]int, dataSize, dataSize)
outData = make([]int, dataSize * 3, dataSize * 3)
startTime := time.Now()
rwtest(dataSize, &inData, &outData)
endTime := time.Now()
cumulativeTime += endTime.Sub(startTime)
inData = nil
outData = nil
}
// Print out the cumulative time
fmt.Printf("Thread %d duration is %d\n", threadNum, cumulativeTime)
// Write out to the channel
reportChan <- 0
}
func main() {
var i int
if len(os.Args) != 4 {
fmt.Printf("Usage: %s <num threads> <num loops> <data size>\n", os.Args[0])
return
}
numThreads, _ := strconv.Atoi(os.Args[1])
numLoops, _ := strconv.Atoi(os.Args[2])
dataSize, _ := strconv.Atoi(os.Args[3])
fmt.Printf("Running Program with %d threads, with %d loops\n", numThreads, numLoops)
// Make a channel for each thread
var chans []chan int
for i=0 ; i<numThreads ; i++ {
chans = append(chans, make(chan int))
}
// start the threads
for i=0 ; i<numThreads ; i++ {
go workerProc(i, chans[i], numLoops, dataSize)
}
var x int
// Loop through the channels, waiting for each go routine to finish
for i=0 ; i<numThreads ; i++ {
x = <-chans[i]
}
fmt.Printf("Done: %d\n", x)
}
答案 2 :(得分:0)
YouTube频道 Level1Techs 也在Threadripper处理器上观察到了这一现象。长话短说:Windows 10内核似乎在程序运行期间过于频繁地把线程在相距很远的核心之间来回迁移。 https://www.youtube.com/watch?v=M2LOMTpCtLA
我也不知道这是否也是Server 2016或2019内核的问题。我自己是Threadripper 2950x的新所有者,我真的很想解决这个问题。