我正在编写一个程序来对数组中的N个数字求和。为此,程序可以将其执行分成多个线程,每个线程对数组的N / numThreads位置求和。
然而,到目前为止我执行的最快的执行时间是使用1个线程。使用4个线程可能需要两倍的时间!我已经对这个问题做了很多考虑(包括虚假分享的可能性),但我无法想到解决方案。我真的无法看到发生虚假分享的地方,如果有的话。
我正在运行Linux Mint并通过输入gcc spinlock.c -o spinlock -lpthread -lm使用GCC进行编译 我的代码如下。请随意编译并随意使用它。您可以通过更改程序中的“定义”来更改N(数组中元素的数量)的值,并且可以将任意数量的线程作为程序的参数传递:
#define _GNU_SOURCE
#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <sys/types.h>
#include <inttypes.h>
#include <sched.h>
#include <math.h>
#include <time.h>
#define N 10000000
long sumTotal = 0; /* Global sum variable */
int lock = 0; /* Lock for sum variable access */
struct threadInfo { /* Argument passed to the function the thread will execute */
pthread_t threadId; /* Returned ID by pthread_create() */
int threadNum; /* Thread number (from 1 to threadNum) */
unsigned long sectionSize; /* Section size to operate */
int8_t* section; /* Pointer to start of the array to be summed */
};
void acquire(int* _lock) {
while (__sync_lock_test_and_set(_lock, 1));
}
void release(int* _lock) {
*_lock = 0;
}
void sumNum(void* arg){
struct threadInfo *tinfo = arg;
long i, sum = 0;
for (i=0; i < tinfo->sectionSize; i++) {
sum += tinfo->section[i];
}
acquire(&lock);
sumTotal += sum;
release(&lock);
}
int main(int argc, char** argv)
{
int8_t* numbers;
unsigned long sectionSize, currNum;
unsigned int currThr, numThreads;
int err, num_cores, i, core_interval, t1, t2;
double arithmetics;
pthread_attr_t attr;
struct threadInfo *tinfo;
/* Invalid number of arguments: */
if (argc != 2){
fprintf(stderr, "Usage: %s [threads]\n", argv[0]);
exit(-1);
}
/* Converts argv to int: (K) */
numThreads = (int) strtol(argv[1], (char**) NULL, 10);
sectionSize = N/numThreads;
/* Randoms the seed: */
srand(time(NULL));
/* Fills array with numbers: */
numbers = (int8_t*) malloc(N);
for (currNum=0; currNum<N; currNum++){
numbers[currNum] = (int8_t) (((rand())%199)-99);
}
/* Initializes struct containing default attributes of threads: */
err = pthread_attr_init(&attr);
if (err != 0){
fprintf(stderr, "Failed.\n");
exit(-2);
}
/* Allocates memory for pthread_create arguments: */
tinfo = calloc(numThreads, sizeof(struct threadInfo));
if (tinfo == NULL){
fprintf(stderr, "Failed.\n");
exit(-3);
}
/* Thread affinity operations: */
num_cores = sysconf(_SC_NPROCESSORS_ONLN);
cpu_set_t cpuset[num_cores];
for (i=0; i<num_cores; i++){
CPU_ZERO(&(cpuset[i]));
CPU_SET(i+1, &(cpuset[i]));
}
core_interval = N/num_cores;
t1 = clock();
/* Initialize threads: */
for (currThr=0; currThr<numThreads; currThr++){
tinfo[currThr].threadNum = currThr + 1;
tinfo[currThr].section = &(*(numbers+currThr*sectionSize));
/* If this is the last thread, takes the remainder of the array: */
if (currThr+1 == numThreads) {
tinfo[currThr].sectionSize = N - (currThr*sectionSize);
/* Other threads: */
} else {
tinfo[currThr].sectionSize = sectionSize;
}
/* Calculates to which core a thread should go: */
arithmetics = (double) (currThr+1)*num_cores/numThreads;
i = ceil(arithmetics);
tinfo[currThr].threadId = pthread_self();
/* This line, if uncommented, sets a group of threads to a specific cpu (core): */
//pthread_setaffinity_np(tinfo[currThr].threadId, sizeof(cpu_set_t), &cpuset[i-1]);
/* Creates the thread: */
err = pthread_create(&tinfo[currThr].threadId,
&attr,
(void*) &sumNum,
&tinfo[currThr]);
if (err != 0){
fprintf(stderr, "Failed to create thread.\n");
exit(-4);
}
}
void* ret = NULL;
/* Joins threads: */
for (currThr=0; currThr<numThreads; currThr++) {
pthread_join(tinfo[currThr].threadId, &ret);
}
t2 = clock();
printf("Total sum: %ld. Total time: %f\n", sumTotal, (t2-t1*1.0)/CLOCKS_PER_SEC);
return 0;
}
感谢您的时间,感谢任何帮助!