Question

我正在编写一个程序来对数组中的N个数字求和。为此，程序可以将其执行分成多个线程，每个线程对数组的N / numThreads位置求和。

然而，到目前为止我执行的最快的执行时间是使用1个线程。使用4个线程可能需要两倍的时间！我已经对这个问题做了很多考虑（包括虚假分享的可能性），但我无法想到解决方案。我真的无法看到发生虚假分享的地方，如果有的话。

我正在运行Linux Mint并通过输入gcc spinlock.c -o spinlock -lpthread -lm使用GCC进行编译我的代码如下。请随意编译并随意使用它。您可以通过更改程序中的“定义”来更改N（数组中元素的数量）的值，并且可以将任意数量的线程作为程序的参数传递：

#define _GNU_SOURCE

#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>
#include <sys/types.h>
#include <inttypes.h>
#include <sched.h>
#include <math.h>
#include <time.h>

#define N 10000000

long sumTotal = 0; /* Global sum variable */
int lock = 0;    /* Lock for sum variable access */

struct threadInfo {            /* Argument passed to the function the thread will execute */
    pthread_t     threadId;    /* Returned ID by pthread_create() */
    int           threadNum;   /* Thread number (from 1 to threadNum) */
    unsigned long sectionSize; /* Section size to operate */
    int8_t*       section;     /* Pointer to start of the array to be summed */
};

void acquire(int* _lock) {
    while (__sync_lock_test_and_set(_lock, 1));
}

void release(int* _lock) {
    *_lock = 0;
}

void sumNum(void* arg){
    struct threadInfo *tinfo = arg;
    long i, sum = 0;

    for (i=0; i < tinfo->sectionSize; i++) {
        sum += tinfo->section[i];
    }


    acquire(&lock);
    sumTotal += sum;
    release(&lock);
}

int main(int argc, char** argv)
{
    int8_t* numbers;
    unsigned long sectionSize, currNum;
    unsigned int currThr, numThreads;
    int err, num_cores, i, core_interval, t1, t2;
    double arithmetics;
    pthread_attr_t attr;
    struct threadInfo *tinfo;

    /* Invalid number of arguments: */
    if (argc != 2){
        fprintf(stderr, "Usage: %s [threads]\n", argv[0]);
        exit(-1);
    }

    /* Converts argv to int: (K) */
    numThreads = (int) strtol(argv[1], (char**) NULL, 10);
    sectionSize = N/numThreads;

    /* Randoms the seed: */
    srand(time(NULL));

    /* Fills array with numbers: */
    numbers = (int8_t*) malloc(N);
    for (currNum=0; currNum<N; currNum++){
        numbers[currNum] = (int8_t) (((rand())%199)-99);
    }

    /* Initializes struct containing default attributes of threads: */
    err = pthread_attr_init(&attr);
    if (err != 0){
        fprintf(stderr, "Failed.\n");
        exit(-2);
    }

    /* Allocates memory for pthread_create arguments: */
    tinfo = calloc(numThreads, sizeof(struct threadInfo));
    if (tinfo == NULL){
        fprintf(stderr, "Failed.\n");
        exit(-3);
    }

    /* Thread affinity operations: */
    num_cores = sysconf(_SC_NPROCESSORS_ONLN);
    cpu_set_t cpuset[num_cores];
    for (i=0; i<num_cores; i++){
        CPU_ZERO(&(cpuset[i]));
        CPU_SET(i+1, &(cpuset[i]));
    }
    core_interval = N/num_cores;

    t1 = clock();

    /* Initialize threads: */
    for (currThr=0; currThr<numThreads; currThr++){
        tinfo[currThr].threadNum = currThr + 1;
        tinfo[currThr].section = &(*(numbers+currThr*sectionSize));
        /* If this is the last thread, takes the remainder of the array:     */
        if (currThr+1 == numThreads) {
            tinfo[currThr].sectionSize = N - (currThr*sectionSize);
        /* Other threads: */
        } else {
            tinfo[currThr].sectionSize = sectionSize;
        }

        /* Calculates to which core a thread should go: */
        arithmetics = (double) (currThr+1)*num_cores/numThreads;
        i = ceil(arithmetics);
        tinfo[currThr].threadId = pthread_self();
        /* This line, if uncommented, sets a group of threads to a specific cpu (core): */
        //pthread_setaffinity_np(tinfo[currThr].threadId, sizeof(cpu_set_t), &cpuset[i-1]);

        /* Creates the thread: */
        err = pthread_create(&tinfo[currThr].threadId,
                             &attr,
                             (void*) &sumNum,
                             &tinfo[currThr]);
        if (err != 0){
            fprintf(stderr, "Failed to create thread.\n");
            exit(-4);
        }
    }

    void* ret = NULL;

    /* Joins threads: */
    for (currThr=0; currThr<numThreads; currThr++) {
        pthread_join(tinfo[currThr].threadId, &ret);
    }

    t2 = clock();

    printf("Total sum: %ld. Total time: %f\n", sumTotal, (t2-t1*1.0)/CLOCKS_PER_SEC);
    return 0;
}

感谢您的时间，感谢任何帮助！

具有4个线程的数组总和比1个线程慢

0 个答案: