诊断

Question

我有2个未排序的数组和2个这些数组的副本。我使用两个不同的线程来排序两个数组，然后我逐个排序其他两个未排序的数组。我认为线程进程会更快，但事实并非如此，那么线程如何花费更多时间呢？

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>

struct thread_data
{
    int count;
    unsigned int *arr;
};

struct thread_data thread_data_array[2];

void insertionSort(unsigned int arr[], int n)
{
   int i, key, j;
   for (i = 1; i < n; i++)
   {
       key = arr[i];
       j = i-1;

       while (j >= 0 && arr[j] > key)
       {
           arr[j+1] = arr[j];
           j = j-1;
       }
       arr[j+1] = key;
   }
}

void *sortAndMergeArrays(void *threadarg)
{
    int count;
    unsigned int *arr;
    struct thread_data *my_data;

    my_data = (struct thread_data *) threadarg;
    count = my_data->count;
    arr =  my_data->arr;

    insertionSort(arr, count);

    pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
    int count, i, rc;
    clock_t start, end, total_t;
    pthread_t threads[2];

    //get the loop count. If loop count is not provided take 10000 as default loop count.
    if(argc == 2){
        count = atoi(argv[1]);
    }
    else{
        count = 10000;
    }

    unsigned int arr1[count], arr2[count], copyArr1[count], copyArr2[count];    

    srand(time(0));

    for(i = 0; i<count; i++){
        arr1[i] = rand();
        arr2[i] = rand();

        copyArr1[i] = arr1[i];
        copyArr2[i] = arr2[i];
    }

    start = clock();
    for(int t=0; t<2; t++) {
        thread_data_array[t].count = count;
        if(t==0)
            thread_data_array[t].arr = arr1;
        else
            thread_data_array[t].arr = arr2;    

        rc = pthread_create(&threads[t], NULL, sortAndMergeArrays, (void *) &thread_data_array[t]);
        if (rc) {
                printf("ERROR; return code from pthread_create() is %d\n", rc);
                exit(-1);
            }
    }

    pthread_join(threads[0], NULL);
    pthread_join(threads[1], NULL);
    end = clock();

    total_t = (double)(end - start);
    printf("Total time taken by CPU to sort using threads: %d\n", total_t);


    start = clock();
    insertionSort(copyArr1, count);
    insertionSort(copyArr2, count);
    end = clock();

    total_t = (double)(end - start);
    printf("Total time taken by CPU to sort sequentially: %d\n", total_t);

    pthread_exit(NULL);
}

我正在使用Linux服务器来执行代码。首先，我随机填充数组并将它们复制到两个单独的数组。对于前两个数组，我使用pthread创建两个线程并将两个数组传递给它们，它使用插入排序对它们进行排序。而对于其他两个数组，我只是逐个排序。

我预计通过使用线程我会减少执行时间但实际上需要更多时间。

Answer 1

诊断

你得到几乎相同的时间 - 从线程代码比从顺序代码稍微多一点时间 - 的原因是clock()测量CPU时间，两种排序方式占用几乎相同的CPU时间，因为他们正在做同样的工作（由于设置和拆除线程的时间，线程数可能略大）。

clock()函数应返回实现对自进程使用的处理器时间的最佳近似值，因为实现定义的时代仅与进程调用相关。

BSD（macOS）手册页：

clock()函数确定自调用调用进程以来使用的处理器时间量，以CLOCKS_PER_SECs为单位测量。

对两个数组进行排序所需的CPU时间基本相同;差异是线程的开销（或多或少）。

修订代码

我有一组可以使用clock_gettime()的功能（timer.c中的代码和GitHub中的timer.h）。这些测量挂钟时间 - 经过的时间，而不是CPU时间。

这里是您的代码的温和调整版本 - 实质性更改正在将排序函数中key的类型从int更改为unsigned int以匹配数据中的数据数组，并修复%d到%ld的转换规范，以匹配GCC标识为clock_t的类型。我温和地调整了参数处理和时序消息，使它们的长度保持一致，并添加了经过时间的测量代码：

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include "timer.h"

struct thread_data
{
    int count;
    unsigned int *arr;
};

struct thread_data thread_data_array[2];

static
void insertionSort(unsigned int arr[], int n)
{
    for (int i = 1; i < n; i++)
    {
        unsigned int key = arr[i];
        int j = i - 1;

        while (j >= 0 && arr[j] > key)
        {
            arr[j + 1] = arr[j];
            j = j - 1;
        }
        arr[j + 1] = key;
    }
}

static
void *sortAndMergeArrays(void *threadarg)
{
    int count;
    unsigned int *arr;
    struct thread_data *my_data;

    my_data = (struct thread_data *)threadarg;
    count = my_data->count;
    arr =  my_data->arr;

    insertionSort(arr, count);

    pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
    int count = 10000;
    int i, rc;
    clock_t start, end, total_t;
    pthread_t threads[2];

    // get the loop count. If loop count is not provided take 10000 as default loop count.
    if (argc == 2)
        count = atoi(argv[1]);

    unsigned int arr1[count], arr2[count], copyArr1[count], copyArr2[count];

    srand(time(0));

    for (i = 0; i < count; i++)
    {
        arr1[i] = rand();
        arr2[i] = rand();

        copyArr1[i] = arr1[i];
        copyArr2[i] = arr2[i];
    }

    Clock clk;
    clk_init(&clk);

    start = clock();
    clk_start(&clk);
    for (int t = 0; t < 2; t++)
    {
        thread_data_array[t].count = count;
        if (t == 0)
            thread_data_array[t].arr = arr1;
        else
            thread_data_array[t].arr = arr2;

        rc = pthread_create(&threads[t], NULL, sortAndMergeArrays, (void *)&thread_data_array[t]);
        if (rc)
        {
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }

    pthread_join(threads[0], NULL);
    pthread_join(threads[1], NULL);
    clk_stop(&clk);
    end = clock();

    char buffer[32];
    printf("Elapsed using threads:  %s s\n", clk_elapsed_us(&clk, buffer, sizeof(buffer)));

    total_t = (double)(end - start);
    printf("CPU time using threads: %ld\n", total_t);

    start = clock();
    clk_start(&clk);
    insertionSort(copyArr1, count);
    insertionSort(copyArr2, count);
    clk_stop(&clk);
    end = clock();

    printf("Elapsed sequentially:   %s s\n", clk_elapsed_us(&clk, buffer, sizeof(buffer)));
    total_t = (double)(end - start);
    printf("CPU time sequentially:  %ld\n", total_t);

    return 0;
}

结果

示例运行（程序inssortthread23） - 在带有16 GiB RAM和2.7 GHz Intel Core i7 CPU的MacBook Pro（15＆＃34; 2016）上运行，运行macOS High Sierra 10.13，使用GCC 7.2.0汇编。我有例行的后台程序 - 例如浏览器没有被主动使用，没有音乐或视频播放，没有正在进行的下载等。（这些事情对于基准测试非常重要。）

$ inssortthread23 100000
Elapsed using threads:  1.060299 s
CPU time using threads: 2099441
Elapsed sequentially:   2.146059 s
CPU time sequentially:  2138465
$ inssortthread23 200000
Elapsed using threads:  4.332935 s
CPU time using threads: 8616953
Elapsed sequentially:   8.496348 s
CPU time sequentially:  8469327
$ inssortthread23 300000
Elapsed using threads:  9.984021 s
CPU time using threads: 19880539
Elapsed sequentially:   20.000900 s
CPU time sequentially:  19959341
$

结论

在这里，您可以清楚地看到：

非线程代码的经过时间大约是线程代码的两倍。
线程和非线程代码的CPU时间几乎相同。
整体时间是排序行数的二次方。

所有这些都非常符合预期 - 一旦你意识到clock()正在测量CPU时间，而不是经过时间。

小谜题

您还可以看到，在某些情况下，我的线程CPU时间略小于顺序排序的CPU时间。我没有对此作出解释 - 我认为它已经在噪音中消失了，尽管效果持续存在：

$ inssortthread23 100000
Elapsed using threads:  1.051229 s
CPU time using threads: 2081847
Elapsed sequentially:   2.138538 s
CPU time sequentially:  2132083
$ inssortthread23 100000
Elapsed using threads:  1.053656 s
CPU time using threads: 2089886
Elapsed sequentially:   2.128908 s
CPU time sequentially:  2122983
$ inssortthread23 100000
Elapsed using threads:  1.058283 s
CPU time using threads: 2093644
Elapsed sequentially:   2.126402 s
CPU time sequentially:  2120625
$

$ inssortthread23 200000
Elapsed using threads:  4.259660 s
CPU time using threads: 8479978
Elapsed sequentially:   8.872929 s
CPU time sequentially:  8843207
$ inssortthread23 200000
Elapsed using threads:  4.463954 s
CPU time using threads: 8883267
Elapsed sequentially:   8.603401 s
CPU time sequentially:  8580240
$ inssortthread23 200000
Elapsed using threads:  4.227154 s
CPU time using threads: 8411582
Elapsed sequentially:   8.816412 s
CPU time sequentially:  8797965
$

使用2个线程对2个数组进行排序比逐个对2个数组进行排序需要更多的时间

1 个答案:

诊断

修订代码

结果

结论

小谜题