同一段串行代码在单独运行时与和并行代码一起运行时,所花费的时间不同

时间:2016-01-01 20:48:28

标签: c multithreading parallel-processing

下面的程序分别使用串行、CPU并行(OpenMP)和GPU并行(CUDA)三种方法计算两个向量的点积。以下代码段显示了如何调用这些函数,以及如何计算所经过的时间。

// Command-line mode flags. main() below compares argv[1] against
// SEQUENTIAL, PARALLEL and TEST_AND_COMPARE; CUDA and VERIFY are not
// referenced in the code shown here.
#define SEQUENTIAL          "-s"
#define PARALLEL            "-p"
#define CUDA                "-c"
#define VERIFY              "-v"
#define TEST_AND_COMPARE    "-t"

// GET_TIME(x): stores the current CLOCK_MONOTONIC reading in x (a
// struct timespec lvalue). If clock_gettime() fails it reports the error
// with perror() and terminates the program with EXIT_FAILURE.
//
// NOTE(review): the ';' directly after the parameter list is part of the
// replacement text, so every use expands to an empty statement followed by
// a bare `if (...) { ... }`. The expansion is not wrapped in
// do { ... } while (0), so it does not behave as a single statement when
// used as the unbraced body of an if/else or loop. All uses shown here sit
// inside braces.
#define GET_TIME(x); if (clock_gettime(CLOCK_MONOTONIC, &(x)) < 0)  {   perror("clock_gettime( ):");exit(EXIT_FAILURE);}

// Timing driver (excerpt). Selects a mode from argv[1]:
//   SEQUENTIAL       ("-s")            times sequentialVersion() only
//   PARALLEL         ("-p" <threads>)  times parallelVersion() only
//   TEST_AND_COMPARE ("-t" <threads>)  times both and prints each answer
// Each measurement brackets the call with GET_TIME() and converts the two
// timestamps to milliseconds with elapsed_time_msec().
//
// N, noOfThreads, precision, sequentialVersion() and parallelVersion() are
// not declared in this excerpt; presumably they are defined elsewhere in
// the original file.
//
// NOTE(review): argv[1] and argv[2] are used without checking argc, so
// running the program without the expected arguments passes a null
// pointer to strcmp()/atoi().
int main(int argc, char **argv) {

    struct timespec t1, t2, t3, t4;
    unsigned long sec, nsec;
    float comp_time;

    //invoking the sequential version
    if (!strcmp(argv[1], SEQUENTIAL)) {
        GET_TIME(t1);
        // The return value is discarded here, whereas the TEST_AND_COMPARE
        // branch below stores and prints it. This difference is what the
        // accompanying discussion identifies as the cause of the
        // near-zero time reported for this mode at -O3.
        sequentialVersion();
        GET_TIME(t2);
        comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
        printf("N=%d: Time(ms)=%.5f \n", N, comp_time);
    }

    //invoking the parallel version
    else if (!strcmp(argv[1], PARALLEL)) {
        // Thread count comes from the second command-line argument.
        noOfThreads = atoi(argv[2]);
        GET_TIME(t1);
        // Return value is discarded here as well.
        parallelVersion();
        GET_TIME(t2);
        comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
        printf("N=%d: Threads=%d: Time(ms)=%.5f \n", N, noOfThreads,
                comp_time);
    }

    //the cuda invoke goes here...
    // (the CUDA branch is elided from this excerpt)

    //comparing the answers received by each method of calculation
    else if (!strcmp(argv[1], TEST_AND_COMPARE)) {

        // answer3 is not used in the code shown; presumably it holds the
        // result of the elided CUDA run.
        precision answer1, answer2, answer3;

        // Serial run: the result is stored and printed.
        GET_TIME(t1);
        answer1 = sequentialVersion();
        GET_TIME(t2);
        comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
        printf("%-10s\tN=%d: Ans=%f: Time(ms)=%.5f \n", "Serial", N, answer1, comp_time);

        // Parallel run, timed with its own pair of timestamps (t3/t4).
        noOfThreads = atoi(argv[2]);
        GET_TIME(t3);
        answer2 = parallelVersion();
        GET_TIME(t4);
        comp_time = elapsed_time_msec(&t3, &t4, &sec, &nsec);
        printf("%-10s\tN=%d: Ans=%f: Time(ms)=%.5f Threads=%d \n", "Parallel",  N, answer2, comp_time, noOfThreads);
    }
}

/*
 * Computes the interval between two timestamps.
 *
 * begin, end : start and end timestamps.
 * sec, nsec  : outputs; receive the whole-second and nanosecond parts of
 *              (end - begin).
 *
 * Returns the same interval expressed in milliseconds as a float.
 */
float elapsed_time_msec(struct timespec *begin, struct timespec *end,
        unsigned long *sec, unsigned long *nsec) {
    time_t whole_seconds = end->tv_sec - begin->tv_sec;
    long nano_part = end->tv_nsec - begin->tv_nsec;

    /* The end nanosecond field is smaller: borrow one second. */
    if (nano_part < 0) {
        nano_part += 1000000000;
        whole_seconds -= 1;
    }

    *sec = whole_seconds;
    *nsec = nano_part;

    return (float) *sec * 1000 + (float) *nsec / 1000000;
}

上述程序的Makefile如下。

# Floating-point precision: pass any non-empty value for `double`
# (e.g. `make dot double=1`) to compile with -D USE_DOUBLES; when `double`
# is unset or empty, no precision flag is passed.
ifeq ($(double),)
    precision=
else
    precision=-D USE_DOUBLES
endif

# Problem size: `make dot N=<count>` sets PROBLEM_SIZE; when N is unset or
# empty, PROBLEM_SIZE defaults to 1000000.
ifeq ($(N),)
    problem-size=-D PROBLEM_SIZE=1000000
else
    problem-size=-D PROBLEM_SIZE=${N}
endif

# `dot` is a command name, not a file this rule creates (the output is
# `prog`), so declare it phony.
.PHONY: dot

# Builds dot-product.cu into ./prog with OpenMP enabled for the host
# compiler and -O3 optimization. The recipe line must begin with a TAB.
dot:
	nvcc dot-product.cu -arch compute_11 -Xcompiler -fopenmp -O3 $(problem-size) $(precision) -o prog

使用 make dot 编译代码(N 取默认值),然后以 ./prog -s 运行,输出如下:

`N=1000000: Time(ms)=0.00010`

但是使用相同的N,当程序以./prog -t 6运行时,串行时间消耗显示预期的行为,如下所示

Serial      N=1000000: Ans=2249052.500000: Time(ms)=2.19174 
Parallel    N=1000000: Ans=2248955.500000: Time(ms)=0.53915 Threads=6 
Cuda        N=1000000: Ans=2248959.750000: Time(ms)=0.09935 

为什么会这样?

1 个答案:

答案 0 :(得分:2)

虽然提供完整的代码会更好,但我认为,同一个函数在 SEQUENTIAL 测试(-s)和 TEST_AND_COMPARE 测试(-t)中耗时不同,原因在于这两种情况下调用 sequentialVersion() 函数的方式不同(-s 情况下丢弃了返回值,-t 情况下保存并打印了返回值),再加上您指定了激进的编译器优化(-O3)。

这是一个有效的测试用例,它表现出大致相同的行为差异:

$ cat t1017.cu
#include <time.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>

// Loop bound used by sequentialVersion() and the value printed as "N=".
#define N 1000000


// Command-line mode flags. main() compares argv[1] against SEQUENTIAL,
// PARALLEL and TEST_AND_COMPARE; CUDA and VERIFY are not referenced in
// this test case.
#define SEQUENTIAL          "-s"
#define PARALLEL            "-p"
#define CUDA                "-c"
#define VERIFY              "-v"
#define TEST_AND_COMPARE    "-t"

// GET_TIME(x): stores the current CLOCK_MONOTONIC reading in x (a
// struct timespec lvalue). If clock_gettime() fails it reports the error
// with perror() and terminates the program with EXIT_FAILURE.
//
// NOTE(review): the ';' directly after the parameter list is part of the
// replacement text, so every use expands to an empty statement followed by
// a bare `if (...) { ... }`. The expansion is not wrapped in
// do { ... } while (0), so it does not behave as a single statement when
// used as the unbraced body of an if/else or loop. All uses in this test
// case sit inside braces.
#define GET_TIME(x); if (clock_gettime(CLOCK_MONOTONIC, &(x)) < 0)  {   perror("clock_gettime( ):");exit(EXIT_FAILURE);}

// Scalar type returned by sequentialVersion() and parallelVersion().
typedef float precision;

precision sequentialVersion() {precision retval = 0.0f; for (int i=0; i<N; i++) retval += (precision)i; return retval; }
// Stand-in for the parallel implementation: sleeps for one second and
// returns 0.
precision parallelVersion() {
    sleep(1);
    return 0.0f;
}
/*
 * Computes the interval between two timestamps.
 *
 * begin, end : start and end timestamps.
 * sec, nsec  : outputs; receive the whole-second and nanosecond parts of
 *              (end - begin).
 *
 * Returns the same interval expressed in milliseconds as a float.
 */
float elapsed_time_msec(struct timespec *begin, struct timespec *end,
        unsigned long *sec, unsigned long *nsec) {
    time_t whole_seconds = end->tv_sec - begin->tv_sec;
    long nano_part = end->tv_nsec - begin->tv_nsec;

    /* The end nanosecond field is smaller: borrow one second. */
    if (nano_part < 0) {
        nano_part += 1000000000;
        whole_seconds -= 1;
    }

    *sec = whole_seconds;
    *nsec = nano_part;

    return (float) *sec * 1000 + (float) *nsec / 1000000;
}


// Prints the accepted command lines to stderr.
static void print_usage(const char *prog) {
    fprintf(stderr, "usage: %s %s | %s <threads> | %s <threads>\n",
            prog, SEQUENTIAL, PARALLEL, TEST_AND_COMPARE);
}

// Timing driver for the test case. Selects a mode from argv[1]:
//   SEQUENTIAL       ("-s")            times sequentialVersion() only
//   PARALLEL         ("-p" <threads>)  times parallelVersion() only
//   TEST_AND_COMPARE ("-t" <threads>)  times both and prints each answer
// Each measurement brackets the call with GET_TIME() and converts the two
// timestamps to milliseconds with elapsed_time_msec().
//
// Returns EXIT_FAILURE (after printing a usage line) when the mode flag or
// the thread-count argument required by -p / -t is missing; otherwise 0.
// An unrecognized mode flag does nothing and returns 0.
int main(int argc, char **argv) {

    struct timespec t1, t2, t3, t4;
    unsigned long sec, nsec;
    float comp_time;
    int noOfThreads;

    const char *prog = (argc > 0 && argv[0]) ? argv[0] : "prog";

    // Without a mode flag argv[1] is NULL and strcmp() would dereference it.
    if (argc < 2) {
        print_usage(prog);
        return EXIT_FAILURE;
    }

    // -p and -t read the thread count from argv[2]; atoi(NULL) is undefined.
    if ((!strcmp(argv[1], PARALLEL) || !strcmp(argv[1], TEST_AND_COMPARE))
            && argc < 3) {
        print_usage(prog);
        return EXIT_FAILURE;
    }

    //invoking the sequential version
    if (!strcmp(argv[1], SEQUENTIAL)) {
        GET_TIME(t1);
        // The return value is deliberately discarded: this is the
        // construct the test case demonstrates. When built with -O3 the
        // compiler can drop the unused computation, which is why this mode
        // reports a near-zero time. Store and use the result to time the
        // real work.
        sequentialVersion();
        GET_TIME(t2);
        comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
        printf("N=%d: Time(ms)=%.5f \n", N, comp_time);
    }

    //invoking the parallel version
    else if (!strcmp(argv[1], PARALLEL)) {
        noOfThreads = atoi(argv[2]);
        GET_TIME(t1);
        parallelVersion();
        GET_TIME(t2);
        comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
        printf("N=%d: Threads=%d: Time(ms)=%.5f \n", N, noOfThreads,
                comp_time);
    }

    //the cuda invoke goes here...

    //comparing the answers received by each method of calculation
    else if (!strcmp(argv[1], TEST_AND_COMPARE)) {

        precision answer1, answer2;

        // Serial run: the result is stored and printed, so the call
        // cannot be optimized away.
        GET_TIME(t1);
        answer1 = sequentialVersion();
        GET_TIME(t2);
        comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
        printf("%-10s\tN=%d: Ans=%f: Time(ms)=%.5f \n", "Serial", N, answer1, comp_time);

        // Parallel run, timed with its own pair of timestamps (t3/t4).
        noOfThreads = atoi(argv[2]);
        GET_TIME(t3);
        answer2 = parallelVersion();
        GET_TIME(t4);
        comp_time = elapsed_time_msec(&t3, &t4, &sec, &nsec);
        printf("%-10s\tN=%d: Ans=%f: Time(ms)=%.5f Threads=%d \n", "Parallel",  N, answer2, comp_time, noOfThreads);
    }

    return 0;
}


$ nvcc -o t1017 t1017.cu
$ ./t1017 -s
N=1000000: Time(ms)=3.61435
$ nvcc -O3 -o t1017 t1017.cu
$ ./t1017 -s
N=1000000: Time(ms)=0.00068
$ ./t1017 -t 6
Serial          N=1000000: Ans=499940360192.000000: Time(ms)=1.40843
Parallel        N=1000000: Ans=0.000000: Time(ms)=1000.16150 Threads=6
$

请注意,在未指定优化的情况下编译代码时,-s情况下的时序为几毫秒。当我们在-O3情况下编译时,-s测试中的时间几乎为零,但在-t测试中仍然是几毫秒。

要解决此问题,只需在每一处调用 sequentialVersion() 的地方,把它的返回值赋给一个变量(不要忽略返回值)。也就是说,不要这样写:

    sequentialVersion();

这样做:

    precision temp = sequentialVersion();

您可能还需要在之后打印或以其他方式“使用” temp 的值。这样做之后,编译器就无法把这段串行代码优化掉。

正如已经指出的那样,这个问题与CUDA无关。您可以使用我显示的代码,将其放在.cpp文件而不是.cu文件中,并使用g ++而不是nvcc进行编译,并且可以看到相同的特征。由于代码中没有设备代码,因此无论如何nvcc都会将其交给主机编译器。

宏免责声明:

虽然使用您展示的特定计时宏可能存在风格问题和/或危险,但我相信:

  1. 它不会影响您在此问题中实际询问的问题
  2. 它确实以我认为的方式工作,在这个特定的程序中以这种特殊的方式使用。
  3. 由于我认为这个宏并不影响本问题,因此我选择将其原样保留,以说明(在此测试用例中)无需修改计时宏即可解决问题。当我需要进行基于主机的计时时,我通常使用普通函数,例如我在此处(原文 here 链接)演示的那样。如果您对该宏本身的相关问题有疑问,可以将其作为单独的问题提出。我认为这个问题不需要使用 cuda 标签。