下面的程序分别使用顺序、CPU 并行(使用 OpenMP)和 GPU 并行(CUDA)三种方法计算两个向量的点积。以下代码段展示了如何调用这些函数以及如何计算经过的时间。
/*
 * Command-line mode flags; main() compares argv[1] against them.
 *   SEQUENTIAL        time the sequential version only
 *   PARALLEL          time the CPU-parallel version only (argv[2] is read as the thread count)
 *   CUDA, VERIFY      defined here, but their handling is not part of this listing
 *   TEST_AND_COMPARE  run the versions back to back and print each answer and time
 */
#define SEQUENTIAL "-s"
#define PARALLEL "-p"
#define CUDA "-c"
#define VERIFY "-v"
#define TEST_AND_COMPARE "-t"
/*
 * GET_TIME(x): store the current CLOCK_MONOTONIC time in the struct timespec x.
 * If clock_gettime() fails, the error is reported with perror() and the
 * program exits with EXIT_FAILURE.
 *
 * NOTE(review): the ';' directly after the parameter list belongs to the
 * replacement text, so each expansion begins with an empty statement followed
 * by a bare `if`. That is harmless for the plain `GET_TIME(t);` statements in
 * this listing, but it would not behave as one statement inside an unbraced
 * if/else.
 */
#define GET_TIME(x); if (clock_gettime(CLOCK_MONOTONIC, &(x)) < 0) { perror("clock_gettime( ):");exit(EXIT_FAILURE);}
/*
 * Program entry point: picks a benchmark mode from argv[1], brackets the
 * selected routine with GET_TIME() calls and prints the elapsed time computed
 * by elapsed_time_msec().
 *
 * NOTE(review): argv[1] and argv[2] are used without checking argc.
 * NOTE(review): N, noOfThreads, precision, sequentialVersion() and
 * parallelVersion() are not defined in this listing; presumably they are
 * declared elsewhere in the file.
 */
int main(int argc, char **argv) {
// t1/t2 bracket the timed region in every mode; t3/t4 bracket the second timed region of the -t mode
struct timespec t1, t2, t3, t4;
// out-parameters of elapsed_time_msec(): whole seconds and remaining nanoseconds
unsigned long sec, nsec;
// elapsed time in milliseconds, as returned by elapsed_time_msec()
float comp_time;
//invoking the sequential version
if (!strcmp(argv[1], SEQUENTIAL)) {
GET_TIME(t1);
// NOTE(review): the return value is discarded here, whereas the -t branch
// below stores and prints it. With optimization enabled a compiler may drop
// work whose result is never used, so this timing may not reflect the
// actual computation.
sequentialVersion();
GET_TIME(t2);
comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
printf("N=%d: Time(ms)=%.5f \n", N, comp_time);
}
//invoking the parallel version
else if (!strcmp(argv[1], PARALLEL)) {
// thread count comes from the second command-line argument
noOfThreads = atoi(argv[2]);
GET_TIME(t1);
// NOTE(review): the return value is discarded here as well
parallelVersion();
GET_TIME(t2);
comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
printf("N=%d: Threads=%d: Time(ms)=%.5f \n", N, noOfThreads,
comp_time);
}
//the cuda invoke goes here...
//comparing the answers received by each method of calculation
else if (!strcmp(argv[1], TEST_AND_COMPARE)) {
// answer3 is not used in the code shown; presumably it belongs to the omitted CUDA run
precision answer1, answer2, answer3;
GET_TIME(t1);
// here the result is stored and printed below
answer1 = sequentialVersion();
GET_TIME(t2);
comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
printf("%-10s\tN=%d: Ans=%f: Time(ms)=%.5f \n", "Serial", N, answer1, comp_time);
noOfThreads = atoi(argv[2]);
GET_TIME(t3);
answer2 = parallelVersion();
GET_TIME(t4);
comp_time = elapsed_time_msec(&t3, &t4, &sec, &nsec);
printf("%-10s\tN=%d: Ans=%f: Time(ms)=%.5f Threads=%d \n", "Parallel", N, answer2, comp_time, noOfThreads);
}
}
/*
 * Compute the interval from *begin to *end.
 *
 * The whole-second part of the difference is stored in *sec and the
 * remaining nanoseconds in *nsec. The return value is the same interval
 * expressed in milliseconds. When the nanosecond field of *end is smaller
 * than that of *begin, one second is borrowed so the nanosecond part does
 * not go negative.
 */
float elapsed_time_msec(struct timespec *begin, struct timespec *end,
        unsigned long *sec, unsigned long *nsec) {
    /* 1 when a second has to be borrowed, 0 otherwise. */
    const int borrow = end->tv_nsec < begin->tv_nsec;

    *nsec = borrow ? 1000000000 - (begin->tv_nsec - end->tv_nsec)
                   : end->tv_nsec - begin->tv_nsec;
    *sec = end->tv_sec - begin->tv_sec - borrow;

    /* seconds -> ms, nanoseconds -> ms */
    return (float) (*sec) * 1000 + ((float) (*nsec)) / 1000000;
}
上述程序的 Makefile 如下。
#specifying single or double precision
# When the make variable `double` is non-empty, -D USE_DOUBLES is passed to
# the compiler; otherwise no precision flag is passed.
ifeq ($(double),)
precision=
else
precision=-D USE_DOUBLES
endif
#specifying the problem size
# PROBLEM_SIZE is 1000000 unless the make variable `N` is non-empty, in which
# case its value is used instead.
ifeq ($(N),)
problem-size=-D PROBLEM_SIZE=1000000
else
problem-size=-D PROBLEM_SIZE=${N}
endif
# Build target: compiles dot-product.cu with nvcc into the executable `prog`,
# forwarding -fopenmp to the host compiler and optimizing at -O3.
# NOTE(review): in an actual Makefile the recipe line below must begin with a
# TAB; the leading whitespace appears to have been lost in this listing.
dot:
nvcc dot-product.cu -arch compute_11 -Xcompiler -fopenmp -O3 $(problem-size) $(precision) -o prog
代码使用 `make dot` 编译(N 取默认值),以 `./prog -s` 运行时,输出显示为
`N=1000000: Time(ms)=0.00010`
但是使用相同的 N,当程序以 `./prog -t 6` 运行时,串行版本的耗时却符合预期,如下所示:
Serial N=1000000: Ans=2249052.500000: Time(ms)=2.19174
Parallel N=1000000: Ans=2248955.500000: Time(ms)=0.53915 Threads=6
Cuda N=1000000: Ans=2248959.750000: Time(ms)=0.09935
为什么会这样?
答案 0 :(得分:2)
虽然提供完整的代码会更好,但我认为,同一个函数在 `SEQUENTIAL` 测试(`-s`)与 `TEST_AND_COMPARE` 测试(`-t`)中耗时不同,原因在于两种情况下调用 `sequentialVersion()` 函数的方式不同,以及您指定了激进的编译器优化(`-O3`)。
这是一个有效的测试用例,它表现出大致相同的行为差异:
$ cat t1017.cu
#include <time.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
/* Problem size: sequentialVersion() sums the values 0 .. N-1. */
#define N 1000000
/*
 * Command-line mode flags; main() compares argv[1] against them.
 *   SEQUENTIAL        time sequentialVersion() only
 *   PARALLEL          time parallelVersion() only (argv[2] is read as the thread count)
 *   CUDA, VERIFY      defined, but not handled by main() in this test case
 *   TEST_AND_COMPARE  run both versions back to back and print each answer and time
 */
#define SEQUENTIAL "-s"
#define PARALLEL "-p"
#define CUDA "-c"
#define VERIFY "-v"
#define TEST_AND_COMPARE "-t"
/*
 * GET_TIME(x): store the current CLOCK_MONOTONIC time in the struct timespec x.
 * If clock_gettime() fails, the error is reported with perror() and the
 * program exits with EXIT_FAILURE.
 *
 * NOTE(review): the ';' directly after the parameter list belongs to the
 * replacement text, so each expansion begins with an empty statement followed
 * by a bare `if`. That is harmless for the plain `GET_TIME(t);` statements in
 * this file, but it would not behave as one statement inside an unbraced
 * if/else.
 */
#define GET_TIME(x); if (clock_gettime(CLOCK_MONOTONIC, &(x)) < 0) { perror("clock_gettime( ):");exit(EXIT_FAILURE);}
/* Result type of sequentialVersion() and parallelVersion(); single precision here. */
typedef float precision;
precision sequentialVersion() {precision retval = 0.0f; for (int i=0; i<N; i++) retval += (precision)i; return retval; }
/*
 * Stand-in for the parallel version in this test case: sleeps for one second
 * and returns 0.
 */
precision parallelVersion() {
    sleep(1);
    return 0.0f;
}
/*
 * Compute the interval from *begin to *end.
 *
 * Stores the whole seconds of the difference in *sec and the remaining
 * nanoseconds in *nsec, and returns the interval in milliseconds.
 */
float elapsed_time_msec(struct timespec *begin, struct timespec *end,
        unsigned long *sec, unsigned long *nsec) {
    if (end->tv_nsec >= begin->tv_nsec) {
        /* No borrow needed: subtract field by field. */
        *nsec = end->tv_nsec - begin->tv_nsec;
        *sec = end->tv_sec - begin->tv_sec;
    } else {
        /* Borrow one second so the nanosecond part does not go negative. */
        *nsec = 1000000000 - (begin->tv_nsec - end->tv_nsec);
        *sec = end->tv_sec - begin->tv_sec - 1;
    }

    /* seconds -> ms, nanoseconds -> ms */
    return (float) (*sec) * 1000 + ((float) (*nsec)) / 1000000;
}
/* Print the accepted command lines to stderr and yield the failure exit status. */
static int usage_error(const char *prog) {
    fprintf(stderr, "usage: %s -s | -p <threads> | -t <threads>\n",
            prog ? prog : "prog");
    return EXIT_FAILURE;
}

/*
 * Entry point of the test case.
 *
 *   -s            time sequentialVersion() and print the elapsed time
 *   -p <threads>  time parallelVersion() and print the elapsed time
 *   -t <threads>  time both versions and print each answer and elapsed time
 *
 * Returns EXIT_FAILURE after printing a usage line when the mode flag or the
 * thread count required by -p / -t is missing. Any other flag (including -c
 * and -v, which this test case does not implement) is accepted silently and
 * the program returns EXIT_SUCCESS without running anything.
 */
int main(int argc, char **argv) {
    struct timespec t1, t2, t3, t4;
    unsigned long sec, nsec;
    float comp_time;
    int noOfThreads;

    /* argv[1] selects the mode; without this check strcmp() would receive NULL. */
    if (argc < 2) {
        return usage_error(argc > 0 ? argv[0] : NULL);
    }

    //invoking the sequential version
    if (!strcmp(argv[1], SEQUENTIAL)) {
        GET_TIME(t1);
        /*
         * The result is intentionally left unused: this is the call whose
         * measured time collapses to nearly zero when the file is built with
         * -O3, which is the behaviour this test case reproduces.
         */
        sequentialVersion();
        GET_TIME(t2);
        comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
        printf("N=%d: Time(ms)=%.5f \n", N, comp_time);
    }
    //invoking the parallel version
    else if (!strcmp(argv[1], PARALLEL)) {
        /* The thread count is mandatory here; atoi(NULL) is undefined. */
        if (argc < 3) {
            return usage_error(argv[0]);
        }
        noOfThreads = atoi(argv[2]);
        GET_TIME(t1);
        parallelVersion();
        GET_TIME(t2);
        comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
        printf("N=%d: Threads=%d: Time(ms)=%.5f \n", N, noOfThreads,
               comp_time);
    }
    //the cuda invoke goes here...
    //comparing the answers received by each method of calculation
    else if (!strcmp(argv[1], TEST_AND_COMPARE)) {
        precision answer1, answer2;

        if (argc < 3) {
            return usage_error(argv[0]);
        }

        GET_TIME(t1);
        /* Here the result is stored and printed, so the call cannot be dropped. */
        answer1 = sequentialVersion();
        GET_TIME(t2);
        comp_time = elapsed_time_msec(&t1, &t2, &sec, &nsec);
        printf("%-10s\tN=%d: Ans=%f: Time(ms)=%.5f \n", "Serial", N, answer1, comp_time);

        noOfThreads = atoi(argv[2]);
        GET_TIME(t3);
        answer2 = parallelVersion();
        GET_TIME(t4);
        comp_time = elapsed_time_msec(&t3, &t4, &sec, &nsec);
        printf("%-10s\tN=%d: Ans=%f: Time(ms)=%.5f Threads=%d \n", "Parallel", N, answer2, comp_time, noOfThreads);
    }

    return EXIT_SUCCESS;
}
$ nvcc -o t1017 t1017.cu
$ ./t1017 -s
N=1000000: Time(ms)=3.61435
$ nvcc -O3 -o t1017 t1017.cu
$ ./t1017 -s
N=1000000: Time(ms)=0.00068
$ ./t1017 -t 6
Serial N=1000000: Ans=499940360192.000000: Time(ms)=1.40843
Parallel N=1000000: Ans=0.000000: Time(ms)=1000.16150 Threads=6
$
请注意,在未指定优化选项的情况下编译代码时,`-s` 测试的耗时为几毫秒。而使用 `-O3` 编译时,`-s` 测试的耗时几乎为零,但 `-t` 测试中的耗时仍然是几毫秒。
要解决此问题,无论在何处调用 `sequentialVersion()` 函数,都应接收其返回值(不要忽略它)。不要这样写:
sequentialVersion();
这样做:
precision temp = sequentialVersion();
您可能还需要在之后打印或以其他方式“使用” `temp` 的值。这样,编译器就无法把顺序版本的计算优化掉。
正如已经指出的那样,这个问题与CUDA无关。您可以使用我显示的代码,将其放在.cpp文件而不是.cu文件中,并使用g ++而不是nvcc进行编译,并且可以看到相同的特征。由于代码中没有设备代码,因此无论如何nvcc都会将其交给主机编译器。
宏免责声明:
虽然您展示的这个计时宏在使用上可能存在风格问题和/或隐患,但我认为:
1. 由于我不认为这个宏影响了当前的问题,因此我选择将其保持原样,以表明(在此测试用例中)无需修改计时宏即可解决问题。
2. 当我需要在主机端计时时,通常使用普通函数,例如我在此处(here)演示过的那样。
3. 如果您对该宏本身的相关问题有疑问,可以将其作为单独的问题提出;我认为那并不需要使用 cuda 标记。