在群集上的Vector Addition Cuda程序给出了许多错误

时间:2017-06-20 10:10:58

标签: cuda nvidia

我正在尝试在Tesla K20服务器上运行CUDA向量加法程序,但遇到了很多错误。代码如下。

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "timer.h"
/*
 * CUDA kernel: element-wise vector sum, c[k] = a[k] + b[k].
 *
 * Each thread handles the single element whose index equals its global
 * thread index along x. There is no bounds check, so the launch must not
 * create more threads than the arrays have elements.
 */
__global__ void vecAdd(float* a, float* b, float* c) {
  /* Global index of this thread across the whole grid */
  const int k = threadIdx.x + blockDim.x * blockIdx.x;

  /* Read both operands, then store their sum */
  const float lhs = a[k];
  const float rhs = b[k];
  c[k] = lhs + rhs;
}

 /* Forward declaration of the host-side vector add; main() calls it as
  * compute_vec_add(N, h_a, h_b, h_temp) to build the result the GPU output
  * is compared against. Its definition is not part of this excerpt. */
 void compute_vec_add(int N, float *a, float* b, float *c);

/*
 * Host code to drive the CUDA kernel.
 *
 * Fills two host vectors with pseudo-random values, copies them to the GPU,
 * runs vecAdd, copies the result back, recomputes the sum on the host with
 * compute_vec_add(), and reports how many elements differ. Each phase is
 * timed with the stopwatch_* helpers and the timings are written to stderr.
 *
 * Returns EXIT_FAILURE if a host allocation fails and 0 otherwise. What
 * happens when a CUDA call fails is decided by CUDA_CHECK_ERROR, which is
 * defined in cuda_utils.h (not shown here).
 */
int main() {
  float *d_a, *d_b, *d_c;
  float *h_a, *h_b, *h_c, *h_temp;
  int i;
  int cnt = 0;
  int N = 1024 * 1024 * 512;

  /* Size of one vector in bytes (2 GiB), computed once in size_t. */
  const size_t bytes = sizeof(float) * (size_t) N;

  struct stopwatch_t* timer = NULL;
  long double t_pcie_htd, t_pcie_dth, t_kernel, t_cpu;

  /* Setup timers */
  stopwatch_init();
  timer = stopwatch_create();

  /*
   * Create the host vectors. Four 2 GiB buffers are requested, so a failed
   * allocation is a realistic outcome and must not be dereferenced.
   */
  h_a = (float *) malloc(bytes);
  h_b = (float *) malloc(bytes);
  h_c = (float *) malloc(bytes);
  h_temp = (float *) malloc(bytes);
  if (h_a == NULL || h_b == NULL || h_c == NULL || h_temp == NULL) {
    fprintf(stderr, "Failed to allocate host memory (%zu bytes per vector)\n",
            bytes);
    /* free(NULL) is a no-op, so every pointer can be released unconditionally */
    free(h_a);
    free(h_b);
    free(h_c);
    free(h_temp);
    stopwatch_destroy(timer);
    return EXIT_FAILURE;
  }

  /* Set the initial values of h_a, h_b, and h_c */
  for (i = 0; i < N; i++) {
    h_a[i] = (float) (rand() % 100) / 10.0;
    h_b[i] = (float) (rand() % 100) / 10.0;
    h_c[i] = 0.0;
  }

  /*
   * Allocate space on the GPU.
   * NOTE(review): the three device buffers total 6 GiB; confirm that the
   * target GPU has that much free memory, otherwise cudaMalloc fails here.
   */
  CUDA_CHECK_ERROR(cudaMalloc(&d_a, bytes));
  CUDA_CHECK_ERROR(cudaMalloc(&d_b, bytes));
  CUDA_CHECK_ERROR(cudaMalloc(&d_c, bytes));

  /* Copy h_a and h_b from CPU to GPU */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK_ERROR(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));
  t_pcie_htd = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from host to device: %Lg secs\n",t_pcie_htd);

  /*
   * Run N/256 blocks of 256 threads each. N is a multiple of 256, so the
   * grid creates exactly N threads; vecAdd has no bounds check and relies
   * on that.
   */
  dim3 GS(N / 256, 1, 1);
  dim3 BS(256, 1, 1);

  stopwatch_start(timer);
  vecAdd<<<GS, BS>>>(d_a, d_b, d_c);
  /* A kernel launch returns nothing; launch errors are read back here. */
  CUDA_CHECK_ERROR(cudaGetLastError());
  /* Replaces the deprecated cudaThreadSynchronize(); waiting here keeps the
   * kernel's execution inside the timed region and its status is checked. */
  CUDA_CHECK_ERROR(cudaDeviceSynchronize());
  t_kernel = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute GPU kernel: %Lg secs\n", t_kernel);

  /* Copy d_c from GPU to CPU */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));
  t_pcie_dth = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from device to host: %Lg secs\n",t_pcie_dth);

  /* Double check errors: recompute the sum on the host and compare */
  stopwatch_start(timer);
  compute_vec_add(N, h_a, h_b, h_temp);
  t_cpu = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute CPU program: %Lg secs\n", t_cpu);

  for (i = 0; i < N; i++) {
    /* fabsf keeps the difference in floating point; a plain abs() may pick
     * the integer overload and truncate every difference below 1 to 0. */
    if (fabsf(h_temp[i] - h_c[i]) > 1e-5)
      cnt++;
  }
  fprintf(stderr, "number of errors: %d out of %d\n", cnt, N);

  /* Free the device memory */
  CUDA_CHECK_ERROR(cudaFree(d_a));
  CUDA_CHECK_ERROR(cudaFree(d_b));
  CUDA_CHECK_ERROR(cudaFree(d_c));

  /* Free the host memory, including the CPU reference buffer */
  free(h_a);
  free(h_b);
  free(h_c);
  free(h_temp);

  /* Free timer */
  stopwatch_destroy(timer);

  if (cnt == 0) {
    printf("\n\nSuccess\n");
  }
  return 0;
}

现在,编译这段代码时我得到了一长串错误。timer.c和cuda_utils.h与源文件位于同一目录中。编译命令为:

nvcc vecAdd.cu timer.c -o vecAdd

然后我得到的错误是:

/tmp/tmpxft_000014db_00000000-17_vecAdd.o: In function `main':
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x7e): undefined reference to `stopwatch_init()'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x83): undefined reference to `stopwatch_create()'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x278): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x2ff): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x380): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x3dc): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x416): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x45e): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x4b0): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x4de): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x61f): undefined reference to `stopwatch_destroy(stopwatch_t*)'
collect2: error: ld returned 1 exit status

有人可以说明为什么会出现这些错误吗?另外,我是CUDA编程的初学者。我的猜测是它与链接有关。

1 个答案:

答案 0 :(得分:1)

nvcc 会把 .cu 代码当作 C++ 代码来编译,这会导致符号名称不匹配。解决方案是在 vecAdd.cu 中用 extern "C" {} 把 #include "timer.h" 括起来。

  

问题在于:如果 .cu 文件调用了另一个 .c 文件(比如 function.c)中的 C 函数,这些函数在 .cu 文件中会被当作 C++ 函数处理,编译器会为它们生成经过修饰的特殊符号名称。之后编译 function.c 时,这些函数则以普通符号名称编译。在链接阶段,由于 .cu 文件中引用的符号名称与编译得到的 function.o 中的符号名称不匹配,就会出现未解析的引用。因此,在包含声明外部 C 函数的头文件时,需要使用 extern "C" {} 语法。

(引自here)

验证

使用 nvcc -c vecAdd.cu 编译问题中提供的 vecAdd.cu,并使用 nm vecAdd.o 列出符号,可以看到以下几行:

...
00000000000007cf t _Z10cudaLaunchIcE9cudaErrorPT_
00000000000007aa t _Z10cudaMallocIfE9cudaErrorPPT_m
                 U _Z14stopwatch_initv
                 U _Z14stopwatch_stopP11stopwatch_t
0000000000000016 T _Z15compute_vec_addiPfS_S_
                 U _Z15stopwatch_startP11stopwatch_t
                 U _Z16stopwatch_createv
                 U _Z17stopwatch_destroyP11stopwatch_t
0000000000000672 T _Z29__device_stub__Z6vecAddPfS_S_PfS_S_
0000000000000703 T _Z6vecAddPfS_S_
...

您可以看到 stopwatch_init 变为 _Z14stopwatch_initv,依此类推。

由于问题中没有给出 timer.c 和 timer.h 的内容,我为它们编写了一个最小版本。

// timer.h
// Minimal stand-in for the timer header, whose real contents the question
// does not show. It declares only the names main() uses, with no extern "C"
// guard, so a C++ compiler that includes it sees C++ declarations.

// Stopwatch state; this stand-in carries a single double.
struct stopwatch_t { double t; };
// Called once by main() before any stopwatch is created.
void stopwatch_init();
// Returns the stopwatch handle that the other functions take.
struct stopwatch_t *stopwatch_create();
// Marks the start of a timed region.
void stopwatch_start(struct stopwatch_t *timer);
// Ends the timed region; main() prints the returned value as seconds.
long double stopwatch_stop(struct stopwatch_t *timer);
// Releases a handle obtained from stopwatch_create().
void stopwatch_destroy(struct stopwatch_t *timer);

// timer.c
// Stub definitions matching the declarations in timer.h. Every function has
// an empty body or returns 0; they exist only so the file compiles and its
// symbol names can be listed with nm. As shown, the file does not include
// timer.h.

// No-op.
void stopwatch_init() { }
// Always returns a null pointer.
struct stopwatch_t *stopwatch_create() { return 0; }
// No-op; the argument is ignored.
void stopwatch_start(struct stopwatch_t *timer) { }
// Always returns 0; the argument is ignored.
long double stopwatch_stop(struct stopwatch_t *timer) { return 0; }
// No-op; the argument is ignored.
void stopwatch_destroy(struct stopwatch_t *timer) { }

使用上面的代码,依次执行 nvcc -c timer.c 和 nm timer.o 会产生:

0000000000000007 T stopwatch_create
0000000000000029 T stopwatch_destroy
0000000000000000 T stopwatch_init
0000000000000012 T stopwatch_start
000000000000001d T stopwatch_stop

您可以看到,timer.c 中函数的符号名称没有经过修饰,与 vecAdd.o 中引用的 C++ 修饰名称不匹配。

将 vecAdd.cu 修改为如下内容之后:

#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
// timer.h declares functions that are compiled as C in timer.c. Including it
// inside extern "C" gives those declarations C linkage, so this file refers
// to the unmangled names (stopwatch_init, ...) that timer.o actually defines.
extern "C" {
    #include "timer.h"
}
...

依次执行 nvcc -c vecAdd.cu 和 nm vecAdd.o 会产生:

...
                 U __stack_chk_fail
                 U stderr
                 U stopwatch_create
                 U stopwatch_destroy
                 U stopwatch_init
                 U stopwatch_start
                 U stopwatch_stop
00000000000007cf t _Z10cudaLaunchIcE9cudaErrorPT_
00000000000007aa t _Z10cudaMallocIfE9cudaErrorPPT_m
...

您可以看到 C 函数的符号名称没有被修饰。在这种情况下,问题中的编译命令 nvcc vecAdd.cu timer.c -o vecAdd 就可以正常工作。

修改

正如 OP 在评论中所述,先执行 g++ -c timer.c,再执行 nvcc vecAdd.cu timer.o -o vecAdd 也可以正常工作,因为 g++ 默认会将 .c 文件当作 C++ 代码编译。

依次执行 g++ -c timer.c 和 nm timer.o 会打印:

0000000000000000 T _Z14stopwatch_initv
000000000000001d T _Z14stopwatch_stopP11stopwatch_t
0000000000000012 T _Z15stopwatch_startP11stopwatch_t
0000000000000007 T _Z16stopwatch_createv
0000000000000029 T _Z17stopwatch_destroyP11stopwatch_t