Question

我正在尝试在 Tesla K20 服务器上运行一个 CUDA 向量加法程序，但遇到了很多错误。代码如下：

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "timer.h"
/*
 * CUDA kernel: element-wise sum of two vectors,
 *   c[k] = a[k] + b[k]
 *
 * Every thread handles exactly one element, selected by its flat 1-D
 * global thread index. The kernel performs no bounds check, so a launch
 * must not supply more threads (gridDim.x * blockDim.x) than the arrays
 * have elements.
 */
__global__ void vecAdd(float* a, float* b, float* c) {
  /* Flat 1-D global index of this thread */
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;

  /* Read both operands, then store their sum */
  const float sum = a[idx] + b[idx];
  c[idx] = sum;
}

 /*
  * Prototype only; the definition is not part of this excerpt.
  * main() calls it as compute_vec_add(N, h_a, h_b, h_temp) and times the
  * call as the CPU run. Presumably it stores a[i] + b[i] into c[i] for
  * i in [0, N) on the host (TODO confirm against the definition).
  */
 void compute_vec_add(int N, float *a, float* b, float *c);

/*
 * Host code to drive the CUDA kernel.
 *
 * Builds two pseudo-random input vectors on the host, copies them to the
 * device, runs vecAdd, copies the sum back, and compares it against the
 * result of compute_vec_add. The duration of each phase is measured with
 * the stopwatch API and printed to stderr.
 *
 * Returns EXIT_SUCCESS when no element differs by more than the
 * tolerance, EXIT_FAILURE on a mismatch or a host allocation failure.
 */
int main() {
  float *d_a = NULL, *d_b = NULL, *d_c = NULL;
  float *h_a, *h_b, *h_c, *h_temp;
  const int N = 1024 * 1024 * 512;
  const int threadsPerBlock = 256;
  /* sizeof() yields size_t, so this product is not evaluated in int */
  const size_t bytes = sizeof(float) * N;

  struct stopwatch_t* timer = NULL;
  long double t_pcie_htd, t_pcie_dth, t_kernel, t_cpu;

  /* Setup timers */
  stopwatch_init();
  timer = stopwatch_create();

  /*
   * Create the host vectors: two inputs, the GPU result, and the CPU
   * reference result. malloc can fail for buffers of this size, so the
   * pointers are checked before they are first used.
   */
  h_a = (float *) malloc(bytes);
  h_b = (float *) malloc(bytes);
  h_c = (float *) malloc(bytes);
  h_temp = (float *) malloc(bytes);
  if (h_a == NULL || h_b == NULL || h_c == NULL || h_temp == NULL) {
    fprintf(stderr, "Failed to allocate host memory\n");
    /* free(NULL) is a no-op, so every pointer can be released blindly */
    free(h_a);
    free(h_b);
    free(h_c);
    free(h_temp);
    stopwatch_destroy(timer);
    return EXIT_FAILURE;
  }

  /* Set the initial values of h_a, h_b, and h_c */
  for (int i = 0; i < N; i++) {
    h_a[i] = (float) (rand() % 100) / 10.0;
    h_b[i] = (float) (rand() % 100) / 10.0;
    h_c[i] = 0.0;
  }

  /* Allocate space on the GPU */
  CUDA_CHECK_ERROR(cudaMalloc(&d_a, bytes));
  CUDA_CHECK_ERROR(cudaMalloc(&d_b, bytes));
  CUDA_CHECK_ERROR(cudaMalloc(&d_c, bytes));

  /* Copy h_a and h_b from CPU to GPU */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK_ERROR(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));
  t_pcie_htd = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from host to device: %Lg secs\n",t_pcie_htd);

  /*
   * Run N / threadsPerBlock blocks of threadsPerBlock threads each.
   * vecAdd has no bounds check, so the launch must supply exactly N
   * threads; N is a multiple of threadsPerBlock, so the division is exact.
   */
  dim3 GS(N / threadsPerBlock, 1, 1);
  dim3 BS(threadsPerBlock, 1, 1);

  stopwatch_start(timer);
  vecAdd<<<GS, BS>>>(d_a, d_b, d_c);
  /* A kernel launch returns no status itself; query it explicitly */
  CUDA_CHECK_ERROR(cudaGetLastError());
  /* Wait for the kernel to finish so the timer covers its execution */
  CUDA_CHECK_ERROR(cudaDeviceSynchronize());
  t_kernel = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute GPU kernel: %Lg secs\n", t_kernel);

  /* Copy d_c from GPU to CPU */
  stopwatch_start(timer);
  CUDA_CHECK_ERROR(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));
  t_pcie_dth = stopwatch_stop(timer);
  fprintf(stderr, "Time to transfer data from device to host: %Lg secs\n",t_pcie_dth);

  /* Double check errors against the CPU result */
  stopwatch_start(timer);
  compute_vec_add(N, h_a, h_b, h_temp);
  t_cpu = stopwatch_stop(timer);
  fprintf(stderr, "Time to execute CPU program: %Lg secs\n", t_cpu);

  int cnt = 0;
  for (int i = 0; i < N; i++) {
    /*
     * fabsf keeps the comparison in floating point. Plain abs() may
     * resolve to the integer overload, which truncates any difference
     * below 1 to 0 and therefore never reports a mismatch.
     */
    if (fabsf(h_temp[i] - h_c[i]) > 1e-5)
      cnt++;
  }
  fprintf(stderr, "number of errors: %d out of %d\n", cnt, N);

  /* Free the device memory */
  CUDA_CHECK_ERROR(cudaFree(d_a));
  CUDA_CHECK_ERROR(cudaFree(d_b));
  CUDA_CHECK_ERROR(cudaFree(d_c));

  /* Free the host memory, including the CPU reference buffer */
  free(h_a);
  free(h_b);
  free(h_c);
  free(h_temp);

  /* Free timer */
  stopwatch_destroy(timer);

  if (cnt == 0) {
    printf("\n\nSuccess\n");
    return EXIT_SUCCESS;
  }
  return EXIT_FAILURE;
}

编译这段代码时，我得到了一长串错误。timer.c 和 cuda_utils.h 与源文件位于同一目录中。我使用的编译命令是：

nvcc vecAdd.cu timer.c -o vecAdd

然后我得到的错误是：

/tmp/tmpxft_000014db_00000000-17_vecAdd.o: In function `main':
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x7e): undefined reference to `stopwatch_init()'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x83): undefined reference to `stopwatch_create()'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x278): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x2ff): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x380): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x3dc): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x416): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x45e): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x4b0): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x4de): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x61f): undefined reference to `stopwatch_destroy(stopwatch_t*)'
collect2: error: ld returned 1 exit status

有人能解释一下为什么会出现这些错误吗？另外，我是 CUDA 编程的初学者，我猜这个问题与链接有关。

Answer 1

nvcc 会把 .cu 文件当作 C++ 代码来编译，这会导致符号名称不匹配。解决方案是在 vecAdd.cu 中用 extern "C" { } 把 #include "timer.h" 括起来。

问题在于：如果 .cu 文件使用了来自另一个 .c 文件（比如 function.c）的 C 函数，这些函数会被当作 C++ 函数处理，编译器会为它们生成特殊的（经过名称修饰的）符号名称。之后在编译 function.c 时，这些函数又按普通的符号名编译。到了链接阶段，由于 .cu 文件中引用的符号名称与编译得到的 function.o 中的符号名称不匹配，就会出现未解析的引用。因此，需要在包含外部 C 函数声明的头文件外面使用 extern "C" { } 语法。

（引自此处）

验证

使用 nvcc -c vecAdd.cu 编译问题中提供的 vecAdd.cu，再用 nm vecAdd.o 列出符号，可以看到以下几行：

...
00000000000007cf t _Z10cudaLaunchIcE9cudaErrorPT_
00000000000007aa t _Z10cudaMallocIfE9cudaErrorPPT_m
                 U _Z14stopwatch_initv
                 U _Z14stopwatch_stopP11stopwatch_t
0000000000000016 T _Z15compute_vec_addiPfS_S_
                 U _Z15stopwatch_startP11stopwatch_t
                 U _Z16stopwatch_createv
                 U _Z17stopwatch_destroyP11stopwatch_t
0000000000000672 T _Z29__device_stub__Z6vecAddPfS_S_PfS_S_
0000000000000703 T _Z6vecAddPfS_S_
...

可以看到，stopwatch_init 变成了 _Z14stopwatch_initv，其余函数依此类推。

由于问题中没有给出 timer.c 和 timer.h 的内容，我为它们编写了一份最小实现。

// timer.h
/* Minimal stopwatch state used only to reproduce the link error. */
struct stopwatch_t {
  double t;
};

void stopwatch_init();
struct stopwatch_t *stopwatch_create();
void stopwatch_start(struct stopwatch_t *timer);
long double stopwatch_stop(struct stopwatch_t *timer);
void stopwatch_destroy(struct stopwatch_t *timer);

// timer.c
/* Stub: does nothing. */
void stopwatch_init()
{
}

/* Stub: always returns a null pointer. */
struct stopwatch_t *stopwatch_create()
{
  return 0;
}

/* Stub: ignores the timer. */
void stopwatch_start(struct stopwatch_t *timer)
{
}

/* Stub: ignores the timer and always reports 0. */
long double stopwatch_stop(struct stopwatch_t *timer)
{
  return 0;
}

/* Stub: ignores the timer. */
void stopwatch_destroy(struct stopwatch_t *timer)
{
}

使用上面的代码，nvcc -c timer.c和nm timer.o会产生：

0000000000000007 T stopwatch_create
0000000000000029 T stopwatch_destroy
0000000000000000 T stopwatch_init
0000000000000012 T stopwatch_start
000000000000001d T stopwatch_stop

可以看到，timer.o 中的函数使用的是未经修饰的 C 符号名，与 vecAdd.o 中引用的 C++ 修饰名不匹配。

将 vecAdd.cu 修改为如下内容之后：
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
extern "C" {
#include "timer.h"
}
...

nvcc -c vecAdd.cu和nm vecAdd.o会产生：

...
                 U __stack_chk_fail
                 U stderr
                 U stopwatch_create
                 U stopwatch_destroy
                 U stopwatch_init
                 U stopwatch_start
                 U stopwatch_stop
00000000000007cf t _Z10cudaLaunchIcE9cudaErrorPT_
00000000000007aa t _Z10cudaMallocIfE9cudaErrorPPT_m
...

可以看到，C 函数的符号名称不再被修饰。这样一来，问题中的编译命令 nvcc vecAdd.cu timer.c -o vecAdd 就可以正常工作了。

修改

正如提问者在评论中所说，先执行 g++ -c timer.c，再执行 nvcc vecAdd.cu timer.o -o vecAdd 也可行，因为 g++ 默认会把 .c 文件当作 C++ 代码来编译。

g++ -c timer.c和nm timer.o打印：

0000000000000000 T _Z14stopwatch_initv
000000000000001d T _Z14stopwatch_stopP11stopwatch_t
0000000000000012 T _Z15stopwatch_startP11stopwatch_t
0000000000000007 T _Z16stopwatch_createv
0000000000000029 T _Z17stopwatch_destroyP11stopwatch_t

在群集上的Vector Addition Cuda程序给出了许多错误

1 个答案: