我正在尝试在 Tesla K20 服务器上运行一个 CUDA 向量加法程序,但遇到了很多错误。代码如下。
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
#include "timer.h"
/*
 * CUDA kernel: element-wise vector sum, c[i] = a[i] + b[i].
 *
 * Every thread handles exactly one element, chosen by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x). The kernel performs no
 * bounds check, so the launch must provide exactly one thread per element,
 * i.e. gridDim.x * blockDim.x must equal the vector length.
 *
 * a, b : input vectors (only read here)
 * c    : output vector (one element written per thread)
 */
__global__ void vecAdd(float* a, float* b, float* c) {
    /* Index of the first element owned by this block */
    const int blockStart = blockIdx.x * blockDim.x;
    /* Element owned by this thread */
    const int idx = blockStart + threadIdx.x;

    c[idx] = a[idx] + b[idx];
}
/*
 * Forward declaration only; the definition is not in this part of the file.
 * main() calls it with the host inputs and uses the result written through
 * `c` as the reference the GPU output is compared against.
 * NOTE(review): presumably computes c[i] = a[i] + b[i] for 0 <= i < N on the
 * CPU - confirm against the definition.
 */
void compute_vec_add(int N, float *a, float* b, float *c);
/*
*
* Host code to drive the CUDA Kernel
*
*/
int main() {
    /* Threads per block used for the kernel launch */
    const int threads_per_block = 256;
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    float *h_a, *h_b, *h_c, *h_temp;
    int i;
    int N = 1024 * 1024 * 512;
    /*
     * Size of one vector in bytes, computed once in size_t.
     * NOTE(review): with N = 2^29 each vector is 2 GiB, so this program needs
     * 8 GiB of host memory and 6 GiB of device memory - confirm the target
     * machine and GPU actually have that much.
     */
    const size_t bytes = sizeof(float) * (size_t) N;
    struct stopwatch_t* timer = NULL;
    long double t_pcie_htd, t_pcie_dth, t_kernel, t_cpu;
    int cnt = 0;

    /*
     * vecAdd has no bounds guard, so the grid must cover exactly N threads.
     * Refuse to run rather than silently skip the tail (or write out of
     * bounds) if N is ever changed to a non-multiple of the block size.
     */
    if (N % threads_per_block != 0) {
        fprintf(stderr, "N (%d) must be a multiple of %d\n", N, threads_per_block);
        return EXIT_FAILURE;
    }

    /* Setup timers */
    stopwatch_init();
    timer = stopwatch_create();

    /*
      Create the vectors (including the buffer for the CPU reference result)
      and make sure every allocation succeeded before touching them.
    */
    h_a = (float *) malloc(bytes);
    h_b = (float *) malloc(bytes);
    h_c = (float *) malloc(bytes);
    h_temp = (float *) malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL || h_temp == NULL) {
        fprintf(stderr, "Failed to allocate host memory (%lu bytes per vector)\n",
                (unsigned long) bytes);
        /* free(NULL) is a no-op, so releasing all four is safe */
        free(h_a);
        free(h_b);
        free(h_c);
        free(h_temp);
        stopwatch_destroy(timer);
        return EXIT_FAILURE;
    }

    /*
      Set the initial values of h_a, h_b, and h_c
    */
    for (i = 0; i < N; i++) {
        h_a[i] = (float) (rand() % 100) / 10.0;
        h_b[i] = (float) (rand() % 100) / 10.0;
        h_c[i] = 0.0;
    }

    /*
      Allocate space on the GPU
    */
    CUDA_CHECK_ERROR(cudaMalloc(&d_a, bytes));
    CUDA_CHECK_ERROR(cudaMalloc(&d_b, bytes));
    CUDA_CHECK_ERROR(cudaMalloc(&d_c, bytes));

    /*
      Copy h_a and h_b from CPU to GPU
    */
    stopwatch_start(timer);
    CUDA_CHECK_ERROR(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK_ERROR(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));
    t_pcie_htd = stopwatch_stop(timer);
    fprintf(stderr, "Time to transfer data from host to device: %Lg secs\n",t_pcie_htd);

    /*
      Run N/256 blocks of 256 threads each
    */
    dim3 GS(N / threads_per_block, 1, 1);
    dim3 BS(threads_per_block, 1, 1);
    stopwatch_start(timer);
    vecAdd<<<GS, BS>>>(d_a, d_b, d_c);
    /* A launch does not return a status; bad configurations show up here */
    CUDA_CHECK_ERROR(cudaGetLastError());
    /*
     * Wait for the kernel so the timer measures execution, not just the
     * launch; this also surfaces faults that occurred inside the kernel.
     * (cudaThreadSynchronize() is the deprecated spelling of this call.)
     */
    CUDA_CHECK_ERROR(cudaDeviceSynchronize());
    t_kernel = stopwatch_stop(timer);
    fprintf(stderr, "Time to execute GPU kernel: %Lg secs\n", t_kernel);

    /*
      Copy d_c from GPU to CPU
    */
    stopwatch_start(timer);
    CUDA_CHECK_ERROR(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));
    t_pcie_dth = stopwatch_stop(timer);
    fprintf(stderr, "Time to transfer data from device to host: %Lg secs\n",t_pcie_dth);

    /*
      Double check errors against the CPU result
    */
    stopwatch_start(timer);
    compute_vec_add(N, h_a, h_b, h_temp);
    t_cpu = stopwatch_stop(timer);
    fprintf(stderr, "Time to execute CPU program: %Lg secs\n", t_cpu);
    for (i = 0; i < N; i++) {
        /*
         * fabsf, not abs: abs() may resolve to the integer overload, which
         * truncates any difference below 1.0 to 0 and hides real mismatches.
         */
        if (fabsf(h_temp[i] - h_c[i]) > 1e-5)
            cnt++;
    }
    fprintf(stderr, "number of errors: %d out of %d\n", cnt, N);

    /*
      Free the device memory
    */
    CUDA_CHECK_ERROR(cudaFree(d_a));
    CUDA_CHECK_ERROR(cudaFree(d_b));
    CUDA_CHECK_ERROR(cudaFree(d_c));

    /*
      Free the host memory (h_temp included; it was previously leaked)
    */
    free(h_a);
    free(h_b);
    free(h_c);
    free(h_temp);

    /*
      Free timer
    */
    stopwatch_destroy(timer);
    if (cnt == 0) {
        printf("\n\nSuccess\n");
    }
    return 0;
}
现在,对于我正在运行的代码,我得到了一个巨大的错误列表。我在同一目录中有timer.c和cuda_utils.h。 对于编译,
nvcc vecAdd.cu timer.c -o vecAdd
然后我得到的错误是:
/tmp/tmpxft_000014db_00000000-17_vecAdd.o: In function `main':
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x7e): undefined reference to `stopwatch_init()'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x83): undefined reference to `stopwatch_create()'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x278): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x2ff): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x380): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x3dc): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x416): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x45e): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x4b0): undefined reference to `stopwatch_start(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x4de): undefined reference to `stopwatch_stop(stopwatch_t*)'
tmpxft_000014db_00000000-4_vecAdd.cudafe1.cpp:(.text+0x61f): undefined reference to `stopwatch_destroy(stopwatch_t*)'
collect2: error: ld returned 1 exit status
有人可以说明为什么会出现这些错误。此外,我是Cuda编程的初学者。 我的猜测是它与链接有关。
答案 0 :(得分:1)
nvcc 会将 .cu 文件中的代码当作 C++ 代码来编译,因此其中引用的函数符号名会经过 C++ 名称修饰(name mangling)。解决方案是在 vecAdd.cu 中用 extern "C" {} 把 #include "timer.h" 括起来。
问题在于:如果 .cu 文件中使用了来自另一个 .c 文件(例如 function.c)的 C 函数,这些函数会被当作 C++ 函数处理,并被赋予**经过修饰的特殊符号名**。之后编译 function.c 时,这些函数却以普通符号名编译。在链接阶段,由于 .cu 文件中的符号名与编译得到的 function.o 中的符号名不匹配,就会出现未解析的引用。因此,在包含声明外部 C 函数的头文件时,需要使用 extern "C" {} 语法。
(引自here)
**验证**
使用 nvcc -c vecAdd.cu 编译问题中提供的 vecAdd.cu,再用 nm vecAdd.o 列出符号,可以看到以下几行:
...
00000000000007cf t _Z10cudaLaunchIcE9cudaErrorPT_
00000000000007aa t _Z10cudaMallocIfE9cudaErrorPPT_m
U _Z14stopwatch_initv
U _Z14stopwatch_stopP11stopwatch_t
0000000000000016 T _Z15compute_vec_addiPfS_S_
U _Z15stopwatch_startP11stopwatch_t
U _Z16stopwatch_createv
U _Z17stopwatch_destroyP11stopwatch_t
0000000000000672 T _Z29__device_stub__Z6vecAddPfS_S_PfS_S_
0000000000000703 T _Z6vecAddPfS_S_
...
您可以看到 stopwatch_init 变成了 _Z14stopwatch_initv,依此类推。
由于问题中没有给出 timer.c 和 timer.h 的内容,我为它们编写了一份最小化的代码。
// timer.h
// Minimal stand-in for the timer header, whose real contents the question
// does not show. It declares the stopwatch type and the five stopwatch_*
// functions that vecAdd.cu calls. There is no extern "C" guard here, so a
// C++ translation unit that includes this header directly sees these as
// C++ declarations.

// Stopwatch state; in this stand-in it is a single double field.
struct stopwatch_t { double t; };
// Called once by main() before any stopwatch is created.
void stopwatch_init();
// Returns a pointer to a stopwatch for use with the functions below.
struct stopwatch_t *stopwatch_create();
// Called by main() at the start of each timed section.
void stopwatch_start(struct stopwatch_t *timer);
// Called by main() at the end of each timed section; main() prints the
// returned long double as a number of seconds.
long double stopwatch_stop(struct stopwatch_t *timer);
// Called by main() once the stopwatch is no longer needed.
void stopwatch_destroy(struct stopwatch_t *timer);
// timer.c
/*
 * Placeholder implementations of the stopwatch API. They exist only so the
 * symbols are defined at link time; none of them measures anything.
 */

/* Stub: no global timer state to set up. */
void stopwatch_init()
{
}

/* Stub: returns a null pointer instead of allocating a stopwatch. */
struct stopwatch_t *stopwatch_create()
{
    return (struct stopwatch_t *) 0;
}

/* Stub: ignores its argument and does nothing. */
void stopwatch_start(struct stopwatch_t *timer)
{
}

/* Stub: ignores its argument and always reports an elapsed time of zero. */
long double stopwatch_stop(struct stopwatch_t *timer)
{
    return 0.0L;
}

/* Stub: ignores its argument; there is nothing to release. */
void stopwatch_destroy(struct stopwatch_t *timer)
{
}
使用上面的代码,nvcc -c timer.c
和nm timer.o
会产生:
0000000000000007 T stopwatch_create
0000000000000029 T stopwatch_destroy
0000000000000000 T stopwatch_init
0000000000000012 T stopwatch_start
000000000000001d T stopwatch_stop
您可以看到 timer.c 中的函数使用的是未经修饰的普通符号名,与 vecAdd.o 中经过 C++ 名称修饰的符号名不匹配。
将vecAdd.cu
变为:
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
extern "C" {
#include "timer.h"
}
...
nvcc -c vecAdd.cu
和nm vecAdd.o
会产生:
...
U __stack_chk_fail
U stderr
U stopwatch_create
U stopwatch_destroy
U stopwatch_init
U stopwatch_start
U stopwatch_stop
00000000000007cf t _Z10cudaLaunchIcE9cudaErrorPT_
00000000000007aa t _Z10cudaMallocIfE9cudaErrorPPT_m
...
您可以看到这些 C 函数的符号名称不再被修饰。这样,问题中的编译命令 nvcc vecAdd.cu timer.c -o vecAdd 就可以正常工作了。
**编辑**
正如OP的评论所述,g++ -c timer.c
和nvcc vecAdd.cu timer.o -o vecAdd
也可以使用,因为g++
默认会将.c
文件视为C ++代码。
g++ -c timer.c
和nm timer.o
打印:
0000000000000000 T _Z14stopwatch_initv
000000000000001d T _Z14stopwatch_stopP11stopwatch_t
0000000000000012 T _Z15stopwatch_startP11stopwatch_t
0000000000000007 T _Z16stopwatch_createv
0000000000000029 T _Z17stopwatch_destroyP11stopwatch_t