我基于本教程创建了一个简单但完整的程序(问题现已解决):
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
// Wraps an expression yielding a cudaError_t and reports a failure together
// with the file and line of the call site. The do { } while (0) wrapper makes
// the macro expand to a single statement, so it stays valid when used as the
// body of an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Checks a CUDA status code.
//   code  - status value to check
//   file  - source file name printed in the diagnostic
//   line  - source line number printed in the diagnostic
//   abort - when true (the default), terminate the process after reporting
// If code is not cudaSuccess, the error string, file and line are written to
// stderr; with abort set, the process then exits using the numeric error code
// as its exit status. Does nothing when code is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
//Kernel definition
// Element-wise vector addition: C[k] = A[k] + B[k] for 0 <= k < N.
// Each thread handles at most one element, selected from the x components of
// its block and thread indices; threads whose index falls at or beyond N do
// nothing, so the launch may contain more threads than there are elements.
//   A, B - input arrays, read at the thread's index
//   C    - output array, written at the thread's index
//   N    - number of elements to process
__global__ void VecAdd(float* A, float* B, float* C,int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // Surplus threads in the last block have no element to process.
    if (idx >= N)
        return;
    C[idx] = A[idx] + B[idx];
}
//Host code
// Reads n floats from the text file at path into dst.
// Prints a diagnostic and exits with EXIT_FAILURE if the file cannot be
// opened or if fewer than n values can be parsed from it.
static void readVector(const char *path, float *dst, int n)
{
    FILE *in = fopen(path, "r");
    if (in == NULL)
    {
        perror(path);
        exit(EXIT_FAILURE);
    }
    for (int k = 0; k < n; k++)
    {
        // fscanf returns the number of converted items; anything other than 1
        // means the file is short or malformed.
        if (fscanf(in, "%f ", &dst[k]) != 1)
        {
            fprintf(stderr, "%s: expected %d values, failed to read value %d\n", path, n, k);
            fclose(in);
            exit(EXIT_FAILURE);
        }
    }
    fclose(in);
}

// Reads two N-element vectors from A.txt and B.txt, adds them with the VecAdd
// kernel, and writes the result to C.txt. Every CUDA runtime call is checked
// with gpuErrchk so that a failure is reported instead of silently producing
// an all-zero output.
int main()
{
    const int N = 1000;
    const size_t size = N * sizeof(float);

    // Allocate input vectors h_A and h_B and output vector h_C in host memory
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize input vectors
    readVector("A.txt", h_A, N);
    readVector("B.txt", h_B, N);

    // Allocate vectors in device memory
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    gpuErrchk(cudaMalloc(&d_A, size));
    gpuErrchk(cudaMalloc(&d_B, size));
    gpuErrchk(cudaMalloc(&d_C, size));

    // Copy inputs to the device
    gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Invoke kernel; round the block count up so all N elements are covered
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns no status itself; launch errors must be fetched explicitly.
    gpuErrchk(cudaGetLastError());

    // Copy the result back to the host
    gpuErrchk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    FILE *out = fopen("C.txt", "w");
    if (out == NULL)
    {
        perror("C.txt");
        exit(EXIT_FAILURE);
    }
    // Echo the first result as a quick sanity check (the previous index here
    // was N, one past the end of h_C).
    printf("%f \n", h_C[0]);
    for (int k = 0; k < N; k++)
        fprintf(out, "%f ", h_C[k]);
    fclose(out);
    printf("Zakonczono obliczenia\n");

    // Free device memory
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_B));
    gpuErrchk(cudaFree(d_C));

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
它应该从文件中读取两个向量,在设备上将它们相加,然后将结果输出到“C.txt”文件中。但是,它打印出了一千个零。
经过一些调试后,我找到了罪魁祸首——cudaMalloc 函数。
(cuda-gdb) n
42 cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
(cuda-gdb) n
43 cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
(cuda-gdb) print d_A[0]
$1 = 0
(cuda-gdb) print h_A[0]
$2 = 3.66192293
我想知道它为什么不起作用,因为这部分代码是从教程中原样复制的。
答案 0 :(得分:0)
看来在安装 CUDA 之后我必须重新启动计算机,仅仅注销再重新登录是不够的。之后我的程序可以正常运行,但我无法再调试它了。执行 'run' 之后会出现以下信息:
[已使用 libthread_db 启用线程调试] 使用主机 libthread_db 库 “/lib/x86_64-linux-gnu/libthread_db.so.1”。致命错误:所有 CUDA 设备都被用于显示,调试时无法使用。(错误代码 = 24)。