我正在努力学习cuda。我正在尝试运行一个简单的代码
#include <stdlib.h>
#include <stdio.h>
__global__ void kernel(int *array)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
array[index] = 7;
}
int main(void)
{
int num_elements = 256;
int num_bytes = num_elements * sizeof(int);
// pointers to host & device arrays
int *device_array = 0;
int *host_array = 0;
// malloc a host array
host_array = (int*)malloc(num_bytes);
// cudaMalloc a device array
cudaMalloc((void**)&device_array, num_bytes);
int block_size = 128;
int grid_size = num_elements / block_size;
kernel<<<grid_size,block_size>>>(device_array);
// download and inspect the result on the host:
cudaMemcpy(host_array, device_array, num_bytes, cudaMemcpyDeviceToHost);
// print out the result element by element
for(int i=0; i < num_elements; ++i)
{
printf("%d ", host_array[i]);
}
// deallocate memory
free(host_array);
cudaFree(device_array);
}
它应该打印7,但它打印0 这句话似乎没有被执行 “内核&LT;&LT;&GT;&GT;(device_array);” 它也没有给出任何编译错误。 任何帮助?
答案 0 :(得分:2)
代码在我的机器上运行正常,但请确保在内核调用后添加cudaDeviceSynchronize
和错误检查。
按如下所示更改代码以检查错误:
kernel<<<grid_size,block_size>>>(device_array);
// wait until tasks are completed
cudaDeviceSynchronize();
// check for errors
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
fprintf(stderr, "ERROR: %s \n", cudaGetErrorString(error));
}