我从 http://llpanorama.wordpress.com/2008/05/21/my-first-cuda-program/ 找到了以下程序。
不幸的是,我无法直接复制粘贴它,因为粘贴后代码会变得混乱。
它将一个数字向量作为输入,然后把向量的每个元素与自身相乘作为输出。我在自己计算机上安装的模拟器上运行它,得到以下输出:
0 0.000000
1 1.000000
2 4.000000
3 9.000000
4 16.000000
5 25.000000
6 36.000000
7 49.000000
8 64.000000
9 81.000000
然而,如果我在一台运行 Debian 且配有 CUDA 兼容 GPU 的远程计算机上运行它,并输入以下命令:
nvcc test.cu -lcudart -o test
./test
它会给出以下输出:
0 0.000000
1 1.000000
2 2.000000
3 3.000000
4 4.000000
5 5.000000
6 6.000000
7 7.000000
8 8.000000
9 9.000000
为什么会这样?提前谢谢!
答案 0 :(得分:3)
问题在于代码没有进行错误检查,而远程计算机上的某个 CUDA 调用出了问题。输出与输入完全相同,说明内核很可能根本没有成功执行,或者数据没有被正确拷贝。请为该代码添加错误检查(error checking,这并不难),重新运行,然后看看报告了什么错误。如果您仍有问题,请回复。
以下是加入了错误检查的修改后代码:
// example1.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
// Checks the CUDA runtime's last recorded error and aborts on failure.
//
// Expands to a single statement (do { ... } while (0)), so it can be used in
// an unbraced if/else. It calls cudaGetLastError(), which reads and clears
// the runtime's last error. If that error is not cudaSuccess, the macro
// prints `msg`, the runtime's error string and the file/line of the
// expansion to stderr, then terminates the process with exit(1).
//
//   msg: C string (printed with %s) naming the step that was just attempted.
//
// The local variable deliberately avoids a double underscore: identifiers
// containing "__" are reserved for the implementation in C++.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    (msg), cudaGetErrorString(cuda_check_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Kernel: squares every element of an array in place.
//
//   a: pointer to N floats that the device can access; each a[i] is
//      replaced by a[i] * a[i].
//   N: number of elements; N <= 0 makes the kernel a no-op.
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so every
// element is processed exactly once for any grid/block size -- the grid does
// not have to cover N. No shared memory is used.
__global__ void square_array(float *a, int N)
{
    // 64-bit arithmetic so neither the start index nor the stride can wrap.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < N; i += stride) {
        const float v = a[i];
        a[i] = v * v;
    }
}
// Host entry point: squares the numbers 0..N-1 on the GPU and prints them.
//
// Steps: allocate a host buffer and a device buffer of N floats, fill the
// host buffer with 0..N-1, copy it to the device, launch square_array, copy
// the result back, print one "index value" line per element, and release
// both buffers.
//
// Returns EXIT_SUCCESS on success and EXIT_FAILURE if the host allocation
// fails. Any CUDA error terminates the process through cudaCheckErrors.
int main(void)
{
    const int N = 10;                        // Number of elements in arrays
    const size_t size = N * sizeof(float);   // Byte size shared by every copy

    // Allocate array on host; a NULL result would crash the fill loop below.
    float *a_h = (float *)malloc(size);
    if (a_h == NULL) {
        fprintf(stderr, "Fatal error: host malloc of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }

    // Allocate array on device
    float *a_d = NULL;
    cudaMalloc((void **)&a_d, size);
    cudaCheckErrors("cudaMalloc fail");

    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) {
        a_h[i] = (float)i;
    }
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy 1 fail");

    // Do calculation on device: enough blocks to cover all N elements
    const int block_size = 4;
    const int n_blocks = (N + block_size - 1) / block_size;   // ceil(N / block_size)
    square_array<<<n_blocks, block_size>>>(a_d, N);
    // A bad launch configuration is reported right after the launch ...
    cudaCheckErrors("kernel launch fail");
    // ... while faults during execution only surface once the device is synced.
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");

    // Print results
    for (int i = 0; i < N; i++) {
        printf("%d %f\n", i, a_h[i]);
    }

    // Cleanup
    cudaFree(a_d);
    cudaCheckErrors("cudaFree fail");
    free(a_h);
    return EXIT_SUCCESS;
}