Question

这是下面的代码：

#include <stdio.h>
#include <cuda.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#define _crt_nonstdc_no_deprecate


__global__ void Kernel(float *d_arr_i,float *d_arr_o)
{
    int i = threadIdx.x;
    int j = threadIdx.y;
    int k = threadIdx.z;
    float f = d_arr_i[i];
    d_arr_o[i] = f * f;
    printf("%d \n",f);
    printf("x = %d & y = %d & z = %d \n",i,j,k);
}

int main ()
{   
    //printf("Hello C..!");
    const unsigned int arr_s = 12;

    float h_arr_i[arr_s];
    for (int i = 0;i < arr_s;i++)
    {
        h_arr_i[i] = float(i);
    }
    for (int i = 0;i<arr_s;i++)
    {
        printf("input arr %d : %e \n",(int)i,h_arr_i[i]);
    }
    float h_arr_o[arr_s];

    float* d_arr_i;
    float* d_arr_o;
    const unsigned int d_arr_s = arr_s*sizeof(float);

    cudaMalloc((void**)&d_arr_i,d_arr_s);
    cudaMalloc((void**)&d_arr_o,d_arr_s);

    cudaMemcpy(d_arr_i,h_arr_i,d_arr_s,cudaMemcpyHostToDevice);
    Kernel<<<1,arr_s>>>(d_arr_i,d_arr_o);
    cudaMemcpy(h_arr_o,d_arr_o,d_arr_s,cudaMemcpyDeviceToHost);

    printf("\n");
    for (int i = 0;i < arr_s;i++)
    {
        printf("output arr : %d \n",h_arr_o[i]);
    }
    int d;
    cudaDeviceProp c;
    int e;
    cudaGetDeviceProperties(&c,e);
    printf("\n %e",e);
    cudaGetDeviceCount(&d);
    printf("\n %d \n",d);
    cudaFree(d_arr_i);
    cudaFree(d_arr_o);
    system("Pause");
    return 0;
}

$![Console Output](C:\Untitled.png)$

在上面的代码中，“output arr”应该给出数组中数字的平方，但它不会。

有人可以解释为什么会发生这种错误吗？一些细节，我正在运行代码： CUDA 2.3 /仿真模式/没有NVIDIA GPU 操作系统：Windows 7 64位 Visual Studio 2005 SP1

Answer 1

您使用的是错误的printf格式说明符。此错误发生两次，一次在内核中，一次在代码结束时输出打印输出。

而不是%d，您应该使用%f。当我对您的代码进行更改时，使用CUDA 5.0在真正的GPU上运行时，我得到了正确的结果：

$ ./t212
input arr 0 : 0.000000e+00
input arr 1 : 1.000000e+00
input arr 2 : 2.000000e+00
input arr 3 : 3.000000e+00
input arr 4 : 4.000000e+00
input arr 5 : 5.000000e+00
input arr 6 : 6.000000e+00
input arr 7 : 7.000000e+00
input arr 8 : 8.000000e+00
input arr 9 : 9.000000e+00
input arr 10 : 1.000000e+01
input arr 11 : 1.100000e+01
0.000000
1.000000
2.000000
3.000000
4.000000
5.000000
6.000000
7.000000
8.000000
9.000000
10.000000
11.000000
x = 0 & y = 0 & z = 0
x = 1 & y = 0 & z = 0
x = 2 & y = 0 & z = 0
x = 3 & y = 0 & z = 0
x = 4 & y = 0 & z = 0
x = 5 & y = 0 & z = 0
x = 6 & y = 0 & z = 0
x = 7 & y = 0 & z = 0
x = 8 & y = 0 & z = 0
x = 9 & y = 0 & z = 0
x = 10 & y = 0 & z = 0
x = 11 & y = 0 & z = 0

output arr : 0.000000
output arr : 1.000000
output arr : 4.000000
output arr : 9.000000
output arr : 16.000000
output arr : 25.000000
output arr : 36.000000
output arr : 49.000000
output arr : 64.000000
output arr : 81.000000
output arr : 100.000000
output arr : 121.000000

显然错误导致CUDA仿真

1 个答案: