Question

我正在用CUDA 6.0 C / C ++编写一个示例程序。程序可以识别设备，但在运行时似乎有错误：结果数组的元素都等于0，没有任何理由。（我的GPU：Geforce EN9400GT华硕）这是我的代码

     #include <stdio.h>
     #include <malloc.h>
     #include <cuda_runtime.h>
     #define    SIZE 1024

     __global__ void VectorAdd(int* a, int* b, int* c, int n)
     {
        int i = threadIdx.x;

    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

void printResult(int* ar) {
    for (int i = 0; i < 10; i++) {
        printf("[%d] = %d\n", i, ar[i]);
    }
}

int main() {
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    int device, count;
    cudaDeviceProp* prop = (cudaDeviceProp*)malloc(sizeof(cudaDeviceProp));

    int GPUavail = cudaGetDeviceCount(&count);
    if (GPUavail != cudaSuccess) {
        printf("There is no GPU device available\n");
        exit(EXIT_FAILURE);
    }

    cudaGetDeviceProperties(prop, device);
    printf("Device name: %s\n", prop->name);
    printf("Global memory: %zd\n", prop->totalGlobalMem);
    printf("Shared memory: %zd\n", prop->sharedMemPerBlock);
    printf("Max threads per block: %d\n", prop->maxThreadsPerBlock);
    printf("Device ID: %d\n", prop->pciDeviceID);
    printf("TCC Driver: %d\n", prop->tccDriver);

    a = (int*)malloc(SIZE * sizeof(int));
    b = (int*)malloc(SIZE * sizeof(int));
    c = (int*)malloc(SIZE * sizeof(int));

    cudaMalloc(&d_a, SIZE*sizeof(int));
    cudaMalloc(&d_b, SIZE*sizeof(int));
    cudaMalloc(&d_c, SIZE*sizeof(int));

    for (int i = 0; i < SIZE; i++) {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }

    cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);

    VectorAdd << < 1, SIZE >> >(d_a, d_b, d_c, SIZE);

    cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);

    printResult(c);

    free(a);
    free(b);
    free(c);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}

来自：https://developer.nvidia.com/how-to-cuda-c-cpp

这是显示的结果：

Answer 1

您的GPU每个块最多只能启动512个线程，正如您在程序输出中所说的那样。（Max threads per block）但是，您要在块中启动1024个线程。因为您使用无效的启动配置启动了内核，所以内核根本没有启动。您应该更改块中的线程数。

#define SIZE 512

每个块的线程限制为1024，计算能力＆gt; = 2.0，但您的GPU的计算能力为1.0。

错误的结果是CUDA 6.0

1 个答案: