Basic CUDA program (writing values to a matrix and std::cout don't work); main function doesn't seem to start

Asked: 2018-09-18 09:13:42

Tags: c++ cuda std cout

I have written a very simple CUDA program. I want to assign values to a matrix in device memory, then copy the values back to the host and display them. The program I wrote does not work, and I don't know why. I tried to figure out what I was doing wrong by printing status messages with cout, but even that doesn't work, so I think the main function never starts.

Does anyone know what is going wrong?

Here is my code:

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <iostream>
    #include <stdio.h>
    const int N = 1024;


    __global__ void matrix(float *d_A)
    {
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        int row = blockIdx.y * blockDim.y + threadIdx.y;


        int index = col + row * N;
        if (col < N && row < N)
        {
            d_A[index] = 255;
        }
    }
    int main()
    {
        std::cout << "Programm begins";
        float A[N * N];
        float d_A[N * N];

        cudaMalloc((void**)&d_A, (N * N)*sizeof(float));
        std::cout << "Matrizes allocated";
        std::cout << A[0] << " , " << A[1] << " , " << A[2] << " , " << A[3] << " , " << A[4] << " , " << A[5] << "\n";
        std::cout << A[1024] << " , " << A[1025] << " , " << A[1026] << " , " << A[1027] << " , " << A[1028] << " , " << A[1029] << "\n";
    matrix<<<1024, 1024>>>(d_A);
        std::cout << "Wrote Matrix to local device memory";
        std::cout << d_A[0] << " , " << d_A[1] << " , " << d_A[2] << " , " << d_A[3] << " , " << d_A[4] << " , " << d_A[5] << "\n";
        std::cout << d_A[1024] << " , " << d_A[1025] << " , " << d_A[1026] << " , " << d_A[1027] << " , " << d_A[1028] << " , " << d_A[1029] << "\n";


        cudaMemcpy(A, d_A, N * N * sizeof(float), cudaMemcpyDeviceToHost);
        std::cout << "Wrote Matrix to host memory";
        std::cout << A[0] << " , " << A[1] << " , " << A[2] << " , " << A[3] << " , " << A[4] << " , " << A[5] << "\n";
        std::cout << A[1024] << " , " << A[1025] << " , " << A[1026] << " , " << A[1027] << " , " << A[1028] << " , " << A[1029] << "\n";

        return 0;
    }

2 Answers:

Answer 0 (score: 2)

There are a few problems with your code, and since this looks like your first step into CUDA and C++, I would simplify the code further. Give this a try (the important changes are surrounded by star comments):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>

const int Grids = 256;
const int Threads = 256;

__global__ void matrix(float *d_A)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    d_A[idx] = 1.0;
}

int main()
{
    std::cout << "Programm begins";

    // ****
    float *A = new float[Grids * Threads];
    float *d_A;
    // ****

    cudaMalloc((void**)&d_A, (Grids * Threads)*sizeof(float));

    matrix<<<Grids, Threads>>>(d_A);

    cudaMemcpy(A, d_A, Grids * Threads*sizeof(float), cudaMemcpyDeviceToHost);

    for(int i=0; i < (Grids * Threads); ++i)
    {
         cout << A[i] << ",";
    }

    // ****
    cudaFree(d_A);
    delete A;
    // ****  

    return 0;
}

Also have a look at the basic examples here: https://devblogs.nvidia.com/easy-introduction-cuda-c-and-c/

There are several problems here:

1) The N * N (1024 * 1024) floats of host memory you are allocating is large and probably cannot be provided on the stack

2) By declaring d_A as an array, you declared host memory for it in addition to the device memory, which is not necessary

3) You never free the device memory allocated for d_A

4) Your device/GPU may not be able to run 1024 threads at a time; in that case it may fail silently and you end up with the kernel never running. A minimal way to detect this is sketched below.
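
Regarding point 4: a failed kernel launch raises no exception on the host, but it can be detected right after the launch. A minimal sketch, reusing the kernel and launch configuration from the code above:

    matrix<<<Grids, Threads>>>(d_A);

    // cudaGetLastError reports launch/configuration errors,
    // e.g. a block size the device cannot support.
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
        std::cerr << "Launch failed: " << cudaGetErrorString(launchErr) << std::endl;

    // cudaDeviceSynchronize waits for the kernel to finish and
    // reports errors that occurred while it was executing.
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (syncErr != cudaSuccess)
        std::cerr << "Execution failed: " << cudaGetErrorString(syncErr) << std::endl;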

Answer 1 (score: 2)

There are a few problems with the code you provided.

  1. Dereferencing device memory on the host, e.g. d_A[0], is illegal and causes undefined behavior.
  2. Inside the kernel the data is indexed in 2 dimensions, but the grid and block are launched in 1 dimension. In that configuration the row variable is always 0 and effectively plays no part in computing index, so only the first row of the matrix is ever written. Define the grid and block sizes as dim3 to create a 2D grid and 2D blocks.
  3. Creating a large array on the stack, such as float A[N*N];, is not recommended. Prefer dynamic memory allocation with the new operator.
  4. Allocating device memory to the already allocated host array d_A is undefined behavior. If you want to allocate device memory for a variable, just declare it as a plain pointer, e.g. float* d_A;.

The fixed code looks like this:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
const int N = 1024;

__global__ void matrix(float *d_A)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;


    int index = col + row * N;
    if (col < N && row < N)
    {
        d_A[index] = 255;
    }
}
int main()
{
    std::cout << "Programm begins"<<std::endl;
    float *A = new float[N*N];
    float *d_A;

    cudaMalloc((void**)&d_A, (N * N)*sizeof(float));
    std::cout << "Matrizes allocated"<<std::endl;
    std::cout << A[0] << " , " << A[1] << " , " << A[2] << " , " << A[3] << " , " << A[4] << " , " << A[5] <<std::endl;
    std::cout << A[1024] << " , " << A[1025] << " , " << A[1026] << " , " << A[1027] << " , " << A[1028] << " , " << A[1029] <<std::endl;

    dim3 block(32,32);
    dim3 grid;
    grid.x = (N + block.x - 1) / block.x;
    grid.y = (N + block.y - 1) / block.y;

    matrix << <grid, block >> >(d_A);
    std::cout << "Wrote Matrix to local device memory"<<std::endl;

    cudaMemcpy(A, d_A, N * N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "Wrote Matrix to host memory"<<std::endl;
    std::cout << A[0] << " , " << A[1] << " , " << A[2] << " , " << A[3] << " , " << A[4] << " , " << A[5] <<std::endl;
    std::cout << A[1024] << " , " << A[1025] << " , " << A[1026] << " , " << A[1027] << " , " << A[1028] << " , " << A[1029] <<std::endl;

    cudaFree(d_A);
    delete[] A;

    return 0;
}

It is strongly recommended to add error checking to every CUDA API call to ease the debugging process.
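
A common way to do this (shown here as one possible helper, not part of the CUDA API itself) is a small macro that wraps each runtime call and reports the file and line on failure:

    #include <cstdio>
    #include <cstdlib>

    // Wraps a CUDA runtime call and aborts with a readable
    // message if it did not return cudaSuccess.
    #define CUDA_CHECK(call)                                          \
        do {                                                          \
            cudaError_t err = (call);                                 \
            if (err != cudaSuccess) {                                 \
                fprintf(stderr, "CUDA error '%s' at %s:%d\n",         \
                        cudaGetErrorString(err), __FILE__, __LINE__); \
                exit(EXIT_FAILURE);                                   \
            }                                                         \
        } while (0)

For example, the calls in the fixed code above would become:

    CUDA_CHECK(cudaMalloc((void**)&d_A, (N * N) * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(A, d_A, N * N * sizeof(float), cudaMemcpyDeviceToHost));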