如何使用 cudaMalloc3D 分配并操作 CUDA 3D 内存

时间:2016-03-21 08:00:24

标签: c cuda

我正在学习 CUDA 中的 3D 数组操作。我实现了下面的代码,但无法得到预期的结果:我想接收一个数组,并把其中的元素从 0 改为 1。我试着查找错误,但没能找到。有人能指出我代码中的错误吗?

 // Integer quotient of a by b, bumped up by one whenever the division leaves
 // a remainder. Used to size a launch grid so that it covers every element.
 int iDivUp(int a, int b) {
   const int quotient = a / b;
   const bool has_remainder = (a % b) != 0;
   if (has_remainder) {
     return quotient + 1;
   }
   return quotient;
 }

// Writes the value 1 into three consecutive elements of one row of a pitched
// allocation, starting at column t_idx of row t_idy.
//
// d_pitched_ptr : pitched pointer describing the allocation (base + pitch).
// COLS, ROWS, D : passed in by the caller but not read anywhere in this body.
//
// NOTE(review): the row is accessed through a float*, while the host code in
// iFilter() sizes and fills this allocation from an int array, so the bytes
// written here are float bit patterns rather than the integer 1.
// NOTE(review): t_idx / t_idy are never compared against COLS / ROWS, so
// threads of a grid that is rounded up past the data extent still write.
__global__
void kernel(cudaPitchedPtr d_pitched_ptr, int COLS, int ROWS, int D) {
    // Global 2D thread coordinates: x selects the column, y selects the row.
    int t_idx = threadIdx.x + blockIdx.x * blockDim.x;
    int t_idy = threadIdx.y + blockIdx.y * blockDim.y;

    // Byte-addressed base pointer, because pitch is expressed in bytes.
    char* d_ptr = static_cast<char*>(d_pitched_ptr.ptr);
    size_t pitch = d_pitched_ptr.pitch;

    // Step t_idy rows down in bytes, then t_idx elements along that row.
    float* element  = (float*)(d_ptr + t_idy * pitch) + t_idx;
    // These three stores hit columns t_idx, t_idx + 1 and t_idx + 2 of the
    // same row; no depth offset is applied.
    element[0] = 1;
    element[1] = 1;
    element[2] = 1;
}

// Uploads a zero-filled int volume to a pitched 3D device allocation, runs
// `kernel` over it, downloads the result, prints every value and stores it
// into `image`.
//
// Relies on `blocksize`, `image`, `kernel` and `iDivUp` being declared
// elsewhere in the translation unit.
//
// NOTE(review): none of the CUDA runtime return codes are inspected, so a
// failed allocation, copy or launch goes unnoticed.
// NOTE(review): the copy descriptors describe rows of COLS ints
// (pitch = COLS * sizeof(int)), whereas the host arrays are declared
// [COLS][ROWS][DEPTH] with DEPTH as the contiguous dimension - confirm the
// intended layout.
void iFilter() {
    const int ROWS = 100;
    const int COLS = 120;
    const int DEPTH = 3;
    int pixels[COLS][ROWS][DEPTH];

    // Start from an all-zero input volume.
    for (int j = 0; j < ROWS; j++) {
        for (int i = 0; i < COLS; i++) {
            for (int k = 0; k < DEPTH; k++) {
                pixels[i][j][k] = 0;
            }
        }
    }

    // Extent width is given in bytes; height and depth are element counts.
    cudaExtent extent = make_cudaExtent(COLS * sizeof(int), ROWS, DEPTH);
    cudaPitchedPtr d_pitched_ptr;
    cudaMalloc3D(&d_pitched_ptr, extent);

    // Host -> device copy: source is the tightly packed host array,
    // destination uses the pitch chosen by cudaMalloc3D.
    cudaMemcpy3DParms d_parms = {0};
    d_parms.srcPtr.ptr = pixels;
    d_parms.srcPtr.pitch = COLS * sizeof(int);
    d_parms.srcPtr.xsize = COLS;
    d_parms.srcPtr.ysize = ROWS;

    d_parms.dstPtr.ptr = d_pitched_ptr.ptr;
    d_parms.dstPtr.pitch = d_pitched_ptr.pitch;
    d_parms.dstPtr.xsize = COLS;
    d_parms.dstPtr.ysize = ROWS;

    d_parms.extent.width = COLS * sizeof(int);
    d_parms.extent.height = ROWS;
    d_parms.extent.depth = DEPTH;
    d_parms.kind = cudaMemcpyHostToDevice;

    cudaMemcpy3D(&d_parms);

    // One thread per (column, row); the grid is rounded up to whole blocks.
    dim3 block_size(blocksize, blocksize);
    dim3 grid_size(iDivUp(COLS, blocksize), iDivUp(ROWS, blocksize));

    kernel<<<grid_size, block_size>>>(
        d_pitched_ptr, COLS, ROWS, DEPTH);

    // Device -> host copy: reuse the descriptor with source and destination
    // swapped; extent and xsize/ysize carry over unchanged.
    int download_pixels[COLS][ROWS][DEPTH];
    d_parms.srcPtr.ptr = d_pitched_ptr.ptr;
    d_parms.srcPtr.pitch = d_pitched_ptr.pitch;
    d_parms.dstPtr.ptr = download_pixels;
    d_parms.dstPtr.pitch = COLS * sizeof(int);
    d_parms.kind = cudaMemcpyDeviceToHost;

    cudaMemcpy3D(&d_parms);

    // Store each downloaded value into the image and echo it, one line of
    // DEPTH values per (row, column) pair.
    for (int j = 0; j < ROWS; j++) {
        for (int i = 0; i < COLS; i++) {
            for (int k = 0; k < DEPTH; k++) {
                image.at<cv::Vec3b>(j, i)[k] = download_pixels[i][j][k];
                 std::cout << download_pixels[i][j][k]  << " ";
            }
             std::cout << "\n";
        }
    }
}

输出:我得到的全是 0,而不是 1。

1 个答案:

答案 0 :(得分:3)

我在你的代码中找到的唯一错误是:你传给内核的是一个 int 数组,但在内核中却把它当作 float 数组来操作:

float* element  = (float*)(d_ptr + t_idy * pitch) + t_idx;

修复该问题后,我用根据你的代码写成的测试程序运行,没有报错,最终检查中所有的值都为 1:

$ cat t1114.cu
#include <iostream>
// Edge length, in threads, of the square 2D thread block used for the launch.
const int blocksize = 16;
 // Divides a by b and rounds the quotient up by one when a remainder is left,
 // giving the number of b-sized blocks needed to cover a elements.
 int iDivUp(int a, int b) {
   int blocks = a / b;
   if ((a % b) != 0) {
     blocks += 1;
   }
   return blocks;
 }

// Sets every element of a pitched 3D int volume to 1.
//
// Launch layout: a 2D grid with one thread per (column, row) position; the
// grid may be rounded up past the data extent. Each thread walks all D depth
// slices at its own (x, y) position.
//
// d_pitched_ptr : pitched device pointer as filled in by cudaMalloc3D.
// COLS          : number of int elements per row (x extent).
// ROWS          : number of rows per slice (y extent); also the row count
//                 used to step from one depth slice to the next.
// D             : number of depth slices (z extent).
__global__
void kernel(cudaPitchedPtr d_pitched_ptr, int COLS, int ROWS, int D) {
    int t_idx = threadIdx.x + blockIdx.x * blockDim.x;
    int t_idy = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads in the rounded-up tail of the grid have nothing to do; without
    // this guard they would write into row padding and neighbouring slices.
    if (t_idx >= COLS || t_idy >= ROWS) {
        return;
    }

    // Pitch is in bytes, so all row/slice arithmetic is done on a char*.
    char* d_ptr = static_cast<char*>(d_pitched_ptr.ptr);
    size_t pitch = d_pitched_ptr.pitch;
    // Consecutive depth slices are ROWS pitched rows apart.
    size_t slice_pitch = pitch * static_cast<size_t>(ROWS);

    for (int z = 0; z < D; ++z) {
        char* slice = d_ptr + static_cast<size_t>(z) * slice_pitch;
        int* row = reinterpret_cast<int*>(slice + static_cast<size_t>(t_idy) * pitch);
        row[t_idx] = 1;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Round-trips a COLS x ROWS x DEPTH int volume through a pitched 3D device
// allocation: upload zeros, run `kernel`, download, and verify that every
// element came back as 1.
//
// Returns 0 when all elements equal 1; returns 1 on the first mismatch or on
// any CUDA runtime failure.
int main() {
    const int ROWS = 100;
    const int COLS = 120;
    const int DEPTH = 3;

    // Host volumes are laid out [z][y][x] so that x is the contiguous
    // dimension, matching the copy descriptors below (rows of COLS ints,
    // ROWS rows per slice, DEPTH slices).
    // `static` keeps the two buffers off the stack and zero-initialises them,
    // so the upload is all zeros and any element the download fails to
    // overwrite is caught by the verification loop.
    static int pixels[DEPTH][ROWS][COLS];
    static int download_pixels[DEPTH][ROWS][COLS];

    // Extent width is given in bytes; height and depth are element counts.
    cudaExtent extent = make_cudaExtent(COLS * sizeof(int), ROWS, DEPTH);
    cudaPitchedPtr d_pitched_ptr;
    if (!cudaCallOk(cudaMalloc3D(&d_pitched_ptr, extent), "cudaMalloc3D")) {
        return 1;
    }

    // Host -> device: tightly packed host rows into pitched device rows.
    cudaMemcpy3DParms d_parms = {0};
    d_parms.srcPtr.ptr = pixels;
    d_parms.srcPtr.pitch = COLS * sizeof(int);
    d_parms.srcPtr.xsize = COLS;
    d_parms.srcPtr.ysize = ROWS;

    d_parms.dstPtr.ptr = d_pitched_ptr.ptr;
    d_parms.dstPtr.pitch = d_pitched_ptr.pitch;
    d_parms.dstPtr.xsize = COLS;
    d_parms.dstPtr.ysize = ROWS;

    d_parms.extent = extent;
    d_parms.kind = cudaMemcpyHostToDevice;

    bool ok = cudaCallOk(cudaMemcpy3D(&d_parms), "cudaMemcpy3D (host to device)");

    if (ok) {
        // One thread per (column, row); the grid is rounded up to whole blocks.
        dim3 block_size(blocksize, blocksize);
        dim3 grid_size(iDivUp(COLS, blocksize), iDivUp(ROWS, blocksize));

        kernel<<<grid_size, block_size>>>(
            d_pitched_ptr, COLS, ROWS, DEPTH);
        // A launch does not return a status; configuration errors are only
        // visible through cudaGetLastError().
        ok = cudaCallOk(cudaGetLastError(), "kernel launch");
    }

    if (ok) {
        // Device -> host: same descriptor with the two ends swapped. The
        // blocking copy also surfaces any fault raised while the kernel ran.
        d_parms.srcPtr.ptr = d_pitched_ptr.ptr;
        d_parms.srcPtr.pitch = d_pitched_ptr.pitch;
        d_parms.dstPtr.ptr = download_pixels;
        d_parms.dstPtr.pitch = COLS * sizeof(int);
        d_parms.kind = cudaMemcpyDeviceToHost;

        ok = cudaCallOk(cudaMemcpy3D(&d_parms), "cudaMemcpy3D (device to host)");
    }

    // Release the device allocation on every path past a successful malloc.
    cudaFree(d_pitched_ptr.ptr);
    if (!ok) {
        return 1;
    }

    // The braces matter: only a mismatching element may end the run early.
    for (int k = 0; k < DEPTH; k++) {
        for (int j = 0; j < ROWS; j++) {
            for (int i = 0; i < COLS; i++) {
                if (download_pixels[k][j][i] != 1) {
                    std::cout << i << "," << j << "," << k << ": " << download_pixels[k][j][i]  << " error! " << std::endl;
                    return 1;
                }
            }
        }
    }
    return 0;
}
$ nvcc -o t1114 t1114.cu
$ cuda-memcheck ./t1114
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$