Question

我只是想编写一个非常简单的例子来自Nvidia编程指南，该指南展示了如何使用cudaMalloc3D在设备上分配3D数据。

这是我正在使用的代码，编译时没有任何错误。 cuda-memcheck运行也没有错误。我使用CC 3.0的显卡。

这是代码：

#include <iostream>
#include <stdio.h>

typedef float PixelType;

__global__ void extract_patches_from_image_data(cudaPitchedPtr devicePitchedPointer,     dim3 image_dimensions)
{
    // Test
    printf("HELLO - PLEASE PRINT THIS\n");

    // Check image dimensions
    printf("Current x: %d\n", image_dimensions.x);
    printf("Current y: %d\n", image_dimensions.y);
    printf("Current z: %d\n", image_dimensions.z);

    // Get attributes from device pitched pointer
    char     *devicePointer  =   (char *)devicePitchedPointer.ptr;
    size_t    pitch          =   devicePitchedPointer.pitch;
    size_t    slicePitch     =   pitch * image_dimensions.y;

    // Loop over image data
    for(int z = 0; z < image_dimensions.z; ++z)
    {
         char *current_slice = devicePointer + z * slicePitch;


        for(int y = 0; y < image_dimensions.y; ++y)
        {
            PixelType *current_row = (PixelType *)(current_slice + y * pitch);

            for(int x = 0; x < image_dimensions.x; ++x)
            {
                PixelType current_element = current_row[x];

                printf("Current element: %d\n", current_element);
            }
        }
    }
}

int main(void)
{
    // Set up test data
    PixelType image_data[3][3][3] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26};
    dim3  image_dimensions = dim3(32, 32, 32);

    // Allocate 3D memory on the device
    cudaExtent volumeSizeBytes = make_cudaExtent(sizeof(PixelType) * image_dimensions.x, image_dimensions.y, image_dimensions.z);
    cudaPitchedPtr devicePitchedPointer;
    cudaMalloc3D(&devicePitchedPointer, volumeSizeBytes);

    // Kernel Launch Configuration
    dim3 threads_per_block = dim3(32, 32, 1);
    dim3 blocks_per_grid = dim3(32, 1, 1);
    extract_patches_from_image_data<<<blocks_per_grid, threads_per_block>>>(devicePitchedPointer, image_dimensions);
}

我现在的问题是：为什么没有输出写入控制台？似乎永远不会调用内核，但我无法弄清楚原因。我尝试使用Eclipse Nsight，并编写自己的makefile，如下所示：

main: main.cu Makefile
nvcc -arch=sm_20 -o main main.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall

Answer 1

缺少电话： cudaFree(devicePitchedPointer.ptr)

为什么我需要这个才能使printf工作？.....

Answer 2

image_dimensions变量在主机内存中声明，但在内核函数中使用。您必须将此变量从主机复制到设备，以便设备功能可以正常执行。

cudaMalloc3D - 简单示例

2 个答案: