Question

我正在尝试在cuda中重新创建一种算法，并且其中一个步骤是模糊3d图像（在我的情况下为344 x 288 x 176），并且我想为其使用3x3x3高斯蒙版。
我在带有Intel i7-5500U，GeForce gt840m 4GB和8GB ram的计算机上的Visual Studio Community 2015中使用cuda 8，而现在我在这个问题上苦苦挣扎了几天。
问题是，当我的程序结束模糊函数时，似乎丢失了由指针作为模糊函数的参数传递的内存信息，它不会在函数或cudaPeekAtLastError或cudaDeviceSynchronize上崩溃，但是当我从Nsight运行程序时>启动CUDA对其进行调试工作得很好，如果我将断点放置在此函数之前并运行它，则cudaPeekAtLastError和cudaDeviceSynchronize会一步一步地运行，那么有时它有时不会起作用。
内存检查说：

==========错误：进程未成功终止
==========从主机取消引用统一内存时，应用程序可能会遇到错误。请在主机调试器下重新运行该应用程序以捕获此类错误。
=========由于对cudaDeviceSynchronize的CUDA API调用上出现“未指定的启动失败”，导致程序命中cudaErrorLaunchFailure（错误4）。

我的代码中有类似的操作，而且所有操作都工作正常，我只是看不到这有什么问题。我不是十全十美的cuda程序员，我有点c并且仍然在学习cuda，我知道我的英语也很烂：）

所以有这个功能：

__global__ void gaussBlur(float *img,
                          float *gaussKernel, 
                          int imgSize, 
                          int kernelSize, 
                          int *imgDims, 
                          int *gaussKernelDims) {

    float newVoxelValue = 0;
    int temp_idx = 0;
    int kernel_x, kernel_y, kernel_z, temp_i, indeksWartosciDoPobrania = 0;
    int img_z, img_y, img_x;

    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int startIdx = (x * imgDims[1] + y) * imgDims[0];

    int start = startIdx;
    int stop = startIdx + imgDims[0];

    for (int idx = start; idx < stop; idx++) {
        if (idx < imgSize) {
            img_z = idx / (imgDims[0] * imgDims[1]);
            temp_idx = idx - (img_z * imgDims[0] * imgDims[1]);
            img_y = temp_idx / imgDims[0];
            img_x = temp_idx % imgDims[0];
            if (img_x > 1 && img_x < imgDims[0] - 2 &&
                img_y > 1 && img_y < imgDims[1] - 2 &&
                img_z > 1 && img_z < imgDims[2] - 2) {
                newVoxelValue = 0;
                for (int i = 0; i < kernelSize; i++) {
                    kernel_z = i / (gaussKernelDims[0] * gaussKernelDims[1]);
                    temp_i = i - (kernel_z * gaussKernelDims[0] * gaussKernelDims[1]);
                    kernel_y = temp_i / gaussKernelDims[0];
                    kernel_x = temp_i % gaussKernelDims[0];

                    kernel_x--;kernel_y--;kernel_z--;
                    indeksWartosciDoPobrania = ((img_z + kernel_z) * imgDims[0] * imgDims[1]) + ((img_y + kernel_y) * imgDims[0]) + (img_x + kernel_x);
                newVoxelValue += gaussKernel[i] * 
img[indeksWartosciDoPobrania];
                }
                img[idx] = newVoxelValue;
            }
        }
    }
}

这是它的电话：

inline __device__ __host__ unsigned int UMIN(unsigned int a, unsigned int b)
{
    return a < b ? a : b;
}

inline __device__ __host__ unsigned int PowTwoDivider(unsigned int n)
{
    if (n == 0) return 0;
    unsigned int divider = 1;
    while ((n & divider) == 0) divider <<= 1;
    return divider;
}
int main() {
    float *out_image;
    float *gaussKernel;

    int *gaussKernelDims;
    int *imgDims;

    cudaMallocManaged((void **)&gaussKernelDims, 3 * sizeof(*gaussKernelDims)); //lets say its hard coded {344, 288, 176}
    cudaMallocManaged((void **)&imgDims, 3 * sizeof(*imgDims));//lets say its hard coded {3, 3, 3}

    std::ifstream file("image.bin", std::ios::binary);
    if (!file.is_open()) {
        std::cout << "\n\nNie udalo sie otworzyc pliku obrazu";
        return -1;
    }
    char * memblock;
    file.seekg(0, std::ios::end);
    int sizeOfFile = file.tellg();
    memblock = new char[sizeOfFile];
    file.seekg(0, std::ios::beg);
    file.read(memblock, sizeOfFile);
    file.close();

    size = sizeOfFile / sizeof(*out_image);
    cudaMallocManaged((void **)&out_image, size * sizeof(*out_image));
    memcpy(out_image, memblock, sizeOfFile);
    free(memblock);
    //gaussKernel is loaded same way and it works for sure


    unsigned int dimX = UMIN(UMIN(PowTwoDivider(imgDims[0]), PowTwoDivider(imgDims[1])), 64);
    unsigned int dimY = UMIN(UMIN(PowTwoDivider(imgDims[2]), PowTwoDivider(imgDims[1])), 512 / dimX);
    dim3 dimBlock(dimX, dimY);
    dim3 dimGridX(imgDims[1] / dimBlock.x, imgDims[2] / dimBlock.y);

    gaussBlur<<< dimGridX, dimBlock >>>(out_image, gaussKernel, size, gaussKernelSize, imgDims, gaussKernelDims);
    cudaPeekAtLastError();
    cudaDeviceSynchronize();
    //if i try to access out_image here program will crush
}

我试图将模糊结果保存到其他变量中（其他指针以与图像相同的方式作为参数传递），但是存在相同的问题，因此我怀疑这将是内存争用，并且因为它只有3x3x3，所以我不认为是否在获取值之前或之后在另一个线程之后这样做将是有意义的。当进行测试时，我将该代码复制到cpu版本，并且可以正常工作：

__host__ void gaussBlurCPU(float *img, float *gaussKernel, int imgSize, int kernelSize, int *imgDims, int *gaussKernelDims) {

    float newVoxelValue = 0;
    int temp_idx = 0;
    int kernel_x, kernel_y, kernel_z, temp_i, indeksWartosciDoPobrania = 0;
    int img_z, img_y, img_x;
    for (int idx = 0; idx < imgSize; idx++) {
        if (idx < imgSize) {
            img_z = idx / (imgDims[0] * imgDims[1]);
            temp_idx = idx - (img_z * imgDims[0] * imgDims[1]);
            img_y = temp_idx / imgDims[0];
            img_x = temp_idx % imgDims[0];
            if (img_x > 1 && img_x < imgDims[0] - 2 &&
                img_y > 1 && img_y < imgDims[1] - 2 &&
                img_z > 1 && img_z < imgDims[2] - 2) {
                newVoxelValue = 0;
                for (int i = 0; i < kernelSize; i++) {
                    kernel_z = i / (gaussKernelDims[0] * gaussKernelDims[1]);
                    temp_i = i - (kernel_z * gaussKernelDims[0] * gaussKernelDims[1]);
                    kernel_y = temp_i / gaussKernelDims[0];
                    kernel_x = temp_i % gaussKernelDims[0];

                    kernel_x--;kernel_y--;kernel_z--;
                    indeksWartosciDoPobrania = ((img_z + kernel_z) * imgDims[0] * imgDims[1]) + ((img_y + kernel_y) * imgDims[0]) + (img_x + kernel_x);
                    newVoxelValue += gaussKernel[i] * img[indeksWartosciDoPobrania];

                }
                img[idx] = newVoxelValue;
            }
        }
    }
}

图像是来自Matlab的二进制保存文件（以c ++读取，然后进行了修改和保存，可以很好地返回到matlab中），其值在0.0f：〜900.0f范围内。 GaussKernel是来自matlab的二进制保存文件（以c ++读取，然后修改并保存为完全可以在matlab中正常工作）所有值的总和等于1。

准确地说，我在此处复制/粘贴gaussBlur（），但是main是从代码中的几个地方复制的，所以我很可能错过了一些东西

感谢您提前提出所有建议，最好的祝福， Wojciech Serafin

编辑：正如Robert Crovella所说，我评论WDDM TDR超时是我的代码如此行事的原因。抱歉打扰您，在我询问之前，我可能在这方面做的研究很少。

在cuda中进行高斯模糊3d图像处理，有时有效，有时无效

0 个答案: