
时间:2018-10-14 20:47:08

标签: c++ cuda

我正在尝试用 CUDA 重新实现一种算法,其中一个步骤是对 3D 图像(我的图像尺寸为 344 x 288 x 176)进行模糊处理,我想为此使用 3x3x3 的高斯核(掩膜)。
我的开发环境是 Visual Studio Community 2015 和 CUDA 8,计算机配置为 Intel i7-5500U、GeForce GT 840M(4GB)和 8GB 内存。我已经被这个问题困扰了好几天。

========= 程序在调用 CUDA API cudaDeviceSynchronize 时,因 "unspecified launch failure"(未指定的启动失败)而触发了 cudaErrorLaunchFailure(错误 4)。



/**
 * Convolves a flattened 3D image with a small 3D kernel and writes the result
 * back into the same buffer.
 *
 * Launch layout: 2D grid of 2D blocks. The global x index selects a row
 * (second image dimension, imgDims[1]) and the global y index selects a slice
 * (third image dimension, imgDims[2]); each thread then walks the imgDims[0]
 * voxels of that row. Threads outside that range exit immediately.
 *
 * @param img              image voxels, flattened with dimension 0 varying
 *                         fastest; modified in place
 * @param gaussKernel      kernelSize weights, flattened with
 *                         gaussKernelDims[0] varying fastest
 * @param imgSize          number of voxels in img
 * @param kernelSize       number of weights in gaussKernel
 * @param imgDims          three image extents (must be readable on the device)
 * @param gaussKernelDims  kernel extents; only [0] and [1] are read here
 *
 * Voxels whose coordinate is < 2 or >= extent - 2 along any axis are left
 * untouched. The kernel window starts at the voxel itself and extends in the
 * positive direction along each axis, so with the 2-voxel border a kernel of
 * up to 3 samples per axis stays inside the image.
 *
 * NOTE(review): the blur is done in place, so a thread may read neighbouring
 * rows/slices that another thread has already overwritten (the CPU variant has
 * the same read-after-write dependency). A deterministic result needs a
 * separate output buffer, which would change this signature.
 */
__global__ void gaussBlur(float *img,
                          float *gaussKernel,
                          int imgSize,
                          int kernelSize,
                          int *imgDims,
                          int *gaussKernelDims) {
    // Read the extents once instead of on every loop iteration.
    const int dimX = imgDims[0];
    const int dimY = imgDims[1];
    const int dimZ = imgDims[2];
    const int sliceSize = dimX * dimY;
    const int kernelDimX = gaussKernelDims[0];
    const int kernelSliceSize = kernelDimX * gaussKernelDims[1];

    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // x indexes rows and y indexes slices; drop threads that map to no row.
    if (x >= (unsigned int)dimY || y >= (unsigned int)dimZ) {
        return;
    }

    // First voxel of row x in slice y. (The slice index is the one that must
    // be multiplied by the row count.)
    const int start = ((int)y * dimY + (int)x) * dimX;
    const int stop = start + dimX;

    for (int idx = start; idx < stop; idx++) {
        if (idx >= imgSize) {
            break;
        }

        // Decode the flat index into (x, y, z) voxel coordinates.
        const int img_z = idx / sliceSize;
        const int temp_idx = idx - img_z * sliceSize;
        const int img_y = temp_idx / dimX;
        const int img_x = temp_idx % dimX;

        if (img_x > 1 && img_x < dimX - 2 &&
            img_y > 1 && img_y < dimY - 2 &&
            img_z > 1 && img_z < dimZ - 2) {
            float newVoxelValue = 0.0f;
            for (int i = 0; i < kernelSize; i++) {
                // Decode the flat kernel index into per-axis offsets.
                const int kernel_z = i / kernelSliceSize;
                const int temp_i = i - kernel_z * kernelSliceSize;
                const int kernel_y = temp_i / kernelDimX;
                const int kernel_x = temp_i % kernelDimX;

                // Flat index of the image sample paired with weight i.
                const int sourceIdx = (img_z + kernel_z) * sliceSize +
                                      (img_y + kernel_y) * dimX +
                                      (img_x + kernel_x);
                newVoxelValue += gaussKernel[i] * img[sourceIdx];
            }
            img[idx] = newVoxelValue;
        }
    }
}


/** Returns the smaller of two unsigned values; callable from host and device code. */
inline __device__ __host__ unsigned int UMIN(unsigned int a, unsigned int b) {
    return a < b ? a : b;
}

/**
 * Returns the largest power of two that divides n, i.e. the value of n's
 * lowest set bit. Returns 0 for n == 0, which has no set bit (and would
 * otherwise make the loop below run forever).
 */
inline __device__ __host__ unsigned int PowTwoDivider(unsigned int n) {
    if (n == 0) {
        return 0;
    }
    unsigned int divider = 1;
    // Shift the probe bit left until it lines up with a set bit of n.
    while ((n & divider) == 0) {
        divider <<= 1;
    }
    return divider;
}
// Reports a failed CUDA runtime call on std::cout; returns true on success.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cout << "\n\nCUDA error in " << what << ": " << cudaGetErrorString(status);
    return false;
}

/**
 * Loads a raw float volume from "image.bin" into managed memory, launches
 * gaussBlur on it and waits for the kernel to finish.
 *
 * Returns 0 on success and -1 when the file cannot be read or any CUDA call
 * fails.
 */
int main() {
    float *out_image = NULL;
    float *gaussKernel = NULL;

    int *gaussKernelDims = NULL;
    int *imgDims = NULL;

    if (!cudaOk(cudaMallocManaged((void **)&gaussKernelDims, 3 * sizeof(*gaussKernelDims)),
                "cudaMallocManaged(gaussKernelDims)") ||
        !cudaOk(cudaMallocManaged((void **)&imgDims, 3 * sizeof(*imgDims)),
                "cudaMallocManaged(imgDims)")) {
        return -1;
    }

    // Hard-coded extents as described in the question: a 3x3x3 kernel applied
    // to a 344 x 288 x 176 image.
    gaussKernelDims[0] = 3;
    gaussKernelDims[1] = 3;
    gaussKernelDims[2] = 3;
    imgDims[0] = 344;
    imgDims[1] = 288;
    imgDims[2] = 176;

    std::ifstream file("image.bin", std::ios::binary);
    if (!file.is_open()) {
        std::cout << "\n\nNie udalo sie otworzyc pliku obrazu";
        return -1;
    }

    file.seekg(0, std::ios::end);
    const int sizeOfFile = (int)file.tellg();
    if (sizeOfFile <= 0) {
        std::cout << "\n\nImage file is empty or its size could not be read";
        return -1;
    }
    char *memblock = new char[sizeOfFile];
    file.seekg(0, std::ios::beg);
    file.read(memblock, sizeOfFile);

    // Number of whole floats in the file.
    const int size = sizeOfFile / (int)sizeof(*out_image);
    if (size < imgDims[0] * imgDims[1] * imgDims[2]) {
        std::cout << "\n\nImage file holds fewer voxels than the configured dimensions";
        delete[] memblock;
        return -1;
    }
    if (!cudaOk(cudaMallocManaged((void **)&out_image, size * sizeof(*out_image)),
                "cudaMallocManaged(out_image)")) {
        delete[] memblock;
        return -1;
    }
    // Copy only whole floats so a trailing partial element cannot overrun the
    // managed buffer.
    memcpy(out_image, memblock, size * sizeof(*out_image));
    delete[] memblock;

    const int gaussKernelSize = gaussKernelDims[0] * gaussKernelDims[1] * gaussKernelDims[2];
    if (!cudaOk(cudaMallocManaged((void **)&gaussKernel, gaussKernelSize * sizeof(*gaussKernel)),
                "cudaMallocManaged(gaussKernel)")) {
        return -1;
    }
    // TODO: fill gaussKernel with the weights here. The original excerpt only
    // says the kernel "is loaded same way" as the image and omits that code.

    // Block extents are powers of two that divide the image extents, so the
    // integer divisions in the grid size below are exact.
    unsigned int dimX = UMIN(UMIN(PowTwoDivider(imgDims[0]), PowTwoDivider(imgDims[1])), 64);
    unsigned int dimY = UMIN(UMIN(PowTwoDivider(imgDims[2]), PowTwoDivider(imgDims[1])), 512 / dimX);
    dim3 dimBlock(dimX, dimY);
    dim3 dimGridX(imgDims[1] / dimBlock.x, imgDims[2] / dimBlock.y);

    gaussBlur<<< dimGridX, dimBlock >>>(out_image, gaussKernel, size, gaussKernelSize, imgDims, gaussKernelDims);

    // A launch reports configuration errors only through cudaGetLastError();
    // faults during execution surface at the next synchronizing call.
    if (!cudaOk(cudaGetLastError(), "gaussBlur launch")) {
        return -1;
    }
    // Managed memory must not be touched by the host while the kernel may
    // still be running, so wait here before reading out_image. On Windows a
    // kernel that exceeds the WDDM TDR timeout is killed by the driver and
    // shows up here as "unspecified launch failure".
    if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
        return -1;
    }

    // out_image now holds the blurred volume and is safe to read on the host.

    cudaFree(out_image);
    cudaFree(gaussKernel);
    cudaFree(imgDims);
    cudaFree(gaussKernelDims);
    return 0;
}


/**
 * Host reference implementation of gaussBlur: convolves a flattened 3D image
 * with a small 3D kernel, writing the result back into the same buffer.
 *
 * @param img              image voxels, flattened with dimension 0 varying
 *                         fastest; modified in place
 * @param gaussKernel      kernelSize weights, flattened with
 *                         gaussKernelDims[0] varying fastest
 * @param imgSize          number of voxels in img
 * @param kernelSize       number of weights in gaussKernel
 * @param imgDims          three image extents
 * @param gaussKernelDims  kernel extents; only [0] and [1] are read here
 *
 * Voxels whose coordinate is < 2 or >= extent - 2 along any axis are left
 * untouched. The kernel window starts at the voxel itself and extends in the
 * positive direction along each axis.
 *
 * NOTE(review): because the image is updated in place in ascending index
 * order, every sample is read before it is overwritten only when all kernel
 * offsets are non-negative, as they are here; a centred window would need a
 * separate output buffer.
 */
__host__ void gaussBlurCPU(float *img, float *gaussKernel, int imgSize, int kernelSize, int *imgDims, int *gaussKernelDims) {
    const int dimX = imgDims[0];
    const int dimY = imgDims[1];
    const int dimZ = imgDims[2];
    const int sliceSize = dimX * dimY;
    const int kernelDimX = gaussKernelDims[0];
    const int kernelSliceSize = kernelDimX * gaussKernelDims[1];

    // A degenerate image has no voxels to decode (and would divide by zero).
    if (sliceSize <= 0) {
        return;
    }

    for (int idx = 0; idx < imgSize; idx++) {
        // Decode the flat index into (x, y, z) voxel coordinates.
        const int img_z = idx / sliceSize;
        const int temp_idx = idx - img_z * sliceSize;
        const int img_y = temp_idx / dimX;
        const int img_x = temp_idx % dimX;

        if (img_x > 1 && img_x < dimX - 2 &&
            img_y > 1 && img_y < dimY - 2 &&
            img_z > 1 && img_z < dimZ - 2) {
            float newVoxelValue = 0.0f;
            for (int i = 0; i < kernelSize; i++) {
                // Decode the flat kernel index into per-axis offsets.
                const int kernel_z = i / kernelSliceSize;
                const int temp_i = i - kernel_z * kernelSliceSize;
                const int kernel_y = temp_i / kernelDimX;
                const int kernel_x = temp_i % kernelDimX;

                // Flat index of the image sample paired with weight i.
                const int sourceIdx = (img_z + kernel_z) * sliceSize +
                                      (img_y + kernel_y) * dimX +
                                      (img_x + kernel_x);
                newVoxelValue += gaussKernel[i] * img[sourceIdx];
            }
            img[idx] = newVoxelValue;
        }
    }
}

图像是从 Matlab 保存的二进制文件(我用 C++ 读取、修改并重新保存后,仍可以正常载入 Matlab),其取值范围约为 0.0f 到 900.0f。高斯核(gaussKernel)同样是从 Matlab 保存的二进制文件(用 C++ 读取、修改并保存后,在 Matlab 中也完全正常),其所有值的总和等于 1。


提前感谢大家的所有建议。祝好,Wojciech Serafin

编辑: 正如 Robert Crovella 在评论中指出的,WDDM TDR 超时正是我的代码出现这种行为的原因。抱歉打扰大家,我在提问之前在这方面做的调研可能太少了。

0 个答案:
