更新＃1

Question

我正在学习CUDA并开始实现高斯滤波器。对于初学者，我尝试实现CUDA程序，它只是复制一个输入图像。

我没有获得图像的副本，而是在较小的图像上获得“混合”像素，在较大的图像上获得灰色或空白背景。

你可以帮我找到这个bug吗？

随意建议任何其他改进。

输入→输出示例

标准Lenna（500×500）

山脉（1125×750）

源代码

#define subpixel unsigned char 

struct Dimensions {
    unsigned width;
    unsigned height;
};

struct ImageVectors {
    subpixel *red;
    subpixel *green;
    subpixel *blue;
    subpixel *alpha;
};

__global__ void CopyKernel(subpixel *device_subpixelsVector, subpixel *device_subpixelsResult) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    device_subpixelsResult[index] = device_subpixelsVector[index];
}

ImageVectors CUDAGaussBlur(ImageVectors imageVectors, Dimensions dimensions) {
    const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
    const int sizeInBytes = dimensions.width * dimensions.height * sizeof(subpixel);

    const int blockSize = 128;
    const int gridSize = ceil(totalNumberOfSubpixels / blockSize);
    const dim3 dimBlock(blockSize);
    const dim3 dimGrid(gridSize);

    ImageVectors transformedImage;
    transformedImage.red = new subpixel[totalNumberOfSubpixels];
    transformedImage.green = new subpixel[totalNumberOfSubpixels];
    transformedImage.blue = new subpixel[totalNumberOfSubpixels];
    transformedImage.alpha = new subpixel[totalNumberOfSubpixels];

    subpixel *device_redVector;
    subpixel *device_greenVector;
    subpixel *device_blueVector;
    subpixel *device_alphaVector;

    subpixel *device_redResultVector;
    subpixel *device_greenResultVector;
    subpixel *device_blueResultVector;
    subpixel *device_alphaResultVector;

    cudaMalloc(&device_redVector, sizeInBytes);
    cudaMalloc(&device_greenVector, sizeInBytes);
    cudaMalloc(&device_blueVector, sizeInBytes);
    cudaMalloc(&device_alphaVector, sizeInBytes);

    cudaMalloc(&device_redResultVector, sizeInBytes);
    cudaMalloc(&device_greenResultVector, sizeInBytes);
    cudaMalloc(&device_blueResultVector, sizeInBytes);
    cudaMalloc(&device_alphaResultVector, sizeInBytes);

    cudaMemcpy(device_redVector, imageVectors.red, sizeInBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(device_greenVector, imageVectors.green, sizeInBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(device_blueVector, imageVectors.blue, sizeInBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(device_alphaVector, imageVectors.alpha, sizeInBytes, cudaMemcpyHostToDevice);

    // Eventually CopyKernel will be replaced this with Gauss filter kernel.
    CopyKernel<<<dimGrid, dimBlock>>>(device_redVector, device_redResultVector);
    CopyKernel<<<dimGrid, dimBlock>>>(device_greenVector, device_greenResultVector);
    CopyKernel<<<dimGrid, dimBlock>>>(device_blueVector, device_blueResultVector);
    CopyKernel<<<dimGrid, dimBlock>>>(device_alphaVector, device_alphaResultVector);

    cudaMemcpy(transformedImage.red, device_redResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(transformedImage.green, device_greenResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(transformedImage.blue, device_blueResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(transformedImage.alpha, device_alphaResultVector, sizeInBytes, cudaMemcpyDeviceToHost);

    cudaFree(device_redVector);
    cudaFree(device_greenVector);
    cudaFree(device_blueVector);
    cudaFree(device_alphaVector);

    cudaFree(device_redResultVector);
    cudaFree(device_greenResultVector);
    cudaFree(device_blueResultVector);
    cudaFree(device_alphaResultVector);

    return transformedImage;
}

每个图像矢量（即1D阵列）都填充有来自RGBA通道的单通道值。我省略了读取，转换和写入图像文件的部分。

图像输入/输出

我不怀疑在这里找到一个错误。但我会完全放弃它，因为我几乎不会百分百肯定。

enum Channel
{
    R = 0,
    G = 1,
    B = 2,
    A = 3
};

subpixel* extractChannelToVector(std::vector<subpixel> rgbaImage, Dimensions dimensions, Channel selectedChannel) {
    std::vector<subpixel> vectorBuffer;

    for (int row = 0; row < dimensions.height; row++) {
        for (int column = 0; column < dimensions.width; column++) {
            vectorBuffer.push_back(rgbaImage[4 * dimensions.width * row + 4 * column + selectedChannel]);
        }
    }

    const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
    subpixel *subpixelsVector = new subpixel[totalNumberOfSubpixels];
    for (int index = 0; index++ < vectorBuffer.size(); index++)
        subpixelsVector[index] = vectorBuffer[index];
    return subpixelsVector;
}

std::vector<subpixel> vectorsToChannels(Dimensions dimensions, subpixel *redVector, subpixel *greenVector, subpixel *blueVector, subpixel *alphaVector) {
    const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
    std::vector<subpixel> rgbaImage;
    for (int index = 0; index < totalNumberOfSubpixels; index++) {
        rgbaImage.push_back(redVector[index + Channel::R]);
        rgbaImage.push_back(greenVector[index + Channel::G]);
        rgbaImage.push_back(blueVector[index + Channel::B]);
        rgbaImage.push_back(alphaVector[index + Channel::A]);
    }
    return rgbaImage;
}

std::vector<subpixel> vectorsToChannels(Dimensions dimensions, ImageVectors imageVectors) {
    return vectorsToChannels(dimensions, imageVectors.red, imageVectors.green, imageVectors.blue, imageVectors.alpha);
}

int main() {
    const char* filename = R"(lenna.png)";
    cout << filename << endl;

    std::vector<subpixel> png;
    std::vector<subpixel> rgbaImage;
    Dimensions dimensions;

    lodepng::load_file(png, filename);
    lodepng::decode(rgbaImage, dimensions.width, dimensions.height, png);

    cout << "sizeof(image): " << rgbaImage.size() << endl
        << "width: " << dimensions.width << endl
        << "height: " << dimensions.height << endl;

    ImageVectors imageVectors;
    imageVectors.red = extractChannelToVector(rgbaImage, dimensions, Channel::R);
    imageVectors.green = extractChannelToVector(rgbaImage, dimensions, Channel::G);
    imageVectors.blue = extractChannelToVector(rgbaImage, dimensions, Channel::B);
    imageVectors.alpha = extractChannelToVector(rgbaImage, dimensions, Channel::A);

    std::vector<subpixel> transformedImage = vectorsToChannels(dimensions, CUDAGaussBlur(imageVectors, dimensions));

    lodepng::encode("lenna-result.png", transformedImage, dimensions.width, dimensions.height);

    return 0;
}

我正在使用“lodepng”来读取和写入PNG文件。当我使用CPU进行高斯滤波时，我已经在这个程序中成功使用了它。有关lodepng的更多信息，请访问：lodev.org/lodepng/，github.com/lvandeve/lodepng。

更新＃1

根据@jwdmsd的建议，我有短路内核。更准确地说，我只是从主机（CPU）设备（GPU）复制图像数据，然后从设备复制到主机而不使用内核。

生成的图像为灰色，颜色为#cdcdcdcd。有趣的是，Microsoft编译器使用0xCD来填充调试模式下的内存块。根据{{3}} 0xCD称为Clean Memory，代表Allocated memory via malloc or new but never written by the application. 好像我有一些内存/指针问题。问题出在哪里？

Lenna的结果

SO::When and why will an OS initialise memory to 0xCD, 0xDD, etc. on malloc/free/new/delete?

Answer 1

我认为您的问题不在于CUDA。

更改

for (int index = 0; index++ < vectorBuffer.size(); index++)
                    ^^^^^^^
    subpixelsVector[index] = vectorBuffer[index];

到

for (int index = 0; index < vectorBuffer.size(); index++)
                    ^^^^^
    subpixelsVector[index] = vectorBuffer[index];

并且还要改变

for (int index = 0; index < totalNumberOfSubpixels; index++) {
    rgbaImage.push_back(redVector[index + Channel::R]);
    rgbaImage.push_back(greenVector[index + Channel::G]);
    rgbaImage.push_back(blueVector[index + Channel::B]);
    rgbaImage.push_back(alphaVector[index + Channel::A]);
}

到

for (int index = 0; index < totalNumberOfSubpixels; index++) {
    rgbaImage.push_back(redVector[index]);
    rgbaImage.push_back(greenVector[index]);
    rgbaImage.push_back(blueVector[index]);
    rgbaImage.push_back(alphaVector[index]);
}

+）您最好按如下方式修改内核：

__global__ void CopyKernel(subpixel *device_subpixelsVector, subpixel *device_subpixelsResult, int totalNumberOfSubpixels) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < totalNumberOfSubpixels)
        device_subpixelsResult[index] = device_subpixelsVector[index];
}

另外，请阅读this，尤其是“如何获得有关Stack Overflow的CUDA问题的有用答案”。它应该可以帮助你从这里得到更好的答案。

CUDA输出中的混合像素

输入→输出示例

源代码

图像输入/输出

更新＃1

1 个答案: