我正在学习CUDA并开始实现高斯滤波器。对于初学者,我尝试实现CUDA程序,它只是复制一个输入图像。
我没有获得图像的副本,而是在较小的图像上获得“混合”像素,在较大的图像上获得灰色或空白背景。
你可以帮我找到这个bug吗?随意建议任何其他改进。
标准Lenna(500×500)
山脉(1125×750)
#define subpixel unsigned char
struct Dimensions {
unsigned width;
unsigned height;
};
struct ImageVectors {
subpixel *red;
subpixel *green;
subpixel *blue;
subpixel *alpha;
};
__global__ void CopyKernel(subpixel *device_subpixelsVector, subpixel *device_subpixelsResult) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
device_subpixelsResult[index] = device_subpixelsVector[index];
}
ImageVectors CUDAGaussBlur(ImageVectors imageVectors, Dimensions dimensions) {
const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
const int sizeInBytes = dimensions.width * dimensions.height * sizeof(subpixel);
const int blockSize = 128;
const int gridSize = ceil(totalNumberOfSubpixels / blockSize);
const dim3 dimBlock(blockSize);
const dim3 dimGrid(gridSize);
ImageVectors transformedImage;
transformedImage.red = new subpixel[totalNumberOfSubpixels];
transformedImage.green = new subpixel[totalNumberOfSubpixels];
transformedImage.blue = new subpixel[totalNumberOfSubpixels];
transformedImage.alpha = new subpixel[totalNumberOfSubpixels];
subpixel *device_redVector;
subpixel *device_greenVector;
subpixel *device_blueVector;
subpixel *device_alphaVector;
subpixel *device_redResultVector;
subpixel *device_greenResultVector;
subpixel *device_blueResultVector;
subpixel *device_alphaResultVector;
cudaMalloc(&device_redVector, sizeInBytes);
cudaMalloc(&device_greenVector, sizeInBytes);
cudaMalloc(&device_blueVector, sizeInBytes);
cudaMalloc(&device_alphaVector, sizeInBytes);
cudaMalloc(&device_redResultVector, sizeInBytes);
cudaMalloc(&device_greenResultVector, sizeInBytes);
cudaMalloc(&device_blueResultVector, sizeInBytes);
cudaMalloc(&device_alphaResultVector, sizeInBytes);
cudaMemcpy(device_redVector, imageVectors.red, sizeInBytes, cudaMemcpyHostToDevice);
cudaMemcpy(device_greenVector, imageVectors.green, sizeInBytes, cudaMemcpyHostToDevice);
cudaMemcpy(device_blueVector, imageVectors.blue, sizeInBytes, cudaMemcpyHostToDevice);
cudaMemcpy(device_alphaVector, imageVectors.alpha, sizeInBytes, cudaMemcpyHostToDevice);
// Eventually CopyKernel will be replaced this with Gauss filter kernel.
CopyKernel<<<dimGrid, dimBlock>>>(device_redVector, device_redResultVector);
CopyKernel<<<dimGrid, dimBlock>>>(device_greenVector, device_greenResultVector);
CopyKernel<<<dimGrid, dimBlock>>>(device_blueVector, device_blueResultVector);
CopyKernel<<<dimGrid, dimBlock>>>(device_alphaVector, device_alphaResultVector);
cudaMemcpy(transformedImage.red, device_redResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
cudaMemcpy(transformedImage.green, device_greenResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
cudaMemcpy(transformedImage.blue, device_blueResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
cudaMemcpy(transformedImage.alpha, device_alphaResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
cudaFree(device_redVector);
cudaFree(device_greenVector);
cudaFree(device_blueVector);
cudaFree(device_alphaVector);
cudaFree(device_redResultVector);
cudaFree(device_greenResultVector);
cudaFree(device_blueResultVector);
cudaFree(device_alphaResultVector);
return transformedImage;
}
每个图像矢量(即1D阵列)都填充有来自RGBA通道的单通道值。我省略了读取,转换和写入图像文件的部分。
我不怀疑在这里找到一个错误。但我会完全放弃它,因为我几乎不会百分百肯定。
enum Channel
{
R = 0,
G = 1,
B = 2,
A = 3
};
subpixel* extractChannelToVector(std::vector<subpixel> rgbaImage, Dimensions dimensions, Channel selectedChannel) {
std::vector<subpixel> vectorBuffer;
for (int row = 0; row < dimensions.height; row++) {
for (int column = 0; column < dimensions.width; column++) {
vectorBuffer.push_back(rgbaImage[4 * dimensions.width * row + 4 * column + selectedChannel]);
}
}
const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
subpixel *subpixelsVector = new subpixel[totalNumberOfSubpixels];
for (int index = 0; index++ < vectorBuffer.size(); index++)
subpixelsVector[index] = vectorBuffer[index];
return subpixelsVector;
}
std::vector<subpixel> vectorsToChannels(Dimensions dimensions, subpixel *redVector, subpixel *greenVector, subpixel *blueVector, subpixel *alphaVector) {
const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
std::vector<subpixel> rgbaImage;
for (int index = 0; index < totalNumberOfSubpixels; index++) {
rgbaImage.push_back(redVector[index + Channel::R]);
rgbaImage.push_back(greenVector[index + Channel::G]);
rgbaImage.push_back(blueVector[index + Channel::B]);
rgbaImage.push_back(alphaVector[index + Channel::A]);
}
return rgbaImage;
}
std::vector<subpixel> vectorsToChannels(Dimensions dimensions, ImageVectors imageVectors) {
return vectorsToChannels(dimensions, imageVectors.red, imageVectors.green, imageVectors.blue, imageVectors.alpha);
}
int main() {
const char* filename = R"(lenna.png)";
cout << filename << endl;
std::vector<subpixel> png;
std::vector<subpixel> rgbaImage;
Dimensions dimensions;
lodepng::load_file(png, filename);
lodepng::decode(rgbaImage, dimensions.width, dimensions.height, png);
cout << "sizeof(image): " << rgbaImage.size() << endl
<< "width: " << dimensions.width << endl
<< "height: " << dimensions.height << endl;
ImageVectors imageVectors;
imageVectors.red = extractChannelToVector(rgbaImage, dimensions, Channel::R);
imageVectors.green = extractChannelToVector(rgbaImage, dimensions, Channel::G);
imageVectors.blue = extractChannelToVector(rgbaImage, dimensions, Channel::B);
imageVectors.alpha = extractChannelToVector(rgbaImage, dimensions, Channel::A);
std::vector<subpixel> transformedImage = vectorsToChannels(dimensions, CUDAGaussBlur(imageVectors, dimensions));
lodepng::encode("lenna-result.png", transformedImage, dimensions.width, dimensions.height);
return 0;
}
我正在使用“lodepng”来读取和写入PNG文件。当我使用CPU进行高斯滤波时,我已经在这个程序中成功使用了它。有关lodepng的更多信息,请访问:lodev.org/lodepng/,github.com/lvandeve/lodepng。
根据@jwdmsd的建议,我有短路内核。更准确地说,我只是从主机(CPU)设备(GPU)复制图像数据,然后从设备复制到主机而不使用内核。
生成的图像为灰色,颜色为#cdcdcdcd
。有趣的是,Microsoft编译器使用0xCD
来填充调试模式下的内存块。根据{{3}} 0xCD
称为Clean Memory
,代表Allocated memory via malloc or new but never written by the application.
好像我有一些内存/指针问题。问题出在哪里?
Lenna的结果
SO::When and why will an OS initialise memory to 0xCD, 0xDD, etc. on malloc/free/new/delete?
答案 0 :(得分:3)
我认为您的问题不在于CUDA。
更改
for (int index = 0; index++ < vectorBuffer.size(); index++)
^^^^^^^
subpixelsVector[index] = vectorBuffer[index];
到
for (int index = 0; index < vectorBuffer.size(); index++)
^^^^^
subpixelsVector[index] = vectorBuffer[index];
并且还要改变
for (int index = 0; index < totalNumberOfSubpixels; index++) {
rgbaImage.push_back(redVector[index + Channel::R]);
rgbaImage.push_back(greenVector[index + Channel::G]);
rgbaImage.push_back(blueVector[index + Channel::B]);
rgbaImage.push_back(alphaVector[index + Channel::A]);
}
到
for (int index = 0; index < totalNumberOfSubpixels; index++) {
rgbaImage.push_back(redVector[index]);
rgbaImage.push_back(greenVector[index]);
rgbaImage.push_back(blueVector[index]);
rgbaImage.push_back(alphaVector[index]);
}
+)您最好按如下方式修改内核:
__global__ void CopyKernel(subpixel *device_subpixelsVector, subpixel *device_subpixelsResult, int totalNumberOfSubpixels) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < totalNumberOfSubpixels)
device_subpixelsResult[index] = device_subpixelsVector[index];
}
另外,请阅读this,尤其是“如何获得有关Stack Overflow的CUDA问题的有用答案”。它应该可以帮助你从这里得到更好的答案。