Question

今天我尝试编写一个使用GPU复制图像的程序，并为此写了一个简单的程序。我使用lodepng来加载图像。问题不在于cudaMemcpy的复制：当我把图像复制到GPU再原样复制回来时，图像保持不变；但当我尝试在内核中复制它时，得到的图像却不完整。如果您对我的问题有任何疑问，请随时提出。

代码：

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <Windows.h>
#include <math.h>
#include <LodePNG\lodepng.h>

const int BLOCK_WIDTH = 32;


using namespace std;

// Byte-copy kernel: every thread copies exactly one byte from `in` to `out`.
//
// The flat byte index is built from the thread's 2D global coordinate, using
// the total number of threads launched along x (gridDim.x * blockDim.x) as the
// row stride. There is no bounds check, so both buffers must hold at least
// (threads launched in x) * (threads launched in y) bytes.
__global__ void expousure(unsigned char *in, unsigned char *out)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Row stride equals the number of threads launched along x.
    const int rowStride = gridDim.x * blockDim.x;
    const int byteIdx = row * rowStride + col;

    out[byteIdx] = in[byteIdx];
}

// Decodes the PNG file `filename` into `image` via lodepng::decode and reports
// the decoded dimensions through `width` and `height`.
//
// A non-zero lodepng error code is printed to std::cout together with its
// text; the function still returns normally in that case.
void decode(std::vector<unsigned char>& image, const char* filename, int& width, int& height)
{
    unsigned decodedW, decodedH;
    const unsigned status = lodepng::decode(image, decodedW, decodedH, filename);

    // Report a decoder failure, if any.
    if (status != 0) {
        std::cout << "decoder error " << status << ": " << lodepng_error_text(status) << std::endl;
    }

    // Hand the dimensions back to the caller as signed ints.
    width = static_cast<int>(decodedW);
    height = static_cast<int>(decodedH);

    // The pixels are now in `image`, 4 bytes per pixel, ordered RGBARGBA...
}

// Encodes the pixel vector `inPixels` (width x height) with lodepng::encode
// and writes the encoded bytes to `filename` with lodepng::save_file.
//
// If encoding fails, the lodepng error code and text are printed to std::cout
// and nothing is written.
void encodeAndSave(const std::vector<unsigned char>& inPixels, const char* filename, int width, int height)
{
    std::vector<unsigned char> pngBytes;
    const unsigned status = lodepng::encode(pngBytes, inPixels,
                                            static_cast<unsigned>(width),
                                            static_cast<unsigned>(height));

    // Success path: persist the encoded data and finish.
    if (status == 0) {
        lodepng::save_file(pngBytes, filename);
        return;
    }

    std::cout << "encoder error" << status << ": " << lodepng_error_text(status) << std::endl;
}

// Raw-pointer overload: encodes the pixel buffer `inPixels` (width x height)
// with lodepng::encode and writes the encoded bytes to `filename` with
// lodepng::save_file.
//
// If encoding fails, the lodepng error code and text are printed to std::cout
// and nothing is written.
void encodeAndSave(unsigned char* inPixels, const char* filename, int width, int height)
{
    std::vector<unsigned char> pngBytes;
    const unsigned status = lodepng::encode(pngBytes, inPixels,
                                            static_cast<unsigned>(width),
                                            static_cast<unsigned>(height));

    // Success path: persist the encoded data and finish.
    if (status == 0) {
        lodepng::save_file(pngBytes, filename);
        return;
    }

    std::cout << "encoder error" << status << ": " << lodepng_error_text(status) << std::endl;
}


// Program entry point: decodes a PNG, passes its bytes through the GPU with
// the `expousure` kernel and writes the result to a second PNG.
//
// argv[1], when given, overrides the default input path.
// Returns 0 on success, 1 when no image data could be decoded or when a CUDA
// call fails.
int main(int argc, char *argv[])
{

    // decode the image from filename into h_image
    int width = 0, height = 0;
    const char* filename = argc > 1 ? argv[1] : "C:/Users/Russell/Documents/Visual Studio 2013/Projects/Hello CUDA/Release/test.png";
    vector <unsigned char> h_image;
    decode(h_image, filename, width, height);

    // decode() only prints on failure, so make sure there really is pixel data
    // before taking &h_image[0].
    if (width <= 0 || height <= 0) {
        std::cout << "no image data decoded from " << filename << std::endl;
        return 1;
    }

    const size_t numBytes = sizeof(unsigned char) * width * height * 4;
    if (h_image.size() < numBytes) {
        std::cout << "no image data decoded from " << filename << std::endl;
        return 1;
    }

    unsigned char *d_in = 0;
    unsigned char *d_out = 0;

    // Host output buffer sized from the decoded image. A fixed-size stack
    // array would overflow for anything larger than its compile-time size.
    vector <unsigned char> h_out(numBytes);

    // Each step only runs if every previous CUDA call succeeded.
    cudaError_t status = cudaMalloc(&d_in, numBytes);
    if (status == cudaSuccess)
        status = cudaMalloc(&d_out, numBytes);
    if (status == cudaSuccess)
        status = cudaMemcpy(d_in, &h_image[0], numBytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) {
        // NOTE: this launches one thread per pixel (when the dimensions are
        // multiples of BLOCK_WIDTH), while the buffers hold 4 bytes per pixel.
        expousure<<<dim3(width / BLOCK_WIDTH, height / BLOCK_WIDTH, 1), dim3(BLOCK_WIDTH, BLOCK_WIDTH, 1) >>>(d_in, d_out);
        // Kernel launches do not return a status; fetch launch errors explicitly.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
        status = cudaMemcpy(&h_out[0], d_out, numBytes, cudaMemcpyDeviceToHost);

    // Release device memory on every path (cudaFree accepts a null pointer).
    cudaFree(d_in);
    cudaFree(d_out);

    if (status != cudaSuccess) {
        std::cout << "CUDA error: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    // encode and save image from h_out to outname
    const char* outname = "C:/Users/Russell/Documents/Visual Studio 2013/Projects/Hello CUDA/Release/testOUT.png";
    encodeAndSave(&h_out[0], outname, width, height);

    return 0;
}

输入图片：http://i.stack.imgur.com/Rx0mF.png

输出图像：http://i.stack.imgur.com/HLmPQ.png

Answer 1

我想指出一些事情：

你遇到的主要问题是：你的线程网格的大小是按每个像素1个线程来设定的，但每个像素由4个字节组成，而你的内核每个线程只复制一个字节，所以你只复制了图像的1/4。简单的修复方法是在x维度上启动4倍的线程，因为每个像素占4个字节。
每当CUDA代码出现问题时，进行正确的CUDA错误检查（proper CUDA error checking）都是个好主意，虽然我认为它在这里不会报告任何问题。作为快速检查，您还可以使用cuda-memcheck运行CUDA代码。
如果您提供一个合适的MCVE（最小、完整、可验证的示例），情况会更好，也就是一份不依赖lodepng等外部库的完整代码。
您编写的代码（无论是否应用上述修复）都要求图像尺寸可被BLOCK_WIDTH整除。最好编写没有这种限制的代码：修改时要确保在内核启动时在两个维度上都启动足够（或多于所需）的线程，然后在内核中加入“线程检查”，确保只有有效的线程才执行工作（在这种情况下是复制）。

下面是一个完整的示例，它不依赖lodepng，并演示了对上述第1、3、4项的相应修复。

#include <iostream>
#include <vector>

const int BLOCK_WIDTH = 32;

// Edge length, in pixels, of the square synthetic test image.
#define DUMMY_SIZE 256

// Stand-in for a PNG decoder: appends 4*DUMMY_SIZE*DUMMY_SIZE bytes to `image`.
// The values cycle 0..7, restarting every DUMMY_SIZE bytes.
// Sets both `widthU` and `heightU` to DUMMY_SIZE. `filename` is not used.
// Always returns 0.
unsigned create_dummy_image(std::vector<unsigned char>& image, unsigned & widthU, unsigned &heightU, const char* filename){

  widthU = DUMMY_SIZE;
  heightU = DUMMY_SIZE;

  // 4*DUMMY_SIZE runs of DUMMY_SIZE bytes each.
  int runsLeft = 4*DUMMY_SIZE;
  while (runsLeft-- > 0) {
    for (int v = 0; v != DUMMY_SIZE; ++v)
      image.push_back(v % 8);
  }
  return 0;
}

// Stand-in for a PNG encoder: appends the 4*width*height bytes of `inPixels`
// to `outEncoded`, row by row and in order. Always returns 0.
unsigned dummy_encode(std::vector<unsigned char> &outEncoded, unsigned char *inPixels, unsigned width, unsigned height){

  const unsigned rowBytes = 4*width;
  for (unsigned row = 0; row != height; ++row) {
    // Copy one full row of 4*width bytes at a time.
    const unsigned char *rowStart = inPixels + row*rowBytes;
    outEncoded.insert(outEncoded.end(), rowStart, rowStart + rowBytes);
  }
  return 0;
}

// Stand-in for saving a file: checks that byte i of `outEncoded` equals i%8.
// On the first mismatch it prints a diagnostic and terminates the process
// with exit status 1. `filename` is not used.
void dummy_save(std::vector<unsigned char> &outEncoded, const char * filename){

  int pos = 0;
  for (std::vector<unsigned char>::const_iterator it = outEncoded.begin(); it != outEncoded.end(); ++it, ++pos) {
    const int expected = pos % 8;
    if (*it == expected)
      continue;
    printf("mismatch at %d, was %d, should be %d\n", pos, *it, expected);
    exit(1);
  }
}

using namespace std;

// Bounds-checked byte-copy kernel: one thread per byte of a `width` x `height`
// image stored with 4 bytes per pixel (row length = 4*width bytes).
//
// Expects a 2D launch whose x extent covers at least 4*width threads and whose
// y extent covers at least `height` threads; threads outside that rectangle
// do nothing.
__global__ void expousure(unsigned char *in, unsigned char *out, const int width, const int height)
{
    const int rowBytes = 4 * width;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Surplus threads past the right or bottom edge have nothing to copy.
    if (col >= rowBytes || row >= height)
        return;

    const int byteIdx = row * rowBytes + col;
    out[byteIdx] = in[byteIdx];
}


// Fills `image` through create_dummy_image and reports the resulting
// dimensions through `width` and `height`.
//
// A non-zero status from create_dummy_image is printed to std::cout; the
// function still returns normally in that case.
void decode(std::vector<unsigned char>& image, const char* filename, int& width, int& height)
{
    unsigned decodedW, decodedH;
    const unsigned status = create_dummy_image(image, decodedW, decodedH, filename);

    // Report a decoder failure, if any.
    if (status != 0) {
        std::cout << "decoder error " << status << ": " << status  << std::endl;
    }

    // Hand the dimensions back to the caller as signed ints.
    width = static_cast<int>(decodedW);
    height = static_cast<int>(decodedH);

    // The pixels are now in `image`, 4 bytes per pixel.
}

// Runs the pixel buffer `inPixels` (width x height) through dummy_encode and
// hands the encoded bytes to dummy_save together with `filename`.
//
// If dummy_encode reports a non-zero status, it is printed to std::cout and
// dummy_save is not called.
void encodeAndSave(unsigned char* inPixels, const char* filename, int width, int height)
{
    std::vector<unsigned char> encodedBytes;
    const unsigned status = dummy_encode(encodedBytes, inPixels,
                                         static_cast<unsigned>(width),
                                         static_cast<unsigned>(height));

    // Success path: pass the encoded data on and finish.
    if (status == 0) {
        dummy_save(encodedBytes, filename);
        return;
    }

    std::cout << "encoder error" << status << ": " << status  << std::endl;
}


// Program entry point: builds the dummy image, copies it through the GPU with
// the `expousure` kernel and verifies the result via encodeAndSave.
//
// argv[1], when given, replaces the default input path that is passed to decode().
// Prints "Success!" and returns 0 when the pipeline completes; returns 1 when
// no image data is available or a CUDA call fails.
int main(int argc, char *argv[])
{

    // decode the image from filename into h_image
    int width = 0, height = 0;
    const char* filename = argc > 1 ? argv[1] : "C:/Users/Russell/Documents/Visual Studio 2013/Projects/Hello CUDA/Release/test.png";
    std::vector<unsigned char> h_image;
    decode(h_image, filename, width, height);

    // Make sure there really is pixel data before taking &h_image[0].
    if (width <= 0 || height <= 0) {
        std::cout << "no image data available" << std::endl;
        return 1;
    }

    const size_t numBytes = sizeof(unsigned char) * width * height * 4;
    if (h_image.size() < numBytes) {
        std::cout << "no image data available" << std::endl;
        return 1;
    }

    unsigned char *d_in = 0;
    unsigned char *d_out = 0;

    // Host output buffer sized from the decoded image rather than from a
    // compile-time constant, and kept off the stack.
    std::vector<unsigned char> h_out(numBytes);

    // One thread per byte: 4*width threads in x, height threads in y, rounded
    // up to whole blocks. The kernel's bounds check discards the surplus.
    const dim3 block(BLOCK_WIDTH, BLOCK_WIDTH, 1);
    const dim3 grid((4*width + BLOCK_WIDTH - 1) / BLOCK_WIDTH,
                    (height + BLOCK_WIDTH - 1) / BLOCK_WIDTH,
                    1);

    // Each step only runs if every previous CUDA call succeeded.
    cudaError_t status = cudaMalloc(&d_in, numBytes);
    if (status == cudaSuccess)
        status = cudaMalloc(&d_out, numBytes);
    if (status == cudaSuccess)
        status = cudaMemcpy(d_in, &h_image[0], numBytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) {
        expousure<<<grid, block>>>(d_in, d_out, width, height);
        // Kernel launches do not return a status; fetch launch errors explicitly.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
        status = cudaMemcpy(&h_out[0], d_out, numBytes, cudaMemcpyDeviceToHost);

    // Release device memory on every path (cudaFree accepts a null pointer).
    cudaFree(d_in);
    cudaFree(d_out);

    if (status != cudaSuccess) {
        std::cout << "CUDA error: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    // encode and save image from h_out to outname
    const char* outname = "C:/Users/Russell/Documents/Visual Studio 2013/Projects/Hello CUDA/Release/testOUT.png";
    encodeAndSave(&h_out[0], outname, width, height);
    std::cout << "Success!" << std::endl;

    return 0;
}

CUDA复制图像只复制部分图像

1 个答案: