如果之前已经有人问过这个问题,我先道个歉,请把相关主题的链接发给我!
无论如何,我是CUDA新手(之前用的是OpenCL),想尝试用它生成图像。相关的CUDA代码如下:
/*
 * Fills an RGBA image with opaque green, one pixel per thread.
 *
 * Launch layout: a 2D grid of 2D blocks that together cover at least
 * width x height threads; threads that fall outside the image exit early.
 *
 * pixels - base of the image; each pixel occupies 4 consecutive bytes.
 * pitch  - distance in bytes between the starts of consecutive rows.
 * width  - image width in pixels.
 * height - image height in pixels.
 */
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
    // Global pixel coordinates. Each axis uses its own block dimension so that
    // non-square blocks are mapped correctly, and the product is widened before
    // multiplying so it cannot wrap in 32 bits.
    const unsigned long x = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long y = (unsigned long)blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so edge threads may be out of range.
    if (x >= width || y >= height)
        return;

    // Rows are pitch bytes apart, which may be more than width * 4.
    uint8_t *px = pixels + y * pitch + x * 4;
    px[0] = 0;
    px[1] = 255;
    px[2] = 0;
    px[3] = 255;
}
// Status of the most recent call checked through CUDA_ERR.
cudaError_t err = cudaSuccess;

// Evaluates a CUDA runtime call, stores its status in the global `err`, and on
// failure prints the source location plus the runtime's error string and exits.
// The do/while(0) wrapper makes the macro behave as a single statement (safe
// inside an unbraced if/else); the argument is parenthesized so any expression
// can be passed. The message is generic because the macro is used for every
// kind of call, not only allocations.
#define CUDA_ERR(e) \
    do { \
        if ((err = (e)) != cudaSuccess) { \
            fprintf(stderr, "CUDA call failed at %s:%d (error code %s)!\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(-1); \
        } \
    } while (0)
// Host driver: allocates a pitched 1000x1000 device image with 4 bytes per
// pixel, launches the mandlebrot kernel over it, copies the result into a
// tightly packed host buffer and passes that buffer to save_png.
int main(void) {
ulong2 dims = {1000, 1000};
// NOTE(review): 500 x 500 = 250000 threads per block, far above the limit of
// 1024 threads per block; the launch is rejected as an invalid configuration.
unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);
// Number of blocks per axis = ceil(dims / threads_per_block): one extra block
// is added whenever the division leaves a remainder.
dim3 remainders(dims.x % threads_per_block.x, dims.y % threads_per_block.y);
dim3 blocks(dims.x / threads_per_block.x + (remainders.x == 0 ? 0 : 1), dims.y / threads_per_block.y + (remainders.y == 0 ? 0 : 1));
size_t pitch;
uint8_t *pixels, *h_pixels = NULL;
// Requested row width is dims.x * 4 bytes; the actual row stride is returned in pitch.
CUDA_ERR(cudaMallocPitch(&pixels, &pitch, dims.x * 4 * sizeof(uint8_t), dims.y));
// NOTE(review): nothing checks the launch status (e.g. cudaGetLastError()), so
// a failed launch goes unnoticed here.
mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
h_pixels = (uint8_t *)malloc(dims.x * 4 * sizeof(uint8_t) * dims.y);
memset(h_pixels, 0, dims.x * 4 * sizeof(uint8_t) * dims.y);
// NOTE(review): the fifth argument is the transfer width in bytes; dims.x is
// the width in pixels, so only a quarter of each row is copied (dims.x * 4 is
// the byte width).
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
save_png("out.png", h_pixels, dims.x, dims.y);
CUDA_ERR(cudaFree(pixels));
free(h_pixels);
CUDA_ERR(cudaDeviceReset());
puts("Success");
return 0;
}
save_png
是我编写的一个常用工具函数,用于把一块像素数据保存为PNG文件:
void save_png(const char *filename, uint8_t *buffer, unsigned long width, unsigned long height) {
png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!png_ptr) {
std::cerr << "Failed to create png write struct" << std::endl;
return;
}
png_infop info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr) {
std::cerr << "Failed to create info_ptr" << std::endl;
png_destroy_write_struct(&png_ptr, NULL);
return;
}
FILE *fp = fopen(filename, "wb");
if (!fp) {
std::cerr << "Failed to open " << filename << " for writing" << std::endl;
png_destroy_write_struct(&png_ptr, &info_ptr);
return;
}
if (setjmp(png_jmpbuf(png_ptr))) {
png_destroy_write_struct(&png_ptr, &info_ptr);
std::cerr << "Error from libpng!" << std::endl;
return;
}
png_init_io(png_ptr, fp);
png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGBA, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_write_info(png_ptr, info_ptr);
png_byte *row_pnts[height];
size_t i;
for (i = 0; i < height; i++) {
row_pnts[i] = buffer + width * 4 * i;
}
png_write_image(png_ptr, row_pnts);
png_write_end(png_ptr, info_ptr);
png_destroy_write_struct(&png_ptr, &info_ptr);
fclose(fp);
}
无论如何,生成的图像是一条奇怪的白色条纹,其中夹杂着随机的彩色像素(见此处链接的图片)。
我是不是有什么地方明显做错了?我是按照CUDA网站上的入门文档来操作的。如果不是,有人能帮我解决这个问题吗?在这里,我只是想用绿色像素填满 pixels 缓冲区。
我使用的是配备NVIDIA GeForce GT 650M独立显卡的Retina屏MacBook Pro。如果需要,我可以运行CUDA示例代码中的 print_devices 并把输出贴上来。
编辑:请注意,使用以下makefile编译时没有任何错误或警告:
# Two-step build: compile mandlebrot.cu to an object file, then link it with -lpng.
all:
nvcc -c mandlebrot.cu -o mandlebrot.cu.o
nvcc mandlebrot.cu.o -o mandlebrot -lpng
并且在运行时没有错误。
答案 0 :(得分:1)
最好提供一份完整的代码,让别人可以直接复制、粘贴、编译并运行,而无需添加或修改任何内容。在我看来,去掉头文件包含对帮助您的人并没有好处;如果您想获得帮助,让测试代码依赖于别人可能没有安装的PNG库也不是高效的做法。
您对内核启动的错误检查有问题,可以参考“proper CUDA error checking”(正确的CUDA错误检查)。如果做了正确的错误检查,或者使用 cuda-memcheck 运行代码,您会发现内核启动时返回了错误9,即无效配置(invalid configuration)。如果打印出 blocks 和 threads_per_block 这两个变量,就会看到如下内容:
blocks: 2, 2
threads: 500, 500
也就是说,您把每个块的线程维度设置成了500×500:
unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);
这是非法的,因为您请求每个块500×500个线程(即250000个线程),超过了每个块最多1024个线程的上限(the maximum limit of 1024 threads per block)。
所以您的内核根本没有运行,您得到的只是垃圾数据。
只需修改 block_size 的定义即可修复此错误:
unsigned long block_size = 16;
这之后仍然有一个问题,因为您误解了 cudaMemcpy2D 的参数:
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
文档对第五个参数的说明是:
width - 矩阵传输的宽度(列数,以字节为单位)
但您传入的是以元素(每个元素4字节)为单位的宽度,而不是以字节为单位的宽度。
改成下面这样即可解决这个问题:
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x*4, dims.y, cudaMemcpyDeviceToHost));
做了上述修改之后,我用下面这份测试版代码得到了正确的结果:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Fills an RGBA image with opaque green, one pixel per thread.
 *
 * Launch layout: a 2D grid of 2D blocks that together cover at least
 * width x height threads; threads that fall outside the image exit early.
 *
 * pixels - base of the image; each pixel occupies 4 consecutive bytes.
 * pitch  - distance in bytes between the starts of consecutive rows.
 * width  - image width in pixels.
 * height - image height in pixels.
 */
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
    // Global pixel coordinates. Each axis uses its own block dimension so that
    // non-square blocks are mapped correctly, and the product is widened before
    // multiplying so it cannot wrap in 32 bits.
    const unsigned long x = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned long y = (unsigned long)blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so edge threads may be out of range.
    if (x >= width || y >= height)
        return;

    // Rows are pitch bytes apart, which may be more than width * 4.
    uint8_t *px = pixels + y * pitch + x * 4;
    px[0] = 0;
    px[1] = 255;
    px[2] = 0;
    px[3] = 255;
}
// Status of the most recent call checked through CUDA_ERR.
cudaError_t err = cudaSuccess;

// Evaluates a CUDA runtime call, stores its status in the global `err`, and on
// failure prints the source location plus the runtime's error string and exits.
// The do/while(0) wrapper makes the macro behave as a single statement (safe
// inside an unbraced if/else); the argument is parenthesized so any expression
// can be passed. The message is generic because the macro is used for every
// kind of call, not only allocations.
#define CUDA_ERR(e) \
    do { \
        if ((err = (e)) != cudaSuccess) { \
            fprintf(stderr, "CUDA call failed at %s:%d (error code %s)!\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(-1); \
        } \
    } while (0)
// Self-check driver: fills a pitched 1000x1000 RGBA device image with the
// mandlebrot kernel, copies it into a tightly packed host buffer and verifies
// that every pixel is (0, 255, 0, 255).
// Returns 0 and prints "Success" when all pixels match, 1 on the first mismatch
// or if the host buffer cannot be allocated; CUDA failures exit via CUDA_ERR.
int main(void) {
    // Image size in pixels; each pixel is 4 bytes (RGBA).
    const ulong2 dims = {1000, 1000};
    const size_t row_bytes = dims.x * 4 * sizeof(uint8_t);

    // 16 x 16 = 256 threads per block, within the 1024-threads-per-block limit.
    const dim3 threads_per_block(16, 16);
    // Ceiling division so partial tiles at the right/bottom edges are covered.
    const dim3 blocks((dims.x + threads_per_block.x - 1) / threads_per_block.x,
                      (dims.y + threads_per_block.y - 1) / threads_per_block.y);

    size_t pitch;
    uint8_t *pixels = NULL;
    CUDA_ERR(cudaMallocPitch(&pixels, &pitch, row_bytes, dims.y));

    printf("blocks: %u, %u\n", blocks.x, blocks.y);
    printf("threads: %u, %u\n", threads_per_block.x, threads_per_block.y);

    mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
    // A kernel launch returns no status itself: fetch launch-configuration
    // errors explicitly, then synchronize to surface execution errors.
    CUDA_ERR(cudaGetLastError());
    CUDA_ERR(cudaDeviceSynchronize());

    uint8_t *h_pixels = (uint8_t *)malloc(row_bytes * dims.y);
    if (h_pixels == NULL) {
        fprintf(stderr, "Failed to allocate host pixel buffer\n");
        CUDA_ERR(cudaFree(pixels));
        return 1;
    }
    memset(h_pixels, 0, row_bytes * dims.y);

    // The transfer width (fifth argument) is given in bytes, not pixels.
    CUDA_ERR(cudaMemcpy2D(h_pixels, row_bytes, pixels, pitch, row_bytes, dims.y, cudaMemcpyDeviceToHost));
    // save_png("out.png", h_pixels, dims.x, dims.y);

    // Compare every channel of every pixel against the expected green value.
    // The value printed on a mismatch is read from the same index that was
    // tested, and the buffers are released before returning.
    const uint8_t expected[4] = {0, 255, 0, 255};
    int status = 0;
    for (unsigned long row = 0; row < dims.y && status == 0; row++) {
        for (unsigned long col = 0; col < dims.x && status == 0; col++) {
            for (unsigned ch = 0; ch < 4; ch++) {
                const uint8_t actual = h_pixels[row * row_bytes + col * 4 + ch];
                if (actual != expected[ch]) {
                    printf("mismatch %u at %lu,%lu: was: %u should be: %u\n",
                           ch, row, col, (unsigned)actual, (unsigned)expected[ch]);
                    status = 1;
                    break;
                }
            }
        }
    }

    free(h_pixels);
    CUDA_ERR(cudaFree(pixels));
    CUDA_ERR(cudaDeviceReset());

    if (status != 0) {
        return status;
    }
    puts("Success");
    return 0;
}
请注意,上面是一份完整的代码,您可以直接复制、粘贴、编译并运行。