我正在尝试存储图像,这是由CUDA-OpenGL互操作示例在' CUDA-By Example'中生成的。教科书,进入可以存储到图像的内存缓冲区。
我想存储两张图片,一张是绿色的" X"另一个是内存缓冲区中的橙色" X"当我使用OpenGL渲染pBuffer时,我应该得到一个绿色的" X"然而,像示例输出的图像,我只是得到一个黑屏。我不确定为什么我没有得到正确的输出。有人可以告诉我有什么问题吗?
我从A Memory buffer for multiple images
获取了内存缓冲区的代码 #include "book.h"
#include "cpu_bitmap.h"
#include "cuda.h"
#include <cuda_gl_interop.h>
PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL;
#define DIM 512
#define IMAGESIZE_MAX (DIM*DIM) // MY CHANGE
GLuint bufferObj;
cudaGraphicsResource *resource;
// based on ripple code, but uses uchar4 which is the type of data
// graphic inter op uses. see screenshot - basic2.png
__global__ void kernel( uchar4 *ptr1)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x ;
// now calculate the value at that position
float fx = x/(float)DIM - 0.5f;
float fy = y/(float)DIM - 0.5f;
unsigned char green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );
// accessing uchar4 vs unsigned char*
ptr1[offset].x = 0;
ptr1[offset].y = green;
ptr1[offset].z = 0;
ptr1[offset].w = 255;
}
// MY CODE
__global__ void kernel2( uchar4 *ptr2)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x ;
// now calculate the value at that position
float fx = x/(float)DIM - 0.5f;
float fy = y/(float)DIM - 0.5f;
unsigned char green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );
// accessing uchar4 vs unsigned char*
ptr2[offset].x = 1000;
ptr2[offset].y = green;
ptr2[offset].z = 0;
ptr2[offset].w = 255;
}
__global__ void copy ( uchar4 *pBuffer, uchar4 *Ptr, uchar4 *Ptr2, size_t size, int a )
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int idx = x + y * blockDim.x * gridDim.x ;
int bdx = idx;
if (a==1)
{
while ( idx < DIM*DIM)
{
pBuffer[idx] = Ptr[idx] ;
__syncthreads();
if (idx==DIM*DIM)
{
break;
}
}
}
if (a==2)
{
while ( (idx < DIM*DIM) && (bdx < DIM*DIM) )
{
uchar4 temp = Ptr2[bdx];
__syncthreads();
pBuffer[idx+4] = temp;
__syncthreads();
if ((idx==DIM*DIM) && (bdx==DIM*DIM))
{
break;
}
}
}
}
void key_func( unsigned char key, int x, int y )
{
switch (key)
{
case 27:
// clean up OpenGL and CUDA
( cudaGraphicsUnregisterResource( resource ) );
glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
glDeleteBuffers( 1, &bufferObj );
exit(0);
}
}
void draw_func( void ) {
// we pass zero as the last parameter, because out bufferObj is now
// the source, and the field switches from being a pointer to a
// bitmap to now mean an offset into a bitmap object
glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
glutSwapBuffers();
}
int main( int argc, char **argv )
{
cudaDeviceProp prop;
int dev;
memset( &prop, 0, sizeof( cudaDeviceProp ) );
prop.major = 1;
prop.minor = 0;
( cudaChooseDevice( &dev, &prop ) );
// tell CUDA which dev we will be using for graphic interop
// from the programming guide: Interoperability with OpenGL
// requires that the CUDA device be specified by
// cudaGLSetGLDevice() before any other runtime calls.
( cudaGLSetGLDevice( dev ) );
// these GLUT calls need to be made before the other OpenGL
// calls, else we get a seg fault
glutInit( &argc, argv );
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize( DIM, DIM );
glutCreateWindow( "bitmap" );
glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");
// the first three are standard OpenGL, the 4th is the CUDA reg
// of the bitmap these calls exist starting in OpenGL 1.5
glGenBuffers( 1, &bufferObj );
glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4 ,NULL, GL_DYNAMIC_DRAW_ARB );
// REGISTER THE GL BufferObj and CUDA Resource
( cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone ) );
// do work with the memory dst being on the GPU, gotten via mapping
HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );
// MY MODIFIED CODE
uchar4 *devPtr;
size_t size;
size_t sizeTotal = 0;
cudaMalloc ( (uchar4 **)&devPtr, size);
uchar4 *devPtr2;
cudaMalloc ( (uchar4 **)&devPtr2, size);
uchar4 *pBuffer;
(cudaMalloc ( (uchar4 **)&pBuffer, size));
uchar4 *pBufferCurrent;
(cudaMalloc ( (uchar4 **)&pBufferCurrent, size));
uchar4 *pBufferImage;
(cudaMalloc ( (uchar4 **)&pBufferImage, size));
// REGISTER THE C BUFFER and CUDA Resource
HANDLE_ERROR(
cudaGraphicsResourceGetMappedPointer( (void**)&pBufferImage,
&size,
resource) );
dim3 grids(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<grids,threads>>>(devPtr);
kernel2<<<grids,threads>>>(devPtr2);
int a = 1;
do
{
if (a==1)
{
copy<<< grids, threads>>>(pBufferImage, devPtr, devPtr2, size, a);
}
if(a==2)
{
copy<<< grids, threads>>>(pBufferImage, devPtr, devPtr2, size, a);
}
a++;
} while (a<=2);
cudaGraphicsUnmapResources( 1, &resource, NULL ) );
// set up GLUT and kick off main loop
glutKeyboardFunc( key_func );
glutDisplayFunc( draw_func );
glutMainLoop();
}
答案 0 :(得分:2)
首先对所有cuda API调用(例如cudaMemcpy等)和内核调用执行适当的cuda error checking。
当你这样做时,你会发现你的内核没有成功运行。这些类型的东西不起作用:
uchar4 *devPtr; // you've just created an unallocated NULL host pointer
size_t img1_size = IMAGESIZE_MAX;
kernel<<<grids,threads>>>(devPtr); // this kernel will fail
uchar4 *devPtr2; // you've just created an unallocated NULL host pointer
size_t img2_size = IMAGESIZE_MAX;
kernel2<<<grids,threads>>>(devPtr2); // this kernel will fail
上面代码中的 devPtr
和devPtr2
是NULL指针。您尚未分配与其关联的任何存储。此外,由于您将它们传递给设备内核,因此需要使用cudaMalloc
或类似的API函数进行分配,以使指针在设备代码中可用。
由于它们未分配cudaMalloc
,因此只要您尝试取消引用设备代码中的指针,就会产生内核错误。如果您进行错误检查,这将是显而易见的,因为您将有“未指定的启动失败”或来自这些内核的类似报告。
我认为您的代码中可能存在许多其他问题,但首先应该进行适当的cuda错误检查,并至少让代码达到您编写的所有内容实际上正在运行的程度。
你发布的代码实际上并没有编译。
修复编译错误后,我还发现你有另一个无限循环:
cudaMalloc ( (uchar4 **)&pBufferCurrent, sizeTotal + sizeof(size) + size);
cudaMalloc ( (uchar4 **)&pBuffer, sizeTotal + sizeof(size) + size);
do
{
if (!pBufferCurrent)
{
break;
}
pBuffer = pBufferCurrent;
pBufferCurrent += sizeTotal;
imageget ( pBufferCurrent + sizeof(size), size, devPtr);
sizeTotal += (sizeof(size) + size);
} while (a==1);
由于a
在循环中初始化为1,并且循环中没有任何内容修改a
,因此循环将永远不会基于while
条件退出。由于pBufferCurrent在cudaMalloc
正确设置后也永远不会为零,因此永远不会采用break
。
如果你malloc
或cudaMalloc
指的是pBufferCurrent
,我很难想象在什么情况下这会有意义:
pBufferCurrent += sizeTotal;
虽然这是合法的,但我看不出这有多大意义:
pBuffer = pBufferCurrent;
您刚刚使用pBuffer
为cudaMalloc
创建了分配,但您要做的第一件事就是扔掉它?