我有以下内核
__global__ void filter(unsigned char *image, unsigned char *out, int n, int m)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int offset = x + y * blockDim.x * gridDim.x;
int sumx, sumy, sumz, k, l;
__shared__ float shared[16][16];
shared[threadIdx.x][threadIdx.y] = image[offset];
out[offset] = shared[threadIdx.x][threadIdx.y];
}
我称之为filter<<<dimGrid, dimBlock>>>(dev_image, dev_out, n, m);
。
奇怪的是,即使我对内核的调用进行注释并进行编译,图像也保持不变。知道为什么会这样吗?是不是释放了gpu上的内存?
void Draw()
{
unsigned char *image, *out;
int n, m;
unsigned char *dev_image, *dev_out;
image = readppm("maskros512.ppm", &n, &m);
out = (unsigned char*) malloc(n*m*3);
printf("%d %d\n",n,m );
cudaMalloc( (void**)&dev_image, n*m*3);
cudaMalloc( (void**)&dev_out, n*m*3);
cudaMemcpy( dev_image, image, n*m*3, cudaMemcpyHostToDevice);
dim3 threads( 1, 256 );
dim3 blocks( 32, 32 );
filter<<<blocks, threads>>>(dev_image, dev_out, n, m);
cudaMemcpy( out, dev_out, n*m*3, cudaMemcpyDeviceToHost );
cudaFree(dev_image);
cudaFree(dev_out);
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
glRasterPos2f(-1, -1);
glDrawPixels( n, m, GL_RGB, GL_UNSIGNED_BYTE, image );
glRasterPos2i(0, -1);
glDrawPixels( n, m, GL_RGB, GL_UNSIGNED_BYTE, out );
glFlush();
}
答案 0 :(得分:1)
如果您只是注释掉filter
行,则无法填充dev_out
。因此,如果您将dev_out
复制到out
,那么您将获得垃圾,这可能是最后一次dev_out
中的任何内容。
这些行不对:
dim3 threads( 1, 256 );
dim3 blocks( 32, 32 );
您正在启动线程块,该线程块是x中的1个线程,y中包含256个线程。这对你的内核没有意义。你的内核需要每个像素启动一个线程,并且它期望x和y中的足够的线程数组以像素为单位覆盖图像空间。此外,您的共享内存分配期望一个16x16的线程块。试试这个:
dim3 threads(16,16);
dim3 blocks((n+threads.x-1)/threads.x, (m+threads.y-1)/threads.y);
此外,您的图像似乎由3字节像素组成。但是你每个像素只启动一个线程。因此,您需要复制每个像素3个字节,而不是一个。像这样:
#define RED 0
#define GRN 1
#define BLU 2
__global__ void filter(unsigned char *image, unsigned char *out, int n, int m)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int offset = x + y * blockDim.x * gridDim.x;
// the above numbers are all pixel dimensions. To convert to byte dimensions,
// we must multiply by 3
int sumx, sumy, sumz, k, l;
__shared__ unsigned char shared[16][16*3];
shared[threadIdx.x][(threadIdx.y*3)+RED] = image[(offset*3)+RED]; // pick up red
shared[threadIdx.x][(threadIdx.y*3)+GRN] = image[(offset*3)+GRN]; // pick up green
shared[threadIdx.x][(threadIdx.y*3)+BLU] = image[(offset*3)+BLU]; // pick up blue
out[(offset*3)+RED] = shared[threadIdx.x][(threadIdx.y*3)+RED];
out[(offset*3)+GRN] = shared[threadIdx.x][(threadIdx.y*3)+GRN];
out[(offset*3)+BLU] = shared[threadIdx.x][(threadIdx.y*3)+BLU];
}
最后你应该做正确的cuda error checking