我正在做一个图像旋转方法。它需要两个基质和一定程度的旋转。它按原子数旋转原始矩阵并将其存储到旋转矩阵中。 我有以下“正常”代码(对于CPU - 取自本网站 - http://sinepost.wordpress.com/2012/07/24/image-rotation/)并且它正常工作;
static void RotateImage(unsigned char original[RAW_HEIGHT][RAW_WIDTH] , unsigned char rotated[RAW_HEIGHT][RAW_WIDTH] , int degrees)
{
double centerX = RAW_WIDTH/2;
double centerY = RAW_HEIGHT/2;
for(int x = 0; x< RAW_HEIGHT;x++)
{
for (int y = 0; y < RAW_WIDTH; y++)
{
double dir = calculateDirection(x-centerX,y-centerY);
double mag = calculateMagnitude(x-centerX,y-centerY);
dir-=degrees;
int origX = (int)(centerX + calculateX(dir,mag));
int origY = (int)(centerY + calculateY(dir,mag));
if (origX >= 0 && origX < RAW_HEIGHT && origY >= 0 && origY < RAW_WIDTH)
{
rotated[x][y] = original[origX][origY];
}
}
}
}
我想将此代码传输到CUDA代码。这是我的版本:
#define RAW_WIDTH 1600*3
#define RAW_HEIGHT 1200
unsigned char *dev_original_image;
unsigned char *dev_rotated_image;
__global__ void rotatePicture(unsigned char *original, unsigned char *rotated, int degrees)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int offset_rotated = x + y * blockDim.x * gridDim.x;
double centerX = 2400.0;
double centerY = 600.0;
double dir = (atan2(y-centerY,x-centerX))*180/3.14159265;
double mag = sqrt((x-centerX)*(x-centerX) + (y-centerY)*(y-centerY));
dir = dir - degrees;
int origX = (int)(centerX + cos((dir*3.14159265/180)) * mag);
int origY = (int)(centerY + sin((dir*3.14159265/180)) * mag);
int offset_original = origX + origY * blockDim.x * gridDim.x;
if(offset_original > 0 && offset_original < RAW_HEIGHT*RAW_WIDTH)
*(rotated + offset_rotated) = *(original + offset_original);
}
但它没有给我与CPU部分相同的结果。 我认为问题在于传递CUDA kerenl的论点。我将它们作为2D数组传递,这样可以吗?谁可以给我解释一下这个? 这是我的kerenl配置并致电:
dim3 BlockPerGrid(450,400,1);
dim3 ThreadsPerGrid(8,4,1);
cudaMalloc((void**)&dev_original_image,sizeof(unsigned char)*RAW_HEIGHT*RAW_WIDTH);
cudaMalloc((void**)&dev_rotated_image,sizeof(unsigned char)*RAW_HEIGHT*RAW_WIDTH);
cudaMemcpy(dev_original_image, raw_image2D, sizeof(unsigned char)*RAW_HEIGHT*RAW_WIDTH,cudaMemcpyHostToDevice);
cudaMemcpy(dev_rotated_image, raw_image2D_rotated, sizeof(unsigned char)*RAW_HEIGHT*RAW_WIDTH, cudaMemcpyHostToDevice);
rotatePicture<<<BlockPerGrid,ThreadsPerGrid>>>(dev_original_image,dev_rotated_image, deg);
感谢您的建议!
注意:我修改了代码并且工作得更好但仍然不正确。
答案 0 :(得分:1)
这是潜伏在这些水域中的其他解决方案。 这是我正确的内核:
__global__ void rotatePicture(unsigned char *original, unsigned char *rotated, int degrees)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int offset_rotated = x + y * blockDim.x * gridDim.x;
double centerX = 2400.0;
double centerY = 600.0;
double dir = (atan2(x-centerX,y-centerY))*180/3.14159265;
double mag = sqrt((x-centerX)*(x-centerX) + (y-centerY)*(y-centerY));
dir = dir - degrees;
int origX = (int)(centerX + sin((dir*3.14159265/180)) * mag);
int origY = (int)(centerY + cos((dir*3.14159265/180)) * mag);
int offset_original = origX + origY * blockDim.x * gridDim.x;
if(origX > 0 && origX < RAW_WIDTH && origY > 0 && origY < RAW_HEIGHT)
*(rotated + offset_rotated) = *(original + offset_original);
}
另外,我改变了这样的内核尺寸(以容纳我的1600 * 3宽度和1200高度):
dim3 BlockPerGrid(600,300,1);
dim3 ThreadsPerGrid(8,4,1);
因此,它的运行方式与上面的CPU版本相同,但使用的是GPU资源。享受