我需要对图像进行下采样。有些读数提示我,如果我使用纹理存储器,那么这个函数是免费的并且更快(我正在寻找双线性插值)。可以有人告诉我如何编写内核为此?这是我目前的情况: (我使用(1,1)线程块)
__global__ void texturekernel( int * final_red){
int f = (blockIdx.x * blockDim.x) + threadIdx.x;
int c = (blockIdx.y * blockDim.y) + threadIdx.y;
int id=blockIdx.x+256*blockIdx.y;//256 is the width of downsampled image ..original was 512
final_red[id]=tex2D( refTexture,c+0.5f,f+0.5f);//This is just for the red channel
//where reftexture is defined as texture <float, 2, cudaReadModeElementType> refTexture;
};
这个版本目前在输出中给我全部0。
已编辑(在此版本中,我尝试将2张2000 * 512尺寸的图片缩减为2张1000 * 256):
texture <float, 2, cudaReadModeElementType> refTexture; // global variable !
cudaArray* myArray;
cudaChannelFormatDesc description = cudaCreateChannelDesc<float>();
cudaError rs=cudaMallocArray ( &myArray,&description, 512,2000*2);//
//This line below is part of loop where input image is read row by row ..rowchecker keeps track of the row
cudaMemcpyToArray(myArray,0,rowchecker++,array_temp_red,sizeof(int)*test_columns,cudaMemcpHostToDevice);
refTexture.normalized=false;
refTexture.addressMode[0]=cudaAddressModeClamp;
refTexture.addressMode[1]=cudaAddressModeClamp;
refTexture.filterMode=cudaFilterModePoint;
cudaBindTextureToArray( refTexture,myArray);
dim3 blockSize(1,1);
int n_blocks_x=256;
int n_blocks_y=1000*2;
dim3 gridSize(n_blocks_x,n_blocks_y);
cudaMalloc((void**)&finalarray,(2000)*(512)*2/4*sizeof(int));
texturekernel<<<gridSize,blockSize>>>(finalarray );
答案 0 :(得分:1)
int id = blockIdx.x + 256 * blockIdx.y;
此声明超越了final_red的限制。
试试这个:
__global__ void texturekernel( int * final_red){
int f = blockIdx.x * blockDim.x + threadIdx.x;
int c = blockIdx.y * blockDim.y + threadIdx.y;
int id =c/2 * 256 + f/2;
final_red[id] = tex2D( refTexture,c+0.5f,f+0.5f);
}