我想做的事情非常简单。 每个线程从存储在全局内存中的全局数组中读取子数组。 然后它做一些计算并将结果存储在静态数组中。 最后,输出存储在全局内存中的另一个数组中 当我评论将静态数组写入全局数组的行时,内核运行。如代码所示。 任何想法?
GPU内核:
#ifndef _TEMPLATE_KERNEL_H_
#define _TEMPLATE_KERNEL_H_
#include <stdio.h>
__device__ void
DecompressBlockGPU(unsigned char *compressed_block,unsigned char *compressed_size,
int array_length,unsigned char *decompressed_block)
{
int j = 0;
for(int i = 0 ; i < array_length ;i++)
{
for(int idx = 0 ; idx < compressed_size[i]; idx++)
{
decompressed_block[j] = compressed_block[i];
j++;
}
}
}
__global__ void
gpu_test(unsigned char *compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,
unsigned char *output, int BlockSize,int BlockWidth,int BlockHeight,
int cols,int xTB,int yTB,int xTH,int yTH,unsigned char *aux_array)
{
int x_max = xBlocks ;
int y_max = yBlocks ;
int x_block = blockIdx.x ;
int y_block = blockIdx.y ;
x_max = gridDim.x*blockDim.x ;
y_max = gridDim.y*blockDim.y ;
x_block = (blockIdx.x*xTH);
y_block = (blockIdx.y*yTH);
int x_block1 = x_block + threadIdx.x;
int y_block1 = y_block + threadIdx.y;
int block_idx = y_block1*xBlocks + x_block1;
unsigned char *temp_ptr = compressed_data + OffsetsArray[block_idx];
int *array_length = (int *)temp_ptr;
unsigned char *compressed_size = compressed_data + OffsetsArray[block_idx] +
array_length[0] +sizeof(int)/sizeof(unsigned char);
unsigned char *compressed_block = compressed_data + OffsetsArray[block_idx] +
sizeof(int)/sizeof(unsigned char);
aux_array = aux_array + (BlockWidth+2)*(BlockHeight+2)*block_idx;
aux_array[block_idx]=array_length[0];
unsigned char decompressed_block[72];
unsigned char extracted_block[32];
DecompressBlockGPU(compressed_block,compressed_size,array_length[0],
&decompressed_block[0]);
if(block_idx == 0)
{
for(int i=0;i<16;i++) aux_array[i]= decompressed_block[i]; //fails
for(int i=16;i<16*36;i++) aux_array[i]=1;//works
}
}
#endif
CPU功能:
unsigned char *runGPU(unsigned char *d_compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,unsigned char *h_output)
{
printf("xBlocks =%d yBlocks =%d \n",xBlocks,yBlocks);
int xTB = 4;
int yTB = 4;
int xTH = 1;
int yTH = 1;
unsigned char *d_output;
unsigned char *d_aux_array;
unsigned char *h_aux_array;
int mem_size = image_len*sizeof(unsigned char);
int big_mem_size = sizeof(unsigned char)*xBlocks*yBlocks*(BlockWidth+2)*(BlockHeight+2);
cutilSafeCall( cudaMalloc( (void**) &d_output, mem_size));
cutilSafeCall( cudaMalloc( (void**) &d_aux_array,big_mem_size));
h_aux_array = (unsigned char *)malloc(big_mem_size);
float time = 0;
float totalTime = 0;
cudaEvent_t start_event4, stop_event4;
cutilSafeCall( cudaEventCreate(&start_event4) );
cutilSafeCall( cudaEventCreate(&stop_event4) );
cutilSafeCall( cudaEventRecord(start_event4, 0) );
dim3 grid(xTB,yTB, 1);
dim3 threads( xTH, yTH, 1);
gpu_test<<<grid,threads>>>(d_compressed_data,OffsetsArray,xBlocks,yBlocks,d_output,BlockSize,BlockWidth,BlockHeight,cols,xTB,yTB,xTH,yTH,d_aux_array);
cudaThreadSynchronize();
cutilSafeCall( cudaEventRecord(stop_event4, 0) );
cutilSafeCall( cudaEventSynchronize(stop_event4) );
time = 0;
cutilSafeCall( cudaEventElapsedTime(&time, start_event4, stop_event4));
totalTime += time;
totalTime /= (1.0e3 * 1);
shrLogEx(LOGBOTH | MASTER, 0, "GPU decompression Time = %.5f \n",totalTime);
cutilSafeCall(cudaMemcpy(h_output,d_output, mem_size, cudaMemcpyDeviceToHost));
cutilSafeCall(cudaMemcpy(h_aux_array,d_aux_array, big_mem_size, cudaMemcpyDeviceToHost));
cudaFree(d_output);
cudaFree(d_aux_array);
return h_aux_array;
}
现在是否清楚?(编辑后)