我刚开始编程CUDA。 现在,当我找到矩阵的最大值时,我遇到了问题。 我有一个矩阵,我有一个窗口大小(即2x2,4x4,8x8),我找到每个窗口大小的矩阵的最大值。
如果我的矩阵是小数字,当矩阵更大,即256x256或1024x1024时,一切都没问题,最大数组的顺序不是真的。 例如,矩阵大时的预期输出为:{4,5,9,1,2,3,7,9,9}。 但是,实际上{4,5,1,2,9,3 ....}每次运行时订单都会发生变化。
#include <stdio.h>
#include <stdlib.h>
__global__ void calculate_emax_kernel(float emap[],float emax[], int img_height, int img_width,int windows_size)
{
int x_index = blockIdx.x*blockDim.x+threadIdx.x;
int y_index = blockIdx.y*blockDim.y+threadIdx.y;
int num_row_block = img_height/windows_size;
int num_col_block = img_width/windows_size;
int index;
__shared__ float window_elements[256];
__shared__ int counter;
if(y_index >= img_height|| x_index >= img_width) return;
for(int i = 0; i < num_row_block; i++)
{
for(int j = 0; j < num_col_block; j++)
{
counter = 0;
if(y_index >= i*windows_size && y_index < (i+1)*windows_size
&& x_index >= j*windows_size && x_index < (j+1)*windows_size)
{
int idx = y_index*img_height + x_index;
index = atomicAdd(&counter, 1);
window_elements[index] = emap[idx];
__syncthreads();
//printf("row_block = %d ||col_block = %d || y_index =%d || x_index = %d || index = %d , window_elements[%d] = %d\n",i,j,y_index,x_index,(int)emap[idx],index,(int)window_elements[index]);
// reduction
unsigned int k = (windows_size*windows_size)/2;
while(k != 0)
{
if(index < k)
{
window_elements[index] = fmaxf(window_elements[index], window_elements[index+k]);
//printf ("i = %d || j = %d || k = %d ||win_idx =%d ||%d = fmaxf( %d, %d) \n",i,j,k,index,(int)fmaxf(window_elements[index], window_elements[index+k]),
// (int)window_elements[index], (int)window_elements[index+k]);
}
k /= 2;
}
if(index == 0)
{
printf ("max = %d || index = %d \n",(int)window_elements[index],i*num_row_block+j);
emax[i*num_row_block+j] = window_elements[index];
}
}
__syncthreads();
}
__syncthreads();
}
__syncthreads();
}
void max_reduction_1D(float * array, float * max,int array_height , int array_width)
{
float * d_array;
float * d_max;
int *d_mutex;
int elements = array_height*array_width;
cudaMalloc((void**)&d_array, elements*sizeof(float));
cudaMalloc((void**)&d_max, sizeof(float));
cudaMalloc((void**)&d_mutex, sizeof(int));
cudaMemset(d_mutex, 0, sizeof(float));
cudaMemcpy(d_array, array, elements*sizeof(float), cudaMemcpyHostToDevice);
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(array_width+blocksize.x-1)/blocksize.x;
gridsize.y=(array_height+blocksize.y-1)/blocksize.y;
calculate_emax_kernel<<< gridsize, blocksize >>>(d_array, d_max, array_height,array_width,8);
cudaMemcpy(max, d_max, sizeof(float), cudaMemcpyDeviceToHost);
//printf("Maximum number is: %d \n",(int)*max);
cudaFree(d_array);
cudaFree(d_max);
}
int main()
{
int array_height = 6;
int array_width = 6;
unsigned int N = array_height*array_width;
float *h_array;
float *h_max;
h_array = (float*)malloc(N*sizeof(float));
h_max = (float*)malloc(sizeof(float));
float array[] = {1, 2, 2, 3, 6, 7, 5, 6,
3, 4, 4, 5, 8, 9, 7, 8,
1, 1, 2, 2, 3, 3, 4, 4,
1, 1, 2, 2, 3, 3, 4, 4,
4, 5, 6, 6, 9, 7, 1, 2,
6, 7, 8, 9, 8, 8, 2, 3,
1, 2, 3, 4, 5, 6, 7, 8,
8, 7, 6, 5, 4, 3, 2, 1
};
for(int i = 0; i < N; i++)
{
h_array[i] = array[i];
}
//max_reduction_1D(h_array, h_max, array_height , array_width);
max_reduction_1D(h_array, h_max, 32 ,32);
free(h_array);
}