所以我正在研究矩阵上的reduce函数:我需要在矩阵中找到最大值。我已经实现了一个函数来获取数组上的最大值,并将其转换为矩阵版本应该很简单,但是我无法使其正常工作。我想知道这是否是正确的方法。您可以在下面找到这两个版本的代码:
对于数组:
// Per-block maximum of a float array, computed with a shared-memory tree
// reduction.
//
// Launch layout: 1D grid of 1D blocks, with blockDim.x * sizeof(float) bytes of
// dynamic shared memory. Thread t of block b loads d_in[blockDim.x * b + t];
// there is no length parameter, so d_in must hold at least
// gridDim.x * blockDim.x elements. Block b writes its maximum to d_out[b].
//
// blockDim.x may be any positive value; it does not have to be a power of two.
__global__
void reduce_kernal_shared_mem(float *d_in, float *d_out){
  const int indx = blockDim.x * blockIdx.x + threadIdx.x;
  const int tindx = threadIdx.x;
  extern __shared__ float sh_in[];

  // Every thread stages exactly one element, so no sentinel pre-fill is needed.
  sh_in[tindx] = d_in[indx];
  __syncthreads();

  // `active` is the number of live partial results in sh_in[0 .. active-1].
  // Each step folds the upper floor(active/2) entries onto the lower ones using
  // a stride of ceil(active/2), so the middle entry of an odd-sized range is
  // carried forward instead of being dropped.
  for(int active = blockDim.x; active > 1; ){
    const int stride = (active + 1) / 2;
    if(tindx < active / 2){
      sh_in[tindx] = fmaxf(sh_in[tindx], sh_in[tindx + stride]);
    }
    // The loop condition is uniform across the block, so every thread reaches
    // this barrier.
    __syncthreads();
    active = stride;
  }

  if(tindx == 0){
    d_out[blockIdx.x] = sh_in[0];
  }
}
// Host driver for the two-pass array reduction.
//
// Pass 1 launches 1024 blocks of 1024 threads to reduce d_in into d_int (one
// value per block); pass 2 launches a single block of 1024 threads to reduce
// d_int into d_out. `is_shared` selects the shared-memory kernel, otherwise the
// plain `reduce_kernal` is used. ARRAY_SIZE is accepted but not consulted: the
// launch geometry is fixed.
void reduce(float *d_in, float *d_int, float *d_out, const int ARRAY_SIZE, bool is_shared){
  const int kBlocks = 1024;
  const int kThreads = 1024;

  if(is_shared){
    // One float of dynamic shared memory per thread in the block.
    const size_t kSharedBytes = kThreads * sizeof(float);
    reduce_kernal_shared_mem<<<kBlocks, kThreads, kSharedBytes>>>(d_in, d_int);
    reduce_kernal_shared_mem<<<1, kThreads, kSharedBytes>>>(d_int, d_out);
    return;
  }

  reduce_kernal<<<kBlocks, kThreads>>>(d_in, d_int);
  reduce_kernal<<<1, kThreads>>>(d_int, d_out);
}
对于矩阵:
// Per-block maximum over a row-major numRows x numCols float matrix (the
// version posted in the question, kept exactly as asked).
//
// Launch layout: 2D grid of 2D blocks, with blockDim.x * blockDim.y floats of
// dynamic shared memory. Each block writes one partial maximum to
// d_out[blockIdx.y * gridDim.x + blockIdx.x].
__global__
void get_max(const float* d_logLuminance, float *d_out, int numRows, int numCols){
// Global column/row handled by this thread.
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
// Column/row of this thread inside its block.
int c_t = threadIdx.x;
int r_t = threadIdx.y;
// Flat index into the matrix and flat index into the block's shared buffer.
int pos_1D = row * numCols + col;
int pos_1D_t = r_t * blockDim.x + c_t;
extern __shared__ float sh_mem[];
// Sentinel so that threads without a matrix element do not win the max.
sh_mem[pos_1D_t] = -999999.0f;
// NOTE(review): `>` lets pos_1D == numCols * numRows through, which is one past
// the last element. Threads that return here also never reach the
// __syncthreads() calls below, although a barrier must be reached by every
// thread of the block.
if(pos_1D > numCols * numRows)
return;
sh_mem[pos_1D_t] = d_logLuminance[pos_1D];
__syncthreads();
// Tree reduction in shared memory: the number of active threads halves each
// step. NOTE(review): this pairing only covers every slot when
// blockDim.x * blockDim.y is a power of two.
for(int s = (blockDim.x * blockDim.y) / 2; s > 0; s >>= 1){
if(pos_1D_t < s)
sh_mem[pos_1D_t] = fmax(sh_mem[pos_1D_t], sh_mem[pos_1D_t + s]);
__syncthreads();
}
// Thread (0, 0) of the block publishes the block's result.
if(r_t == 0 && c_t == 0)
d_out[blockIdx.y * gridDim.x + blockIdx.x] = sh_mem[0];
}
// Host driver from the question: runs get_max twice to reduce a
// numRows x numCols device matrix to one value, copies that value into
// max_logLum and prints it.
// NOTE(review): none of the CUDA runtime calls or kernel launches below has its
// status checked.
void max(const float *d_logLuminance, int numRows, int numCols, float &max_logLum){
int THREADS_PER_BLOCK = 32;
// 32 x 32 threads per block; the grid is the ceiling of each matrix dimension
// divided by 32.
dim3 blockSize(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
dim3 gridSize((THREADS_PER_BLOCK + numCols - 1) / THREADS_PER_BLOCK,
(THREADS_PER_BLOCK + numRows - 1) / THREADS_PER_BLOCK);
float *d_out, *d_int;
// NOTE(review): both buffers are sized for the whole matrix, although get_max
// writes only one float per block.
cudaMalloc(&d_out, sizeof(float) * numRows * numCols);
cudaMalloc(&d_int, sizeof(float) * numRows * numCols);
// Pass 1: one partial maximum per block, written to d_int.
get_max<<<gridSize, blockSize, THREADS_PER_BLOCK * THREADS_PER_BLOCK * sizeof(float)>>>(d_logLuminance, d_int, numRows, numCols);
// Pass 2: a single block reduces the partials.
// NOTE(review): this launch still passes numRows/numCols, but after pass 1
// d_int holds only gridSize.y * gridSize.x values.
get_max<<<1, blockSize, THREADS_PER_BLOCK * THREADS_PER_BLOCK * sizeof(float)>>>(d_int, d_out, numRows, numCols);
cudaDeviceSynchronize();
// Only the first float of d_out is the result.
cudaMemcpy(&max_logLum, d_out, sizeof(float), cudaMemcpyDeviceToHost);
printf("max : %f\n", max_logLum);
cudaFree(d_out);
cudaFree(d_int);
}
通过串行算法计算的预期结果为2.18911,而并行归约函数输出为1.319142。
答案 0 :(得分:4)
您显示的所有代码均未针对性能进行最佳优化。在GPU上编写快速并行约简通常会具有许多明显的特征:
因此,在解决您内核中的技术问题时,请记住这只是一次学习练习。我并不是说您的方法(无论哪一种)是最好的方法。
在您的2D内核方法中,既存在非法行为,也存在设计问题:
此构造:
if(pos_1D > numCols * numRows)
return; // this return statement creates a hazard
sh_mem[pos_1D_t] = d_logLuminance[pos_1D];
__syncthreads(); // ... at this call
可能导致未定义行为。CUDA要求线程块中的所有线程都到达__syncthreads()调用。但是return语句可能让某些线程块中的部分线程提前退出,这意味着它们不会参与__syncthreads()调用。这是非法的。
您的2D设计把可处理的数据规模限制在最多32x32个线程块,即最大1024x1024的数据集。原因是:第二次内核启动最多只能有1024个线程,而其中每个线程对应前一次启动中的一个线程块,因此前一次启动最多只能有32x32 = 1024个线程块。可以重新设计代码来消除这一限制;但我想说明的是,如果您想编写快速、健壮的并行归约,设计的几乎每个方面都需要重做,因此我建议从一开始就按照我上面概述的特性来设计。
您的第二个内核启动结构不正确:
get_max<<<1, blockSize, THREADS_PER_BLOCK * THREADS_PER_BLOCK * sizeof(float)>>>(d_int, d_out, numRows, numCols);
此时数据集的大小不再是numRows * numCols,而是已经缩减为32x32或更小。您的代码中还有其他几处类似的尺寸问题。
下面是您代码的修改版本,解决了上述问题。我没有逐一说明对您的代码所做的每一处更改,因此除了上面列出的3项之外,请自行对比研究其余差异:
$ cat t1490.cu
#include <math.h>
#include <stdio.h>
// Per-block maximum over a row-major numRows x numCols float matrix.
//
// Launch layout: 2D grid of 2D blocks, with blockDim.x * blockDim.y floats of
// dynamic shared memory. Each block writes one partial maximum to
// d_out[blockIdx.y * gridDim.x + blockIdx.x], so d_out must hold at least
// gridDim.x * gridDim.y floats. The block size may be any positive thread
// count; it does not have to be a power of two.
__global__
void get_max(const float* d_logLuminance, float *d_out, int numRows, int numCols){
  const int col = blockIdx.x * blockDim.x + threadIdx.x;
  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  // Flat index of this thread inside its block's shared buffer.
  const int pos_1D_t = threadIdx.y * blockDim.x + threadIdx.x;
  extern __shared__ float sh_mem[];

  // Threads outside the matrix contribute -infinity, the identity for max, so
  // the result is correct however small the data values are. Testing row and
  // col separately also keeps a padding column from reading the next row.
  // No thread returns early: all of them must reach the barriers below.
  const bool inside = (row < numRows) && (col < numCols);
  sh_mem[pos_1D_t] = inside ? d_logLuminance[row * numCols + col] : -INFINITY;
  __syncthreads();

  // `active` is the number of live partial results in sh_mem[0 .. active-1].
  // Each step folds the upper floor(active/2) entries onto the lower ones using
  // a stride of ceil(active/2), so an odd-sized range keeps its middle entry.
  for(int active = blockDim.x * blockDim.y; active > 1; ){
    const int stride = (active + 1) / 2;
    if(pos_1D_t < active / 2)
      sh_mem[pos_1D_t] = fmaxf(sh_mem[pos_1D_t], sh_mem[pos_1D_t + stride]);
    __syncthreads();
    active = stride;
  }

  // Thread (0, 0) of the block publishes the block's result.
  if(pos_1D_t == 0)
    d_out[blockIdx.y * gridDim.x + blockIdx.x] = sh_mem[0];
}
// Reports a failed CUDA runtime call on stderr; returns true when `status` is
// cudaSuccess.
static bool cudaCallSucceeded(cudaError_t status, const char *what){
  if(status == cudaSuccess)
    return true;
  fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Finds the maximum of a row-major numRows x numCols device matrix.
//
// d_logLuminance  device pointer to numRows * numCols floats
// numRows/numCols matrix dimensions; both must be positive
// max_logLum      receives the maximum; left untouched if any CUDA call fails
//
// get_max is launched repeatedly: each pass turns a rows x cols matrix into a
// ceil(rows/32) x ceil(cols/32) matrix of per-block maxima, until one value
// remains. This removes the 32x32-block (1024x1024 element) ceiling that a
// fixed two-launch scheme has. On success the result is also printed as
// "max : <value>".
void max(const float *d_logLuminance, int numRows, int numCols, float &max_logLum){
  const int THREADS_PER_BLOCK = 32;
  if(d_logLuminance == NULL || numRows <= 0 || numCols <= 0){
    fprintf(stderr, "max: empty or invalid input (%d x %d)\n", numRows, numCols);
    return;
  }

  const dim3 blockSize(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
  const size_t sharedBytes = THREADS_PER_BLOCK * THREADS_PER_BLOCK * sizeof(float);

  // Sizes of the first two levels of partial results. Every later level is
  // smaller, so two ping-pong buffers of these sizes are enough.
  const int lvl1Cols = (THREADS_PER_BLOCK + numCols - 1) / THREADS_PER_BLOCK;
  const int lvl1Rows = (THREADS_PER_BLOCK + numRows - 1) / THREADS_PER_BLOCK;
  const int lvl2Cols = (THREADS_PER_BLOCK + lvl1Cols - 1) / THREADS_PER_BLOCK;
  const int lvl2Rows = (THREADS_PER_BLOCK + lvl1Rows - 1) / THREADS_PER_BLOCK;

  float *d_ping = NULL;
  float *d_pong = NULL;
  bool ok = cudaCallSucceeded(cudaMalloc(&d_ping, sizeof(float) * lvl1Rows * lvl1Cols), "cudaMalloc")
         && cudaCallSucceeded(cudaMalloc(&d_pong, sizeof(float) * lvl2Rows * lvl2Cols), "cudaMalloc");

  const float *d_src = d_logLuminance;
  float *d_dst = d_ping;
  int rows = numRows;
  int cols = numCols;
  while(ok){
    const dim3 gridSize((THREADS_PER_BLOCK + cols - 1) / THREADS_PER_BLOCK,
                        (THREADS_PER_BLOCK + rows - 1) / THREADS_PER_BLOCK);
    get_max<<<gridSize, blockSize, sharedBytes>>>(d_src, d_dst, rows, cols);
    ok = cudaCallSucceeded(cudaGetLastError(), "get_max launch");

    // The kernel wrote a gridSize.y x gridSize.x row-major matrix of partials;
    // that matrix is the input of the next pass.
    rows = (int)gridSize.y;
    cols = (int)gridSize.x;
    d_src = d_dst;
    d_dst = (d_dst == d_ping) ? d_pong : d_ping;
    if(rows == 1 && cols == 1)
      break;
  }

  // The blocking copy also waits for the kernels and surfaces any fault they
  // raised while running.
  if(ok)
    ok = cudaCallSucceeded(cudaMemcpy(&max_logLum, d_src, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy");
  if(ok)
    printf("max : %f\n", max_logLum);

  // cudaFree accepts a null pointer, so this is safe after a failed allocation.
  cudaFree(d_ping);
  cudaFree(d_pong);
}
// Self-contained check: builds a 1024 x 1024 device matrix of zeros, plants
// 2.18911 in element 0, and asks max() for the maximum (max() prints it).
// Returns 0 on success and 1 if the device buffer could not be prepared.
int main(){
  const int sx = 1024;
  const int sy = 1024;
  const size_t bytes = (size_t)sx * sy * sizeof(float);
  float *d_data = NULL;
  float result = 2.18911f;

  cudaError_t status = cudaMalloc(&d_data, bytes);
  // A byte-wise memset to 0 yields 0.0f in every element.
  if(status == cudaSuccess)
    status = cudaMemset(d_data, 0, bytes);
  // Overwrite element 0 with the known maximum.
  if(status == cudaSuccess)
    status = cudaMemcpy(d_data, &result, sizeof(float), cudaMemcpyHostToDevice);
  if(status != cudaSuccess){
    fprintf(stderr, "CUDA setup failed: %s\n", cudaGetErrorString(status));
    cudaFree(d_data);
    return 1;
  }

  // Clear the host copy so the printed value can only come from the device.
  result = 0;
  max(d_data, sy, sx, result);

  cudaFree(d_data);
  return 0;
}
$ nvcc -o t1490 t1490.cu
$ cuda-memcheck ./t1490
========= CUDA-MEMCHECK
max : 2.189110
========= ERROR SUMMARY: 0 errors
$
请注意,您尚未提供完整的代码。总是有可能在未显示的代码中也有错误。在我的回答中,我提供了一个示例代码。