我正在进行关于并行编程(作业3)的Udacity课程,并且无法弄清楚为什么我无法使用并行缩减(Udacity论坛尚未提供解决方案)在阵列中获得最大值。我很确定我已正确设置数组并且算法正确。我怀疑我有内存管理问题(访问越界,不正确的数组大小,复制到和来)。请帮忙!我在Udacity环境中运行它,而不是在本地运行。以下是我目前使用的代码。出于某种原因,当我将fmaxf
更改为fminf
时,它确实找到了最小值。
#include "reference_calc.cpp"
#include "utils.h"
#include "math.h"
#include <stdio.h>
#include <cmath>
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
extern __shared__ float temp[];
if (myId < size) {
temp[tid] = d_logLum[myId];
}
else {
temp[tid] = d_logLum[tid];
}
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
if (myId < size) {
temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]);
} else {
temp[tid] = d_logLum[tid];
}
}
__syncthreads();
}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
__global__ void reduce_max_kernel2(float *d_out, float *d_in) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
for (unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
if (tid < s) {
d_in[myId] = fmaxf(d_in[myId + s], d_in[myId]);
}
__syncthreads();
}
if (tid == 0) {
d_out[0] = d_in[0];
}
}
void your_histogram_and_prefixsum(const float* const d_logLuminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
//TODO
/*Here are the steps you need to implement
1) find the minimum and maximum value in the input logLuminance channel
store in min_logLum and max_logLum
2) subtract them to find the range
3) generate a histogram of all the values in the logLuminance channel using
the formula: bin = (lum[i] - lumMin) / lumRange * numBins
4) Perform an exclusive scan (prefix sum) on the histogram to get
the cumulative distribution of luminance values (this should go in the
incoming d_cdf pointer which already has been allocated for you) */
//int size = 1 << 18;
int points = numRows * numCols;
int logPoints = ceil(log(points)/log(2));
int sizePow = logPoints;
int size = pow(2, sizePow);
int numThreads = 1024;
int numBlocks = size / numThreads;
float *d_out;
float *d_max_out;
checkCudaErrors(cudaMalloc((void **) &d_out, numBlocks * sizeof(float)));
checkCudaErrors(cudaMalloc((void **) &d_max_out, sizeof(float)));
cudaDeviceSynchronize();
reduce_max_kernel<<<numBlocks, numThreads, sizeof(float)*numThreads>>>(d_out, d_logLuminance, points);
cudaDeviceSynchronize();
reduce_max_kernel2<<<1, numBlocks>>>(d_max_out, d_out);
float h_out_max;
checkCudaErrors(cudaMemcpy(&h_out_max, d_max_out, sizeof(float), cudaMemcpyDeviceToHost));
printf("%f\n", h_out_max);
checkCudaErrors(cudaFree(d_max_out));
checkCudaErrors(cudaFree(d_out));
}
答案 0 :(得分:3)
您正在尝试重现CUDA SDK缩减示例的reduce2
缩减内核。 Robert Crovella已经发现了你在代码中犯的两个错误。除此之外,我认为你也错误地初始化了共享内存。
下面,请找到围绕您的尝试构建的完整工作示例。我给你的方法留下了错误的指示。
#include <thrust\device_vector.h>
#define BLOCKSIZE 256
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
int tid = threadIdx.x; // Local thread index
int myId = blockIdx.x * blockDim.x + threadIdx.x; // Global thread index
extern __shared__ float temp[];
// --- Loading data to shared memory. All the threads contribute to loading the data to shared memory.
temp[tid] = (myId < size) ? d_logLum[myId] : -FLT_MAX;
// --- Your solution
// if (myId < size) { temp[tid] = d_logLum[myId]; } else { temp[tid] = d_logLum[tid]; }
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s=blockDim.x/2; s>0; s>>=1)
{
if (tid < s) { temp[tid] = fmaxf(temp[tid], temp[tid + s]); }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Your solution
//for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
// if (tid < s) { if (myId < size) { temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]); } else { temp[tid] = d_logLum[tid]; } }
// __syncthreads();
//}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 10;
thrust::device_vector<float> d_vec(N,3.f); d_vec[4] = 4.f;
int NumThreads = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
int NumBlocks = (N + NumThreads - 1) / NumThreads;
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(int) : NumThreads * sizeof(int);
// --- reduce2
thrust::device_vector<float> d_vec_block(NumBlocks);
reduce_max_kernel<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec_block.data()), thrust::raw_pointer_cast(d_vec.data()), N);
// --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
thrust::host_vector<float> h_vec_block(d_vec_block);
float result_reduce0 = -FLT_MAX;
for (int i=0; i<NumBlocks; i++) result_reduce0 = fmax(h_vec_block[i], result_reduce0);
printf("Result = %f\n",result_reduce0);
}