之前的一个问题是如何找到有效地在CUDA中找到数组的最大值:Finding max value in CUDA,最高响应提供了一个关于优化减少内核的NVIDIA演示文稿的链接。
如果您使用的是Visual Studio,只需删除标题引用,以及 CPU EXECUTION 之间的所有内容。
我设置了一个找到max的变体,但它与CPU找不到的相符:
// Returns the maximum value of
// an array of size n
float GetMax(float *maxes, int n)
{
int i = 0;
float max = -100000;
for(i = 0; i < n; i++)
{
if(maxes[i] > max)
max = maxes[i];
}
return max;
}
// Too obvious...
__device__ float MaxOf2(float a, float b)
{
if(a > b) return a;
else return b;
}
__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(BLOCKSIZE*2) + tid;
unsigned int gridSize = BLOCKSIZE*2*gridDim.x;
sdata[tid] = 0;
//MMX(index,i)
//MMX(index,i+blockSize)
// Final Optimized Kernel
while (i < n) {
sdata[tid] = MaxOf2(g_idata[i], g_idata[i+BLOCKSIZE]);
i += gridSize;
}
__syncthreads();
if (BLOCKSIZE >= 512) { if (tid < 256) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
if (BLOCKSIZE >= 256) { if (tid < 128) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
if (BLOCKSIZE >= 128) { if (tid < 64) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
if (tid < 32) {
if (BLOCKSIZE >= 64) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 32]);
if (BLOCKSIZE >= 32) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
if (BLOCKSIZE >= 16 ) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
if (BLOCKSIZE >= 8) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
if (BLOCKSIZE >= 4) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
if (BLOCKSIZE >= 2) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 1]);
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
我有一个巨大的设置来测试这个算法:
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "book.h"
#define ARRAYSIZE 16384
#define GRIDSIZE 60
#define BLOCKSIZE 32
#define SIZEFLOAT 4
using namespace std;
// Function definitions
float GetMax(float *maxes, int n);
__device__ float MaxOf2(float a, float b);
__global__ void MaxReduction(int n, float *g_idata, float *g_odata);
// Returns random floating point number
float RandomReal(float low, float high)
{
float d;
d = (float) rand() / ((float) RAND_MAX + 1);
return (low + d * (high - low));
}
int main()
{
/*****************VARIABLE SETUP*****************/
// Pointer to CPU numbers
float *numbers;
// Pointer to GPU numbers
float *dev_numbers;
// Counter
int i = 0;
// Randomize
srand(time(0));
// Timers
// Kernel timers
cudaEvent_t start_kernel, stop_kernel;
float elapsedTime_kernel;
HANDLE_ERROR(cudaEventCreate(&start_kernel));
HANDLE_ERROR(cudaEventCreate(&stop_kernel));
// cudaMalloc timers
cudaEvent_t start_malloc, stop_malloc;
float elapsedTime_malloc;
HANDLE_ERROR(cudaEventCreate(&start_malloc));
HANDLE_ERROR(cudaEventCreate(&stop_malloc));
// CPU timers
struct timeval start, stop;
float elapsedTime = 0;
/*****************VARIABLE SETUP*****************/
/*****************CPU ARRAY SETUP*****************/
// Setup CPU array
HANDLE_ERROR(cudaHostAlloc((void**)&numbers, ARRAYSIZE * sizeof(float), cudaHostAllocDefault));
for(i = 0; i < ARRAYSIZE; i++)
numbers[i] = RandomReal(0, 50000.0);
/*****************CPU ARRAY SETUP*****************/
/*****************GPU ARRAY SETUP*****************/
// Start recording cuda malloc time
HANDLE_ERROR(cudaEventRecord(start_malloc,0));
// Allocate memory to GPU
HANDLE_ERROR(cudaMalloc((void**)&dev_numbers, ARRAYSIZE * sizeof(float)));
// Transfer CPU array to GPU
HANDLE_ERROR(cudaMemcpy(dev_numbers, numbers, ARRAYSIZE*sizeof(float), cudaMemcpyHostToDevice));
// An array to temporarily store maximum values on the GPU
float *dev_max;
HANDLE_ERROR(cudaMalloc((void**)&dev_max, GRIDSIZE * sizeof(float)));
// An array to hold grab the GPU max
float maxes[GRIDSIZE];
/*****************GPU ARRAY SETUP*****************/
/*****************KERNEL EXECUTION*****************/
// Start recording kernel execution time
HANDLE_ERROR(cudaEventRecord(start_kernel,0));
// Run kernel
MaxReduction<<<GRIDSIZE, BLOCKSIZE, SIZEFLOAT*BLOCKSIZE>>> (ARRAYSIZE, dev_numbers, dev_max);
// Transfer maxes over
HANDLE_ERROR(cudaMemcpy(maxes, dev_max, GRIDSIZE * sizeof(float), cudaMemcpyDeviceToHost));
// Print out the max
cout << GetMax(maxes, GRIDSIZE) << endl;
// Stop recording kernel execution time
HANDLE_ERROR(cudaEventRecord(stop_kernel,0));
HANDLE_ERROR(cudaEventSynchronize(stop_kernel));
// Retrieve recording data
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_kernel, start_kernel, stop_kernel));
// Stop recording cuda malloc time
HANDLE_ERROR(cudaEventRecord(stop_malloc,0));
HANDLE_ERROR(cudaEventSynchronize(stop_malloc));
// Retrieve recording data
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_malloc, start_malloc, stop_malloc));
// Print results
printf("%5.3f\t%5.3f\n", elapsedTime_kernel, elapsedTime_malloc);
/*****************KERNEL EXECUTION*****************/
/*****************CPU EXECUTION*****************/
// Capture the start time
gettimeofday(&start, NULL);
// Call generic P7Viterbi function
cout << GetMax(numbers, ARRAYSIZE) << endl;
// Capture the stop time
gettimeofday(&stop, NULL);
// Retrieve time elapsed in milliseconds
long int elapsed_sec = stop.tv_sec - start.tv_sec;
long int elapsed_usec = stop.tv_usec - start.tv_usec;
elapsedTime = (float)(1000.0f * elapsed_sec) + (float)(0.001f * elapsed_usec);
// Print results
printf("%5.3f\n", elapsedTime);
/*****************CPU EXECUTION*****************/
// Free memory
cudaFreeHost(numbers);
cudaFree(dev_numbers);
cudaFree(dev_max);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_malloc);
cudaEventDestroy(stop_malloc);
// Exit program
return 0;
}
我在这个测试程序上运行cuda-memcheck,使用-g&amp; -G打开,报告0个问题。有谁能发现这个问题?
注意:编译程序时,请务必在当前目录中使用CUDA by Example书中的book.h。来源链接:http://developer.nvidia.com/cuda-example-introduction-general-purpose-gpu-programming 下载源代码,book.h将在公共目录/文件夹下。
答案 0 :(得分:5)
你的内核看起来很糟糕。线程本地搜索(在共享内存减少之前)应该如下所示:
sdata[tid] = g_idata[i];
i += gridSize;
while (i < n) {
sdata[tid] = MaxOf2(sdata[tid], g_idata[i]);
i += gridSize;
}
不应该吗?
另请注意,如果在Fermi上运行此命令,共享内存缓冲区应声明为volatile,如果线程本地搜索使用寄存器变量而不是共享内存,则性能会有明显改善。两者之间的有效带宽差异大约为8倍。
编辑:这是缩减内核的简化工作版本。您应该注意到与原始版本相比的许多差异:
__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
extern __shared__ volatile float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(BLOCKSIZE) + tid;
unsigned int gridSize = BLOCKSIZE*gridDim.x;
float val = g_idata[i];
i += gridSize;
while (i < n) {
val = MaxOf2(g_idata[i],val);
i += gridSize;
}
sdata[tid] = val;
__syncthreads();
// This versions uses a single warp for the shared memory
// reduction
# pragma unroll
for(int i=(tid+32); ((tid<32)&&(i<BLOCKSIZE)); i+=32)
sdata[tid] = MaxOf2(sdata[tid], sdata[i]);
if (tid < 16) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
if (tid < 8) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
if (tid < 4) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
if (tid < 2) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
if (tid == 0) g_odata[blockIdx.x] = MaxOf2(sdata[tid], sdata[tid + 1]);
}
此代码在Fermi上也应该是安全的。您还应该熟悉CUDA数学库,因为您应该使用fmax(x,y)
内在函数来代替MaxOf2
函数。