我对CUDA编程很陌生,并且一直在尝试实施Harris博士的并行简化优化示例(Link to slides),并且迄今为止已经成功(?)。
我的问题是:我注意到在循环中启动内核时,内核在网格大小改变之前,以第一个网格大小(总元素数 / 每块线程数)连续运行了两次。这导致了两个问题:
我得到的时间远远超过预期:第一个内核耗时44毫秒,而预计应接近22毫秒;相比之下,CPU的计算时间大约为18毫秒。
我从未使用1个块启动内核。
然而,我得到了GPU和CPU之和的精确匹配,从而告诉我计算已经正确完成。
我已经把内核1和2(用于测试)的启动配置改为新配置:最后一次启动内核时只使用1个块,线程数取自上一轮循环的余数。
我的问题是:虽然 GPUArray[0] = CPUsum 符合我的预期,但这次当我查看 GPUArray 中索引0以外的其余元素时,得到的仍然是实际的数字(而我期望看到0),这在之前的配置下没有发生过。
以下是使用新配置运行内核2的结果示例:
MENU
Kernel 1 - Interleaved addressing with divergent branching
Kernel 2 - Interleaved addressing with bank conflicts
Kernel 3 - Sequential addressing
Kernel 4 - First add during global load
Kernel 5 - Unroll last warp
Kernel 6 - Complete Unroll
Kernel 7 - Multiple elements per thread
Command: 2
Running Kernel 2...
Blocks:32768...
Blocks:256... Remainder:0
Blocks:2... Remainder:0
Blocks:0... Remainder:2
CPU computation complete in 18.886805 ms
GPU computation complete in 19.395136 ms
HostArray is the pre initialized array. GPUArr is the resulting output array from the kernel.
CPU sum: 99682546 | GPUArr[0]=99682546
CPU sum: 99682546 | GPUArr[1]=-1844595429
Resetting CUDA device
.....finish
我的问题是为什么会发生这种情况,我该如何解决?我很好奇为什么它实际上显示与CPU计算相同的结果。
这是一个完整的可编译代码,共包含3个文件(reduction.cu,ReductionKernels.cu和ReductionKernels.h)。内核1和2使用新的启动配置完成,3以后使用旧启动进行比较和参考。我正在使用Quadro k600卡并使用128个线程。在Nsight eclipse版本中运行。
reduction.cu
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include "ReductionKernels.h"
// Error-check macro: expands to a call of handleError() below. Note the
// macro body already contains the trailing semicolon, so call sites use
// HANDLE_ERROR without one.
#define HANDLE_ERROR handleError();
//extern "C"
// Prints the most recent CUDA runtime error, if any (defined after main).
void handleError();
// Wall-clock timestamp in seconds; used to time the CPU reference sum.
double cpuSec();
/*
 * Host driver: fills an array of 2^22 random ints, runs one of seven GPU
 * reduction kernels (selected interactively), and compares the GPU result
 * against a CPU reference sum. Timing: GPU via CUDA events, CPU via cpuSec().
 */
int main(int argc, char** argv) {
	int total_elements = 1 << 22;
	size_t datasize = total_elements * sizeof(int);
	int size = total_elements;
	// Allocate the input data. h_odata is zero-initialized (calloc) because
	// it is copied to the device below and partially read back afterwards;
	// copying uninitialized malloc'd bytes to the GPU made GPUArr[1..] show
	// arbitrary garbage.
	int *h_idata = (int *) malloc(datasize);
	int *h_odata = (int *) calloc(total_elements, sizeof(int)); //holds the output data
	if (h_idata == NULL || h_odata == NULL) {
		printf("Host allocation failed\n");
		return 1;
	}
	for (int i = 0; i < total_elements; ++i) {
		h_idata[i] = rand() % 10242024;
	}
	//allocate space on the device
	int *g_idata = NULL;
	cudaMalloc((void **) &g_idata, datasize);
	int *g_odata = NULL;
	cudaMalloc((void **) &g_odata, datasize);
	HANDLE_ERROR
	// Remember the original allocations: the ping-pong swaps below alias
	// g_idata/g_odata, so freeing the swapped pointers at the end could
	// double-free one buffer and leak the other.
	int *d_alloc_a = g_idata;
	int *d_alloc_b = g_odata;
	// temp is only a swap variable. The previous cudaMalloc into temp leaked
	// a full-size buffer the moment the first pointer swap overwrote it.
	int *temp = NULL;
	cudaMemcpy(g_idata, h_idata, datasize, cudaMemcpyHostToDevice);
	cudaMemcpy(g_odata, h_odata, datasize, cudaMemcpyHostToDevice);
	HANDLE_ERROR
	cudaEvent_t start, stop;
	float time;
	//Create two events. Each will record the time
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	//call kernel and time
	int threadsPerBlock = 128;
	int blocksPerGrid;
	// Initialized to 0: the final 1-block launches in cases 1 and 2 read it,
	// and it would be indeterminate if the preceding loop never executed.
	int remainder = 0;
	int c;
	// print instructions
	printf("\nMENU\n\n"
			"Kernel 1 - Interleaved addressing with divergent branching\n"
			"Kernel 2 - Interleaved addressing with bank conflicts \n"
			"Kernel 3 - Sequential addressing \n"
			"Kernel 4 - First add during global load \n"
			"Kernel 5 - Unroll last warp\n"
			"Kernel 6 - Complete Unroll\n"
			"Kernel 7 - Multiple elements per thread\n\n");
	// get command. fflush(stdin) is undefined behavior in C, so the input is
	// validated instead of flushed.
	printf("Command: ");
	if (scanf(" %d", &c) != 1) {
		printf("Invalid command\n");
		return 1;
	}
	cudaEventRecord(start, 0);
	switch (c) {
	case 1:
		printf("\nRunning Kernel 1... \n");
		blocksPerGrid = size / threadsPerBlock;
		while (blocksPerGrid != 0) {
			reduce0<<<blocksPerGrid, threadsPerBlock,
					threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
			// Ping-pong: this pass's output becomes the next pass's input.
			temp = g_odata;
			g_odata = g_idata;
			g_idata = temp;
			remainder = blocksPerGrid % threadsPerBlock;
			blocksPerGrid = blocksPerGrid / threadsPerBlock;
			printf("\nBlocks:%d... Remainder:%d", blocksPerGrid, remainder);
		}
		// Final partial block of `remainder` threads; guarded so we never
		// launch with 0 threads.
		if (remainder > 0) {
			reduce0<<<1, remainder, remainder * sizeof(int)>>>(g_idata, g_odata);
		}
		break;
	case 2:
		printf("\nRunning Kernel 2... \n");
		blocksPerGrid = size / threadsPerBlock;
		printf("\nBlocks:%d...", blocksPerGrid);
		while (blocksPerGrid != 0) {
			reducekernel2<<<blocksPerGrid, threadsPerBlock,
					threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
			// Ping-pong: this pass's output becomes the next pass's input.
			temp = g_odata;
			g_odata = g_idata;
			g_idata = temp;
			remainder = blocksPerGrid % threadsPerBlock;
			blocksPerGrid = blocksPerGrid / threadsPerBlock;
			printf("\nBlocks:%d... Remainder:%d", blocksPerGrid, remainder);
		}
		if (remainder > 0) {
			reducekernel2<<<1, remainder, remainder * sizeof(int)>>>(g_idata,
					g_odata);
		}
		break;
	case 3:
		printf("\nRunning Kernel 3... \n");
		blocksPerGrid = size / threadsPerBlock;
		// NOTE(review): blocksPerGrid is recomputed from the OLD `size`
		// after each launch, so the first grid size is launched twice —
		// this is the duplicated launch the timing question is about.
		// NOTE(review): g_idata = g_odata makes later passes reduce their
		// own output in place; blocks of one launch then read locations
		// other blocks are writing (no inter-block ordering). A buffer
		// ping-pong as in cases 1/2 would be safer — verify before reuse.
		while (blocksPerGrid > 1) {
			if (size < threadsPerBlock) {
				threadsPerBlock = size;
			}
			reducekernel3<<<blocksPerGrid, threadsPerBlock,
					threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
			g_idata = g_odata;
			blocksPerGrid = size / threadsPerBlock;
			size = blocksPerGrid;
		}
		break;
	case 4:
		printf("\nRunning Kernel 4... \n");
		// Each thread loads two elements, so the grid is halved.
		blocksPerGrid = size / (threadsPerBlock * 2);
		// NOTE(review): same stale-size / in-place-aliasing concerns as
		// case 3 above.
		while (blocksPerGrid > 1) {
			if (size < threadsPerBlock) {
				threadsPerBlock = size;
			}
			reducekernel4<<<blocksPerGrid, threadsPerBlock,
					threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
			g_idata = g_odata;
			blocksPerGrid = size / (threadsPerBlock * 2);
			size = blocksPerGrid;
		}
		break;
	case 5:
		printf("\nRunning Kernel 5... \n");
		blocksPerGrid = size / (threadsPerBlock * 2);
		size = blocksPerGrid;
		printf("\nBlocks:%d... \n", blocksPerGrid);
		// NOTE(review): `size` is set to the block count before the loop,
		// so later grid sizes are derived from partial counts that do not
		// match the data actually produced; kernel 5 also requires
		// blockDim >= 64 for its unrolled last warp — verify the tail.
		while (blocksPerGrid > 0) {
			if (size < threadsPerBlock) {
				threadsPerBlock = size;
			}
			reducekernel5<<<blocksPerGrid, threadsPerBlock,
					threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
			g_idata = g_odata;
			blocksPerGrid = size / (threadsPerBlock * 2);
			size = blocksPerGrid;
			printf("\nBlocks:%d... \n", blocksPerGrid);
		}
		break;
	case 6:
		printf("\nRunning Kernel 6... \n");
		blocksPerGrid = size / (threadsPerBlock * 2);
		// The inner switch maps the runtime block size onto the compile-time
		// template parameter the fully-unrolled kernel needs.
		// NOTE(review): same stale-size / in-place-aliasing concerns as
		// case 3 above.
		while (blocksPerGrid > 1) {
			if (size < threadsPerBlock) {
				threadsPerBlock = size;
			}
			switch (threadsPerBlock) {
			case 512:
				reducekernel6<512> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 256:
				reducekernel6<256> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 128:
				reducekernel6<128> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 64:
				reducekernel6<64> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 32:
				reducekernel6<32> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 16:
				reducekernel6<16> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 8:
				reducekernel6<8> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 4:
				reducekernel6<4> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 2:
				reducekernel6<2> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 1:
				reducekernel6<1> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			}
			g_idata = g_odata;
			blocksPerGrid = size / (threadsPerBlock * 2);
			size = blocksPerGrid;
		}
		break;
	case 7:
		printf("\nRunning Kernel 7... \n");
		blocksPerGrid = (size / (threadsPerBlock * 2));
		// Shrink the grid so each thread handles several elements.
		// NOTE(review): this divides blocksPerGrid (not size) repeatedly and
		// the "limit to 64 blocks" comment does not match the loop's effect
		// for all inputs — confirm against the intended launch shape.
		while ((blocksPerGrid > 64) && (threadsPerBlock >= 128)) { //limit to 64 blocks
			blocksPerGrid = (blocksPerGrid / (threadsPerBlock * 2));
		}
		// NOTE(review): same stale-size / in-place-aliasing concerns as
		// case 3 above.
		while (blocksPerGrid > 1) {
			if (size < threadsPerBlock) {
				threadsPerBlock = size;
			}
			switch (threadsPerBlock) {
			case 512:
				reducekernel7<512> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 256:
				reducekernel7<256> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 128:
				reducekernel7<128> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 64:
				reducekernel7<64> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 32:
				reducekernel7<32> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 16:
				reducekernel7<16> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 8:
				reducekernel7<8> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 4:
				reducekernel7<4> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 2:
				reducekernel7<2> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 1:
				reducekernel7<1> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			}
			g_idata = g_odata;
			blocksPerGrid = size / (threadsPerBlock * 2);
			size = blocksPerGrid;
		}
		break;
	}
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	// Surface any launch/execution error from the kernels above.
	HANDLE_ERROR
	// copy data from device to host. Only element 0 holds the final sum; the
	// rest of the buffer contains stale partial sums from earlier passes —
	// that is why GPUArr[1] shows a "real" number, not 0.
	cudaMemcpy(h_odata, g_odata, datasize, cudaMemcpyDeviceToHost);
	HANDLE_ERROR
	// cuda free — the original allocations, not the swapped aliases
	cudaFree(d_alloc_a);
	cudaFree(d_alloc_b);
	// Must start at 0: it was previously uninitialized and accumulated with
	// +=. Kept as int deliberately so any wraparound matches the kernels'
	// int arithmetic.
	int sum = 0;
	double begin = cpuSec();
	for (int j = 0; j < total_elements; ++j) {
		sum += h_idata[j]; //sum of pre init array
	}
	// (The deprecated cudaThreadSynchronize() that sat here was host-side
	// pointless and inflated the CPU timing; all GPU work is already
	// synchronized by the event sync and the blocking cudaMemcpy above.)
	double diff = cpuSec() - begin; //count in seconds
	printf("\nCPU computation complete in %f ms\n", diff * 1000);
	printf("GPU computation complete in %f ms\n\n", time);
	printf(
			"HostArray is the pre initialized array. GPUArr is the resulting output array from the kernel. "
					"\n\nCPU sum: %d | GPUArr[0]=%d \nCPU sum: %d | GPUArr[1]=%d\n",
			sum, h_odata[0], sum, h_odata[1]);
	printf("\nResetting CUDA device \n");
	/*for(int i=0; i<128; ++i){
	 printf("GPUArr[%d]=%d\n",i,h_odata[i]);
	 }*/
	free(h_idata);
	free(h_odata);
	cudaDeviceReset();
	printf(".....finish\n");
	return 0;
}
// Report the most recent CUDA runtime error, if any. cudaGetLastError()
// both returns and clears the sticky error state, so calling this also
// resets it for subsequent checks.
void handleError() {
	cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess)
		return;
	printf("Error: %s\n", cudaGetErrorString(status));
}
// Wall-clock time in seconds (microsecond resolution), via gettimeofday.
// Used to time the CPU reference reduction.
double cpuSec() {
	struct timeval now;
	gettimeofday(&now, NULL);
	return (double) now.tv_sec + 1.e-6 * (double) now.tv_usec;
}
ReductionKernels.h
#include <stdio.h>
// Reduction kernels 1-5 (interleaved, bank-conflict, sequential, first-add,
// last-warp-unroll). Each expects dynamic shared memory of
// blockDim.x * sizeof(int) and writes one partial sum per block to g_odata.
__global__ void reduce0(int *g_idata, int*g_odata);
__global__ void reducekernel2(int *g_idata, int*g_odata);
__global__ void reducekernel3(int *g_idata, int*g_odata);
__global__ void reducekernel4(int *g_idata, int*g_odata);
__global__ void reducekernel5(int *g_idata, int*g_odata);
// Kernels 6 and 7 are templated on the block size so every reduction step
// resolves at compile time; being templates, their definitions live in this
// header rather than in ReductionKernels.cu.
template<unsigned int blockSize> __global__ void reducekernel6(int *g_idata,
int*g_odata);
template<unsigned int blockSize> __global__ void reducekernel7(int *g_idata,
int*g_odata, unsigned int n);
/*
 * Kernel 6 — completely unrolled tree reduction.
 * Launch: blockSize must equal blockDim.x; dynamic shared memory of
 * blockDim.x * sizeof(int); each block consumes 2*blockDim.x input
 * elements and writes one partial sum to g_odata[blockIdx.x].
 */
template<unsigned int blockSize>
__global__ void reducekernel6(int *g_idata, int *g_odata) {
	extern __shared__ int sdata[];
	// each thread loads two elements from global to shared mem
	// and performs the first step of the reduction
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
	sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
	__syncthreads();
	// Tree reduction; the template parameter lets the compiler drop the
	// steps that do not apply to this block size.
	if (blockSize >= 1024) {
		if (tid < 512) {
			sdata[tid] += sdata[tid + 512];
		}
		__syncthreads();
	}
	if (blockSize >= 512) {
		if (tid < 256) {
			sdata[tid] += sdata[tid + 256];
		}
		__syncthreads();
	}
	if (blockSize >= 256) {
		if (tid < 128) {
			sdata[tid] += sdata[tid + 128];
		}
		__syncthreads();
	}
	if (blockSize >= 128) {
		if (tid < 64) {
			sdata[tid] += sdata[tid + 64];
		}
		__syncthreads();
	}
	if (tid < 32) {
		// Final warp unroll: the shared-memory accesses must go through a
		// volatile pointer. Without volatile the compiler may keep
		// sdata[tid] cached in a register, so other lanes read stale
		// values and the block's partial sum is wrong.
		volatile int *vsmem = sdata;
		if (blockSize >= 64)
			vsmem[tid] += vsmem[tid + 32];
		if (blockSize >= 32)
			vsmem[tid] += vsmem[tid + 16];
		if (blockSize >= 16)
			vsmem[tid] += vsmem[tid + 8];
		if (blockSize >= 8)
			vsmem[tid] += vsmem[tid + 4];
		if (blockSize >= 4)
			vsmem[tid] += vsmem[tid + 2];
		if (blockSize >= 2)
			vsmem[tid] += vsmem[tid + 1];
	}
	//write result of this block to global memory
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
/*
 * Kernel 7 — multiple elements per thread via a grid-stride loop, then a
 * completely unrolled tree reduction.
 * Launch: blockSize must equal blockDim.x (max 512 supported here);
 * dynamic shared memory of blockDim.x * sizeof(int); n is the number of
 * valid input elements. One partial sum per block goes to
 * g_odata[blockIdx.x].
 */
template<unsigned int blockSize>
__global__ void reducekernel7(int *g_idata, int*g_odata, unsigned int n) {
	extern __shared__ int sdata[];
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x;
	unsigned int gridSize = blockSize * 2 * gridDim.x;
	sdata[tid] = 0;
	// Accumulate a grid-stride's worth of elements per thread. The second
	// load is guarded: when n is not a multiple of the stride, i + blockSize
	// could read past the end of the input.
	while (i < n) {
		sdata[tid] += g_idata[i];
		if (i + blockSize < n)
			sdata[tid] += g_idata[i + blockSize];
		i += gridSize;
	}
	__syncthreads();
	// Tree reduction, unrolled at compile time via the template parameter.
	if (blockSize >= 512) {
		if (tid < 256) {
			sdata[tid] += sdata[tid + 256];
		}
		__syncthreads();
	}
	if (blockSize >= 256) {
		if (tid < 128) {
			sdata[tid] += sdata[tid + 128];
		}
		__syncthreads();
	}
	if (blockSize >= 128) {
		if (tid < 64) {
			sdata[tid] += sdata[tid + 64];
		}
		__syncthreads();
	}
	if (tid < 32) {
		// Final warp unroll: must use a volatile pointer so each store is
		// written to shared memory and re-read, instead of being cached in
		// a register while other lanes see stale values.
		volatile int *vsmem = sdata;
		if (blockSize >= 64)
			vsmem[tid] += vsmem[tid + 32];
		if (blockSize >= 32)
			vsmem[tid] += vsmem[tid + 16];
		if (blockSize >= 16)
			vsmem[tid] += vsmem[tid + 8];
		if (blockSize >= 8)
			vsmem[tid] += vsmem[tid + 4];
		if (blockSize >= 4)
			vsmem[tid] += vsmem[tid + 2];
		if (blockSize >= 2)
			vsmem[tid] += vsmem[tid + 1];
	}
	//write result of this block to global memory
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
ReductionKernels.cu
#include "ReductionKernels.h"
/*
 * Kernel 1 — interleaved addressing with divergent branching.
 * Dynamic shared memory: blockDim.x * sizeof(int). Each block reduces
 * blockDim.x elements and writes its partial sum to g_odata[blockIdx.x].
 */
__global__ void reduce0(int *g_idata, int*g_odata) {
	extern __shared__ int sdata[];
	// Stage one element per thread into shared memory.
	unsigned int lane = threadIdx.x;
	unsigned int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
	sdata[lane] = g_idata[globalIdx];
	__syncthreads();
	// Interleaved tree reduction: at stride s, only threads whose index is
	// a multiple of 2*s are active — hence the divergence within warps.
	for (unsigned int stride = 1; stride < blockDim.x; stride <<= 1) {
		if ((lane & (2 * stride - 1)) == 0) {
			sdata[lane] += sdata[lane + stride];
		}
		__syncthreads();
	}
	// Thread 0 publishes this block's partial sum.
	if (lane == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
/*
 * Kernel 2 — interleaved addressing with a strided index. Removes the
 * divergent branch of kernel 1 but produces shared-memory bank conflicts.
 * Dynamic shared memory: blockDim.x * sizeof(int); one partial sum per
 * block is written to g_odata[blockIdx.x].
 */
__global__ void reducekernel2(int *g_idata, int*g_odata) {
	extern __shared__ int sdata[];
	// Stage one element per thread into shared memory.
	unsigned int tid = threadIdx.x;
	sdata[tid] = g_idata[blockDim.x * blockIdx.x + tid];
	__syncthreads();
	// Map consecutive thread ids onto strided pair starts (0, 2s, 4s, ...),
	// keeping active threads contiguous at the cost of bank conflicts.
	for (unsigned int stride = 1; stride < blockDim.x; stride <<= 1) {
		int pairStart = 2 * stride * tid;
		if (pairStart < blockDim.x) {
			sdata[pairStart] += sdata[pairStart + stride];
		}
		__syncthreads();
	}
	// Thread 0 publishes this block's partial sum.
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
/*
 * Kernel 3 — sequential addressing. Active threads stay packed at the low
 * end of the block, avoiding both divergence and bank conflicts.
 * Dynamic shared memory: blockDim.x * sizeof(int); one partial sum per
 * block is written to g_odata[blockIdx.x].
 */
__global__ void reducekernel3(int *g_idata, int*g_odata) {
	extern __shared__ int sdata[];
	// Stage one element per thread into shared memory.
	unsigned int tid = threadIdx.x;
	sdata[tid] = g_idata[blockDim.x * blockIdx.x + tid];
	__syncthreads();
	// Halve the active range each step; thread t folds in its mirror.
	unsigned int active = blockDim.x >> 1;
	while (active > 0) {
		if (tid < active) {
			sdata[tid] += sdata[tid + active];
		}
		__syncthreads();
		active >>= 1;
	}
	// Thread 0 publishes this block's partial sum.
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
/*
 * Kernel 4 — first add during global load. Each thread fetches two input
 * elements and sums them on the way into shared memory, so a block covers
 * 2 * blockDim.x elements. Dynamic shared memory: blockDim.x * sizeof(int);
 * one partial sum per block is written to g_odata[blockIdx.x].
 */
__global__ void reducekernel4(int *g_idata, int*g_odata) {
	extern __shared__ int sdata[];
	// Load two elements per thread and do the first addition immediately.
	unsigned int tid = threadIdx.x;
	unsigned int base = 2 * blockDim.x * blockIdx.x + tid;
	sdata[tid] = g_idata[base] + g_idata[base + blockDim.x];
	__syncthreads();
	// Sequential-addressing tree reduction over the staged values.
	for (unsigned int active = blockDim.x >> 1; active != 0; active >>= 1) {
		if (tid < active) {
			sdata[tid] += sdata[tid + active];
		}
		__syncthreads();
	}
	// Thread 0 publishes this block's partial sum.
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
/*
 * Kernel 5 — first add during load, with the last warp unrolled.
 * Requires blockDim.x >= 64 (the unrolled tail reads sdata[tid + 32]).
 * Dynamic shared memory: blockDim.x * sizeof(int); each block consumes
 * 2 * blockDim.x elements and writes one partial sum to
 * g_odata[blockIdx.x].
 */
__global__ void reducekernel5(int *g_idata, int *g_odata) {
	extern __shared__ int sdata[];
	// each thread loads two elements from global to shared mem
	// and performs the first step of the reduction
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
	sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
	__syncthreads();
	// do reduction in shared mem down to 64 remaining values
	for (unsigned int s = (blockDim.x>>1); s > 32; s >>= 1) {
		if (tid < s)
			sdata[tid] += sdata[tid + s];
		__syncthreads();
	}
	if (tid < 32){
		// Unrolled last warp: the accesses must go through a volatile
		// pointer so every intermediate store is actually written to and
		// re-read from shared memory; without volatile the compiler may
		// keep sdata[tid] in a register and the lanes see stale values.
		volatile int *vsmem = sdata;
		vsmem[tid] += vsmem[tid + 32];
		vsmem[tid] += vsmem[tid + 16];
		vsmem[tid] += vsmem[tid + 8];
		vsmem[tid] += vsmem[tid + 4];
		vsmem[tid] += vsmem[tid + 2];
		vsmem[tid] += vsmem[tid + 1];
	}
	//write result of this block to global memory
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}