CUDA Parallel Reduction (Addition Example)

Posted: 2016-03-15 19:47:58

Tags: c++ c cuda nvidia

I am fairly new to CUDA programming and have been trying to implement the parallel reduction optimization examples from Dr. Harris's slides (Link to slides), and so far I have been successful (?).

My problem is that I realized that when I launch my kernels, inside the loop the kernel somehow runs with the first grid size (total elements / threads) twice in a row before it changes. This leads to the following issues:

  1. The times I get are far longer than expected (44 ms for the first kernel, where I expected something closer to 22 ms), compared with a CPU computation time of about 18 ms.

  2. I never end up launching the kernel with 1 block.

  3. Even so, I get an exact match between the GPU and CPU sums, which tells me the computation is being done correctly.

I have since changed the configuration for kernels 1 and 2 (for testing) to a new one that uses the remainder to launch the kernel one last time with 1 block.

My problem is that although GPUArray[0] == CPUsum, which is what I expect, with this configuration I now see actual numbers in the rest of GPUArray (everywhere other than index 0) where I would expect to see 0, which did not happen with the previous configuration.

Here is a sample of the output from running kernel 2 with the new configuration:

    MENU
    
    Kernel 1 - Interleaved addressing with divergent branching
    Kernel 2 - Interleaved addressing with bank conflicts 
    Kernel 3 - Sequential addressing 
    Kernel 4 - First add during global load 
    Kernel 5 - Unroll last warp
    Kernel 6 - Complete Unroll
    Kernel 7 - Multiple elements per thread
    
    Command: 2
    
    Running Kernel 2... 
    
    Blocks:32768...
    Blocks:256... Remainder:0
    Blocks:2... Remainder:0
    Blocks:0... Remainder:2
    CPU computation complete in 18.886805 ms
    GPU computation complete in 19.395136 ms
    
    HostArray is the pre initialized array. GPUArr is the resulting output array from the kernel. 
    
    CPU sum: 99682546 | GPUArr[0]=99682546  
    CPU sum: 99682546 | GPUArr[1]=-1844595429
    
    Resetting CUDA device 
    .....finish
    

My question is why this happens and how I can fix it. I am also curious why it actually shows the same result as the CPU computation.
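For reference, the grid-size arithmetic that the loop for kernels 1 and 2 goes through can be reproduced in isolation. The following is a minimal host-only sketch (my addition, not part of the original program) that just mirrors the division/modulo steps and the Blocks/Remainder lines shown above, for 1 << 22 elements and 128 threads per block:

    #include <stdio.h>

    int main(void) {
        int size = 1 << 22;                         // 4194304 elements, as in reduction.cu
        int threadsPerBlock = 128;
        int blocksPerGrid = size / threadsPerBlock; // 32768
        int remainder = 0;

        printf("Blocks:%d...\n", blocksPerGrid);
        while (blocksPerGrid != 0) {
            // the reduction kernel would be launched here with blocksPerGrid blocks
            remainder = blocksPerGrid % threadsPerBlock;
            blocksPerGrid = blocksPerGrid / threadsPerBlock;
            printf("Blocks:%d... Remainder:%d\n", blocksPerGrid, remainder);
        }
        // prints 32768, then 256 (rem 0), 2 (rem 0), 0 (rem 2), matching the run above
        return 0;
    }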

Below is the complete, compilable code, consisting of 3 files (reduction.cu, ReductionKernels.cu and ReductionKernels.h). Kernels 1 and 2 use the new launch configuration; kernel 3 onwards use the old launch for comparison and reference. I am using a Quadro K600 card with 128 threads per block, running in the Nsight Eclipse edition.
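In case someone wants to build it outside of Nsight, a command line along these lines should work (this is my assumption, not part of the original setup; -arch=sm_30 matches the Quadro K600's compute capability 3.0):

    nvcc -arch=sm_30 reduction.cu ReductionKernels.cu -o reduction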

reduction.cu

    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    #include <math.h>
    #include <sys/time.h>
    
    #include "ReductionKernels.h"
    
    #define HANDLE_ERROR handleError();
    
    //extern "C"
    
    void handleError();
    double cpuSec();
    
    int main(int argc, char** argv) {
    
        int total_elements = 1 << 22;
        size_t datasize = total_elements * sizeof(int);
        int size = total_elements;
    
        // Allocate the input data
        int *h_idata = (int *) malloc(datasize);
        int *h_odata = (int *) malloc(datasize); //holds the output data
    
        for (int i = 0; i < total_elements; ++i) {
            h_idata[i] = rand() % 10242024;
        }
    
        //allocate space on the device
        int *g_idata = NULL;
        cudaMalloc((void **) &g_idata, datasize);
    
        int *g_odata = NULL;
        cudaMalloc((void **) &g_odata, datasize);
    
        int *temp = NULL;
        cudaMalloc((void **) &temp, datasize);
    
        cudaMemcpy(g_idata, h_idata, datasize, cudaMemcpyHostToDevice);
        cudaMemcpy(g_odata, h_odata, datasize, cudaMemcpyHostToDevice);
    
        cudaEvent_t start, stop;
        float time;
        //Create two events. Each will record the time
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    
        //call kernel and time
        int threadsPerBlock = 128;
        int blocksPerGrid;
        int remainder;
        int c;
    
        // print instructions
        printf("\nMENU\n\n"
                "Kernel 1 - Interleaved addressing with divergent branching\n"
                "Kernel 2 - Interleaved addressing with bank conflicts \n"
                "Kernel 3 - Sequential addressing \n"
                "Kernel 4 - First add during global load \n"
                "Kernel 5 - Unroll last warp\n"
                "Kernel 6 - Complete Unroll\n"
                "Kernel 7 - Multiple elements per thread\n\n");
        // get command
        printf("Command: ");
        fflush(stdin);
        scanf(" %d", &c);
    
        cudaEventRecord(start, 0);
        switch (c) {
        case 1:
            printf("\nRunning Kernel 1... \n");
            blocksPerGrid = size / threadsPerBlock;
            while (blocksPerGrid != 0) {
                reduce0<<<blocksPerGrid, threadsPerBlock,
                        threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
    
                temp = g_odata;
                g_odata = g_idata;
                g_idata = temp;
                remainder = blocksPerGrid % threadsPerBlock;
                blocksPerGrid = blocksPerGrid / threadsPerBlock;
                printf("\nBlocks:%d... Remainder:%d", blocksPerGrid, remainder);
            }
            reduce0<<<1, remainder, remainder * sizeof(int)>>>(g_idata, g_odata);
            break;
        case 2:
            printf("\nRunning Kernel 2... \n");
            blocksPerGrid = size / threadsPerBlock;
            printf("\nBlocks:%d...", blocksPerGrid);
            while (blocksPerGrid != 0) {
                reducekernel2<<<blocksPerGrid, threadsPerBlock,
                        threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
    
                temp = g_odata;
                g_odata = g_idata;
                g_idata = temp;
                remainder = blocksPerGrid % threadsPerBlock;
                blocksPerGrid = blocksPerGrid / threadsPerBlock;
                printf("\nBlocks:%d... Remainder:%d", blocksPerGrid, remainder);
            }
            reducekernel2<<<1, remainder, remainder * sizeof(int)>>>(g_idata,
                    g_odata);
            break;
        case 3:
            printf("\nRunning Kernel 3... \n");
            blocksPerGrid = size / threadsPerBlock;
            while (blocksPerGrid > 1) {
                if (size < threadsPerBlock) {
                    threadsPerBlock = size;
                }
                reducekernel3<<<blocksPerGrid, threadsPerBlock,
                        threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                g_idata = g_odata;
                blocksPerGrid = size / threadsPerBlock;
                size = blocksPerGrid;
            }
            break;
        case 4:
            printf("\nRunning Kernel 4... \n");
            blocksPerGrid = size / (threadsPerBlock * 2);
            while (blocksPerGrid > 1) {
                if (size < threadsPerBlock) {
                    threadsPerBlock = size;
                }
                reducekernel4<<<blocksPerGrid, threadsPerBlock,
                        threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                g_idata = g_odata;
                blocksPerGrid = size / (threadsPerBlock * 2);
                size = blocksPerGrid;
            }
            break;
        case 5:
            printf("\nRunning Kernel 5... \n");
            blocksPerGrid = size / (threadsPerBlock * 2);
            size = blocksPerGrid;
            printf("\nBlocks:%d... \n", blocksPerGrid);
            while (blocksPerGrid > 0) {
                if (size < threadsPerBlock) {
                    threadsPerBlock = size;
                }
                reducekernel5<<<blocksPerGrid, threadsPerBlock,
                        threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                g_idata = g_odata;
    
                blocksPerGrid = size / (threadsPerBlock * 2);
                size = blocksPerGrid;
                printf("\nBlocks:%d... \n", blocksPerGrid);
            }
            break;
        case 6:
            printf("\nRunning Kernel 6... \n");
            blocksPerGrid = size / (threadsPerBlock * 2);
            while (blocksPerGrid > 1) {
                if (size < threadsPerBlock) {
                    threadsPerBlock = size;
                }
                switch (threadsPerBlock) {
                case 512:
                    reducekernel6<512> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                    break;
                case 256:
                    reducekernel6<256> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                    break;
                case 128:
                    reducekernel6<128> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                    break;
                case 64:
                    reducekernel6<64> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                    break;
                case 32:
                    reducekernel6<32> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                    break;
                case 16:
                    reducekernel6<16> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                    break;
                case 8:
                    reducekernel6<8> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                    break;
                case 4:
                    reducekernel6<4> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                    break;
                case 2:
                    reducekernel6<2> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                    break;
                case 1:
                    reducekernel6<1> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
                    break;
                }
                g_idata = g_odata;
                blocksPerGrid = size / (threadsPerBlock * 2);
                size = blocksPerGrid;
            }
            break;
        case 7:
            printf("\nRunning Kernel 7... \n");
            blocksPerGrid = (size / (threadsPerBlock * 2));
    
            while ((blocksPerGrid > 64) && (threadsPerBlock >= 128)) { //limit to 64 blocks
                blocksPerGrid = (blocksPerGrid / (threadsPerBlock * 2));
            }
    
            while (blocksPerGrid > 1) {
                if (size < threadsPerBlock) {
                    threadsPerBlock = size;
                }
                switch (threadsPerBlock) {
                case 512:
                    reducekernel7<512> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
                            size);
                    break;
                case 256:
                    reducekernel7<256> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
                            size);
                    break;
                case 128:
                    reducekernel7<128> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
                            size);
                    break;
                case 64:
                    reducekernel7<64> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
                            size);
                    break;
                case 32:
                    reducekernel7<32> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
                            size);
                    break;
                case 16:
                    reducekernel7<16> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
                            size);
                    break;
                case 8:
                    reducekernel7<8> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
                            size);
                    break;
                case 4:
                    reducekernel7<4> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
                            size);
                    break;
                case 2:
                    reducekernel7<2> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
                            size);
                    break;
                case 1:
                    reducekernel7<1> <<<blocksPerGrid, threadsPerBlock,
                            threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
                            size);
                    break;
                }
                g_idata = g_odata;
                blocksPerGrid = size / (threadsPerBlock * 2);
                size = blocksPerGrid;
            }
            break;
        }
    
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
    
        cudaEventElapsedTime(&time, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    
        //copy data from device to host
        cudaMemcpy(h_odata, g_odata, datasize, cudaMemcpyDeviceToHost);
    
        //cuda free
        cudaFree(g_odata);
        cudaFree(g_idata);
    
        int sum = 0;
        double begin = cpuSec();
        for (int j = 0; j < total_elements; ++j) {
            sum += h_idata[j]; //sum of pre init array
        }
        cudaThreadSynchronize();
        double diff = cpuSec() - begin; //count in seconds
    
        printf("\nCPU computation complete in %f ms\n", diff * 1000);
    
        printf("GPU computation complete in %f ms\n\n", time);
        printf(
                "HostArray is the pre initialized array. GPUArr is the resulting output array from the kernel. "
                        "\n\nCPU sum: %d | GPUArr[0]=%d  \nCPU sum: %d | GPUArr[1]=%d\n",
                sum, h_odata[0], sum, h_odata[1]);
        printf("\nResetting CUDA device \n");
    
        /*for(int i=0; i<128; ++i){
         printf("GPUArr[%d]=%d\n",i,h_odata[i]);
         }*/
    
        cudaDeviceReset();
    
        printf(".....finish\n");
    
        return 0;
    }
    
    void handleError() {
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("Error: %s\n", cudaGetErrorString(err));
        }
    }
    
    double cpuSec() {
        struct timeval tp;
        gettimeofday(&tp, NULL);
        return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6);
    }
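A side note on the listing above: handleError() is wrapped in the HANDLE_ERROR macro near the top of reduction.cu but never actually called. Purely as an illustration (my addition, not part of the original program), a launch inside main() could be checked like this; the first check reports launch/configuration errors, the second reports errors raised while the kernel was executing:

    reduce0<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
    HANDLE_ERROR                  // cudaGetLastError: launch/configuration errors
    cudaDeviceSynchronize();      // wait for the kernel to finish
    HANDLE_ERROR                  // errors that occurred during kernel execution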
    

ReductionKernels.h

    #include <stdio.h>
    
    __global__ void reduce0(int *g_idata, int*g_odata);
    __global__ void reducekernel2(int *g_idata, int*g_odata);
    __global__ void reducekernel3(int *g_idata, int*g_odata);
    __global__ void reducekernel4(int *g_idata, int*g_odata);
    __global__ void reducekernel5(int *g_idata, int*g_odata);
    template<unsigned int blockSize> __global__ void reducekernel6(int *g_idata,
            int*g_odata);
    template<unsigned int blockSize> __global__ void reducekernel7(int *g_idata,
            int*g_odata, unsigned int n);
    
    template<unsigned int blockSize>
    __global__ void reducekernel6(int *g_idata, int *g_odata) {
        extern __shared__ int sdata[];
    
        // each thread loads two elements from global to shared mem
        // and performs the first step of the reduction
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
        sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
        __syncthreads();
    
        if (blockSize >= 1024) {
            if (tid < 512) {
                sdata[tid] += sdata[tid + 512];
            }
            __syncthreads();
        }
        if (blockSize >= 512) {
            if (tid < 256) {
                sdata[tid] += sdata[tid + 256];
            }
            __syncthreads();
        }
        if (blockSize >= 256) {
            if (tid < 128) {
                sdata[tid] += sdata[tid + 128];
            }
            __syncthreads();
        }
        if (blockSize >= 128) {
            if (tid < 64) {
                sdata[tid] += sdata[tid + 64];
            }
            __syncthreads();
        }
        if (tid < 32) {
            if (blockSize >= 64)
                sdata[tid] += sdata[tid + 32];
            if (blockSize >= 32)
                sdata[tid] += sdata[tid + 16];
            if (blockSize >= 16)
                sdata[tid] += sdata[tid + 8];
            if (blockSize >= 8)
                sdata[tid] += sdata[tid + 4];
            if (blockSize >= 4)
                sdata[tid] += sdata[tid + 2];
            if (blockSize >= 2)
                sdata[tid] += sdata[tid + 1];
        }
        //write result of this block to global memory
        if (tid == 0) {
            g_odata[blockIdx.x] = sdata[0];
        }
    }
    
    template<unsigned int blockSize>
    __global__ void reducekernel7(int *g_idata, int*g_odata, unsigned int n) {
        extern __shared__ int sdata[];
    
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x;
        unsigned int gridSize = blockSize * 2 * gridDim.x;
        sdata[tid] = 0;
        while (i < n) {
            sdata[tid] += g_idata[i] + g_idata[i + blockSize];
            i += gridSize;
        }
        __syncthreads();
    
        if (blockSize >= 512) {
            if (tid < 256) {
                sdata[tid] += sdata[tid + 256];
            }
            __syncthreads();
        }
        if (blockSize >= 256) {
            if (tid < 128) {
                sdata[tid] += sdata[tid + 128];
            }
            __syncthreads();
        }
        if (blockSize >= 128) {
            if (tid < 64) {
                sdata[tid] += sdata[tid + 64];
            }
            __syncthreads();
        }
        if (tid < 32) {
            if (blockSize >= 64)
                sdata[tid] += sdata[tid + 32];
            if (blockSize >= 32)
                sdata[tid] += sdata[tid + 16];
            if (blockSize >= 16)
                sdata[tid] += sdata[tid + 8];
            if (blockSize >= 8)
                sdata[tid] += sdata[tid + 4];
            if (blockSize >= 4)
                sdata[tid] += sdata[tid + 2];
            if (blockSize >= 2)
                sdata[tid] += sdata[tid + 1];
        }
        //write result of this block to global memory
        if (tid == 0) {
            g_odata[blockIdx.x] = sdata[0];
        }
    }
    

ReductionKernels.cu

    #include "ReductionKernels.h"
    
    
    __global__ void reduce0(int *g_idata, int*g_odata) {
        extern __shared__ int sdata[];
    
        //each thread loads one element from global to shared memory
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        sdata[tid] = g_idata[i];
        __syncthreads();
    
        //do reduction in shared memory
        for (unsigned int s = 1; s < blockDim.x; s *= 2) {
            if ((tid & (2 * s - 1)) == 0) {
                sdata[tid] += sdata[tid + s];
            }
            __syncthreads();
        }
    
        //write result of this block to global memory
        if (tid == 0) {
            g_odata[blockIdx.x] = sdata[0];
        }
    
    }
    
    __global__ void reducekernel2(int *g_idata, int*g_odata) {
        extern __shared__ int sdata[];
    
        //each thread loads one element from global to shared memory
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        sdata[tid] = g_idata[i];
        __syncthreads();
    
        // do reduction in shared mem
        for (unsigned int s = 1; s < blockDim.x; s *= 2) {
            int index = 2 * s * tid;
    
            if (index < blockDim.x) {
                sdata[index] += sdata[index + s];
            }
            __syncthreads();
        }
    
        //write result of this block to global memory
        if (tid == 0) {
            g_odata[blockIdx.x] = sdata[0];
        }
    
    }
    
    __global__ void reducekernel3(int *g_idata, int*g_odata) {
        extern __shared__ int sdata[];
    
        //each thread loads one element from global to shared memory
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        sdata[tid] = g_idata[i];
        __syncthreads();
    
        // do reduction in shared mem
        for (unsigned int s = (blockDim.x>>1); s > 0; s >>= 1) {
    
            if (tid < s) {
                sdata[tid] += sdata[tid + s];
            }
            __syncthreads();
        }
    
        //write result of this block to global memory
        if (tid == 0) {
            g_odata[blockIdx.x] = sdata[0];
        }
    
    }
    
    __global__ void reducekernel4(int *g_idata, int*g_odata) {
        extern __shared__ int sdata[];
    
        // each thread loads two elements from global to shared mem
        // and performs the first step of the reduction
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
        sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
        __syncthreads();
    
        // do reduction in shared mem
        for (unsigned int s = (blockDim.x>>1); s > 0; s >>= 1) {
    
            if (tid < s) {
                sdata[tid] += sdata[tid + s];
            }
            __syncthreads();
        }
    
        //write result of this block to global memory
        if (tid == 0) {
            g_odata[blockIdx.x] = sdata[0];
        }
    }
    
    __global__ void reducekernel5(int *g_idata, int *g_odata) {
        extern __shared__ int sdata[];
    
        // each thread loads two elements from global to shared mem
        // and performs the first step of the reduction
        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
        sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
        __syncthreads();
    
        // do reduction in shared mem
        for (unsigned int s = (blockDim.x>>1); s > 32; s >>= 1) {
            if (tid < s)
                sdata[tid] += sdata[tid + s];
            __syncthreads();
        }
        if (tid < 32){
            sdata[tid] += sdata[tid + 32];
            sdata[tid] += sdata[tid + 16];
            sdata[tid] += sdata[tid + 8];
            sdata[tid] += sdata[tid + 4];
            sdata[tid] += sdata[tid + 2];
            sdata[tid] += sdata[tid + 1];
        }
    
        //write result of this block to global memory
        if (tid == 0) {
            g_odata[blockIdx.x] = sdata[0];
        }
    }
    

0 Answers:

No answers yet.