我对CUDA编程很陌生,并且一直在尝试实施Harris博士的并行简化优化示例(Link to slides),并且迄今为止已经成功(?)。
我的问题是:我注意到在循环中启动内核时,内核在网格大小改变之前,以第一个网格大小(总元素数 / 每块线程数)连续运行了两次。这导致了两个问题:
我得到的时间远远超过预期:第一个内核耗时44毫秒,而预计应接近22毫秒;相比之下,CPU的计算时间大约为18毫秒。
我从未使用1个块启动内核。
然而,我得到了GPU和CPU之和的精确匹配,从而告诉我计算已经正确完成。
我已经把内核1和2(用于测试)的启动配置改为新配置:最后一次启动内核时只使用1个块,线程数取自上一轮循环的余数。
我的问题是:虽然 GPUArray[0] = CPUsum 符合我的预期,但这次当我查看 GPUArray 中索引0以外的其余元素时,得到的仍然是实际的数字(而我期望看到0),这在之前的配置下没有发生过。
以下是使用新配置运行内核2的结果示例:
MENU
Kernel 1 - Interleaved addressing with divergent branching
Kernel 2 - Interleaved addressing with bank conflicts
Kernel 3 - Sequential addressing
Kernel 4 - First add during global load
Kernel 5 - Unroll last warp
Kernel 6 - Complete Unroll
Kernel 7 - Multiple elements per thread
Command: 2
Running Kernel 2...
Blocks:32768...
Blocks:256... Remainder:0
Blocks:2... Remainder:0
Blocks:0... Remainder:2
CPU computation complete in 18.886805 ms
GPU computation complete in 19.395136 ms
HostArray is the pre initialized array. GPUArr is the resulting output array from the kernel.
CPU sum: 99682546 | GPUArr[0]=99682546
CPU sum: 99682546 | GPUArr[1]=-1844595429
Resetting CUDA device
.....finish
我的问题是为什么会发生这种情况,我该如何解决?我很好奇为什么它实际上显示与CPU计算相同的结果。
这是一个完整的可编译代码,共包含3个文件(reduction.cu,ReductionKernels.cu和ReductionKernels.h)。内核1和2使用新的启动配置完成,3以后使用旧启动进行比较和参考。我正在使用Quadro k600卡并使用128个线程。在Nsight eclipse版本中运行。
reduction.cu
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include "ReductionKernels.h"
// Error-check macro: expands to a call of handleError() below. Note the
// macro body already contains the trailing semicolon, so call sites use
// HANDLE_ERROR without one.
#define HANDLE_ERROR handleError();
//extern "C"
// Prints the most recent CUDA runtime error, if any (defined after main).
void handleError();
// Wall-clock timestamp in seconds; used to time the CPU reference sum.
double cpuSec();
/*
 * Host driver: fills an array of 2^22 random ints, runs one of seven GPU
 * reduction kernels (selected interactively), and compares the GPU result
 * against a CPU reference sum. Timing: GPU via CUDA events, CPU via cpuSec().
 */
int main(int argc, char** argv) {
	int total_elements = 1 << 22;
	size_t datasize = total_elements * sizeof(int);
	int size = total_elements;
	// Allocate the input data. h_odata is zero-initialized (calloc) because
	// it is copied to the device below and partially read back afterwards;
	// copying uninitialized malloc'd bytes to the GPU made GPUArr[1..] show
	// arbitrary garbage.
	int *h_idata = (int *) malloc(datasize);
	int *h_odata = (int *) calloc(total_elements, sizeof(int)); //holds the output data
	if (h_idata == NULL || h_odata == NULL) {
		printf("Host allocation failed\n");
		return 1;
	}
	for (int i = 0; i < total_elements; ++i) {
		h_idata[i] = rand() % 10242024;
	}
	//allocate space on the device
	int *g_idata = NULL;
	cudaMalloc((void **) &g_idata, datasize);
	int *g_odata = NULL;
	cudaMalloc((void **) &g_odata, datasize);
	HANDLE_ERROR
	// Remember the original allocations: the ping-pong swaps below alias
	// g_idata/g_odata, so freeing the swapped pointers at the end could
	// double-free one buffer and leak the other.
	int *d_alloc_a = g_idata;
	int *d_alloc_b = g_odata;
	// temp is only a swap variable. The previous cudaMalloc into temp leaked
	// a full-size buffer the moment the first pointer swap overwrote it.
	int *temp = NULL;
	cudaMemcpy(g_idata, h_idata, datasize, cudaMemcpyHostToDevice);
	cudaMemcpy(g_odata, h_odata, datasize, cudaMemcpyHostToDevice);
	HANDLE_ERROR
	cudaEvent_t start, stop;
	float time;
	//Create two events. Each will record the time
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	//call kernel and time
	int threadsPerBlock = 128;
	int blocksPerGrid;
	// Initialized to 0: the final 1-block launches in cases 1 and 2 read it,
	// and it would be indeterminate if the preceding loop never executed.
	int remainder = 0;
	int c;
	// print instructions
	printf("\nMENU\n\n"
			"Kernel 1 - Interleaved addressing with divergent branching\n"
			"Kernel 2 - Interleaved addressing with bank conflicts \n"
			"Kernel 3 - Sequential addressing \n"
			"Kernel 4 - First add during global load \n"
			"Kernel 5 - Unroll last warp\n"
			"Kernel 6 - Complete Unroll\n"
			"Kernel 7 - Multiple elements per thread\n\n");
	// get command. fflush(stdin) is undefined behavior in C, so the input is
	// validated instead of flushed.
	printf("Command: ");
	if (scanf(" %d", &c) != 1) {
		printf("Invalid command\n");
		return 1;
	}
	cudaEventRecord(start, 0);
	switch (c) {
	case 1:
		printf("\nRunning Kernel 1... \n");
		blocksPerGrid = size / threadsPerBlock;
		while (blocksPerGrid != 0) {
			reduce0<<<blocksPerGrid, threadsPerBlock,
					threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
			// Ping-pong: this pass's output becomes the next pass's input.
			temp = g_odata;
			g_odata = g_idata;
			g_idata = temp;
			remainder = blocksPerGrid % threadsPerBlock;
			blocksPerGrid = blocksPerGrid / threadsPerBlock;
			printf("\nBlocks:%d... Remainder:%d", blocksPerGrid, remainder);
		}
		// Final partial block of `remainder` threads; guarded so we never
		// launch with 0 threads.
		if (remainder > 0) {
			reduce0<<<1, remainder, remainder * sizeof(int)>>>(g_idata, g_odata);
		}
		break;
	case 2:
		printf("\nRunning Kernel 2... \n");
		blocksPerGrid = size / threadsPerBlock;
		printf("\nBlocks:%d...", blocksPerGrid);
		while (blocksPerGrid != 0) {
			reducekernel2<<<blocksPerGrid, threadsPerBlock,
					threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
			// Ping-pong: this pass's output becomes the next pass's input.
			temp = g_odata;
			g_odata = g_idata;
			g_idata = temp;
			remainder = blocksPerGrid % threadsPerBlock;
			blocksPerGrid = blocksPerGrid / threadsPerBlock;
			printf("\nBlocks:%d... Remainder:%d", blocksPerGrid, remainder);
		}
		if (remainder > 0) {
			reducekernel2<<<1, remainder, remainder * sizeof(int)>>>(g_idata,
					g_odata);
		}
		break;
	case 3:
		printf("\nRunning Kernel 3... \n");
		blocksPerGrid = size / threadsPerBlock;
		// NOTE(review): blocksPerGrid is recomputed from the OLD `size`
		// after each launch, so the first grid size is launched twice —
		// this is the duplicated launch the timing question is about.
		// NOTE(review): g_idata = g_odata makes later passes reduce their
		// own output in place; blocks of one launch then read locations
		// other blocks are writing (no inter-block ordering). A buffer
		// ping-pong as in cases 1/2 would be safer — verify before reuse.
		while (blocksPerGrid > 1) {
			if (size < threadsPerBlock) {
				threadsPerBlock = size;
			}
			reducekernel3<<<blocksPerGrid, threadsPerBlock,
					threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
			g_idata = g_odata;
			blocksPerGrid = size / threadsPerBlock;
			size = blocksPerGrid;
		}
		break;
	case 4:
		printf("\nRunning Kernel 4... \n");
		// Each thread loads two elements, so the grid is halved.
		blocksPerGrid = size / (threadsPerBlock * 2);
		// NOTE(review): same stale-size / in-place-aliasing concerns as
		// case 3 above.
		while (blocksPerGrid > 1) {
			if (size < threadsPerBlock) {
				threadsPerBlock = size;
			}
			reducekernel4<<<blocksPerGrid, threadsPerBlock,
					threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
			g_idata = g_odata;
			blocksPerGrid = size / (threadsPerBlock * 2);
			size = blocksPerGrid;
		}
		break;
	case 5:
		printf("\nRunning Kernel 5... \n");
		blocksPerGrid = size / (threadsPerBlock * 2);
		size = blocksPerGrid;
		printf("\nBlocks:%d... \n", blocksPerGrid);
		// NOTE(review): `size` is set to the block count before the loop,
		// so later grid sizes are derived from partial counts that do not
		// match the data actually produced; kernel 5 also requires
		// blockDim >= 64 for its unrolled last warp — verify the tail.
		while (blocksPerGrid > 0) {
			if (size < threadsPerBlock) {
				threadsPerBlock = size;
			}
			reducekernel5<<<blocksPerGrid, threadsPerBlock,
					threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
			g_idata = g_odata;
			blocksPerGrid = size / (threadsPerBlock * 2);
			size = blocksPerGrid;
			printf("\nBlocks:%d... \n", blocksPerGrid);
		}
		break;
	case 6:
		printf("\nRunning Kernel 6... \n");
		blocksPerGrid = size / (threadsPerBlock * 2);
		// The inner switch maps the runtime block size onto the compile-time
		// template parameter the fully-unrolled kernel needs.
		// NOTE(review): same stale-size / in-place-aliasing concerns as
		// case 3 above.
		while (blocksPerGrid > 1) {
			if (size < threadsPerBlock) {
				threadsPerBlock = size;
			}
			switch (threadsPerBlock) {
			case 512:
				reducekernel6<512> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 256:
				reducekernel6<256> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 128:
				reducekernel6<128> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 64:
				reducekernel6<64> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 32:
				reducekernel6<32> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 16:
				reducekernel6<16> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 8:
				reducekernel6<8> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 4:
				reducekernel6<4> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 2:
				reducekernel6<2> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			case 1:
				reducekernel6<1> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata);
				break;
			}
			g_idata = g_odata;
			blocksPerGrid = size / (threadsPerBlock * 2);
			size = blocksPerGrid;
		}
		break;
	case 7:
		printf("\nRunning Kernel 7... \n");
		blocksPerGrid = (size / (threadsPerBlock * 2));
		// Shrink the grid so each thread handles several elements.
		// NOTE(review): this divides blocksPerGrid (not size) repeatedly and
		// the "limit to 64 blocks" comment does not match the loop's effect
		// for all inputs — confirm against the intended launch shape.
		while ((blocksPerGrid > 64) && (threadsPerBlock >= 128)) { //limit to 64 blocks
			blocksPerGrid = (blocksPerGrid / (threadsPerBlock * 2));
		}
		// NOTE(review): same stale-size / in-place-aliasing concerns as
		// case 3 above.
		while (blocksPerGrid > 1) {
			if (size < threadsPerBlock) {
				threadsPerBlock = size;
			}
			switch (threadsPerBlock) {
			case 512:
				reducekernel7<512> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 256:
				reducekernel7<256> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 128:
				reducekernel7<128> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 64:
				reducekernel7<64> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 32:
				reducekernel7<32> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 16:
				reducekernel7<16> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 8:
				reducekernel7<8> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 4:
				reducekernel7<4> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 2:
				reducekernel7<2> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			case 1:
				reducekernel7<1> <<<blocksPerGrid, threadsPerBlock,
						threadsPerBlock * sizeof(int)>>>(g_idata, g_odata,
						size);
				break;
			}
			g_idata = g_odata;
			blocksPerGrid = size / (threadsPerBlock * 2);
			size = blocksPerGrid;
		}
		break;
	}
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	// Surface any launch/execution error from the kernels above.
	HANDLE_ERROR
	// copy data from device to host. Only element 0 holds the final sum; the
	// rest of the buffer contains stale partial sums from earlier passes —
	// that is why GPUArr[1] shows a "real" number, not 0.
	cudaMemcpy(h_odata, g_odata, datasize, cudaMemcpyDeviceToHost);
	HANDLE_ERROR
	// cuda free — the original allocations, not the swapped aliases
	cudaFree(d_alloc_a);
	cudaFree(d_alloc_b);
	// Must start at 0: it was previously uninitialized and accumulated with
	// +=. Kept as int deliberately so any wraparound matches the kernels'
	// int arithmetic.
	int sum = 0;
	double begin = cpuSec();
	for (int j = 0; j < total_elements; ++j) {
		sum += h_idata[j]; //sum of pre init array
	}
	// (The deprecated cudaThreadSynchronize() that sat here was host-side
	// pointless and inflated the CPU timing; all GPU work is already
	// synchronized by the event sync and the blocking cudaMemcpy above.)
	double diff = cpuSec() - begin; //count in seconds
	printf("\nCPU computation complete in %f ms\n", diff * 1000);
	printf("GPU computation complete in %f ms\n\n", time);
	printf(
			"HostArray is the pre initialized array. GPUArr is the resulting output array from the kernel. "
					"\n\nCPU sum: %d | GPUArr[0]=%d \nCPU sum: %d | GPUArr[1]=%d\n",
			sum, h_odata[0], sum, h_odata[1]);
	printf("\nResetting CUDA device \n");
	/*for(int i=0; i<128; ++i){
	 printf("GPUArr[%d]=%d\n",i,h_odata[i]);
	 }*/
	free(h_idata);
	free(h_odata);
	cudaDeviceReset();
	printf(".....finish\n");
	return 0;
}
// Report the most recent CUDA runtime error, if any. cudaGetLastError()
// both returns and clears the sticky error state, so calling this also
// resets it for subsequent checks.
void handleError() {
	cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess)
		return;
	printf("Error: %s\n", cudaGetErrorString(status));
}
// Wall-clock time in seconds (microsecond resolution), via gettimeofday.
// Used to time the CPU reference reduction.
double cpuSec() {
	struct timeval now;
	gettimeofday(&now, NULL);
	return (double) now.tv_sec + 1.e-6 * (double) now.tv_usec;
}
ReductionKernels.h
#include <stdio.h>
// Reduction kernels 1-5 (interleaved, bank-conflict, sequential, first-add,
// last-warp-unroll). Each expects dynamic shared memory of
// blockDim.x * sizeof(int) and writes one partial sum per block to g_odata.
__global__ void reduce0(int *g_idata, int*g_odata);
__global__ void reducekernel2(int *g_idata, int*g_odata);
__global__ void reducekernel3(int *g_idata, int*g_odata);
__global__ void reducekernel4(int *g_idata, int*g_odata);
__global__ void reducekernel5(int *g_idata, int*g_odata);
// Kernels 6 and 7 are templated on the block size so every reduction step
// resolves at compile time; being templates, their definitions live in this
// header rather than in ReductionKernels.cu.
template<unsigned int blockSize> __global__ void reducekernel6(int *g_idata,
int*g_odata);
template<unsigned int blockSize> __global__ void reducekernel7(int *g_idata,
int*g_odata, unsigned int n);
/*
 * Kernel 6 — completely unrolled tree reduction.
 * Launch: blockSize must equal blockDim.x; dynamic shared memory of
 * blockDim.x * sizeof(int); each block consumes 2*blockDim.x input
 * elements and writes one partial sum to g_odata[blockIdx.x].
 */
template<unsigned int blockSize>
__global__ void reducekernel6(int *g_idata, int *g_odata) {
	extern __shared__ int sdata[];
	// each thread loads two elements from global to shared mem
	// and performs the first step of the reduction
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
	sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
	__syncthreads();
	// Tree reduction; the template parameter lets the compiler drop the
	// steps that do not apply to this block size.
	if (blockSize >= 1024) {
		if (tid < 512) {
			sdata[tid] += sdata[tid + 512];
		}
		__syncthreads();
	}
	if (blockSize >= 512) {
		if (tid < 256) {
			sdata[tid] += sdata[tid + 256];
		}
		__syncthreads();
	}
	if (blockSize >= 256) {
		if (tid < 128) {
			sdata[tid] += sdata[tid + 128];
		}
		__syncthreads();
	}
	if (blockSize >= 128) {
		if (tid < 64) {
			sdata[tid] += sdata[tid + 64];
		}
		__syncthreads();
	}
	if (tid < 32) {
		// Final warp unroll: the shared-memory accesses must go through a
		// volatile pointer. Without volatile the compiler may keep
		// sdata[tid] cached in a register, so other lanes read stale
		// values and the block's partial sum is wrong.
		volatile int *vsmem = sdata;
		if (blockSize >= 64)
			vsmem[tid] += vsmem[tid + 32];
		if (blockSize >= 32)
			vsmem[tid] += vsmem[tid + 16];
		if (blockSize >= 16)
			vsmem[tid] += vsmem[tid + 8];
		if (blockSize >= 8)
			vsmem[tid] += vsmem[tid + 4];
		if (blockSize >= 4)
			vsmem[tid] += vsmem[tid + 2];
		if (blockSize >= 2)
			vsmem[tid] += vsmem[tid + 1];
	}
	//write result of this block to global memory
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
/*
 * Kernel 7 — multiple elements per thread via a grid-stride loop, then a
 * completely unrolled tree reduction.
 * Launch: blockSize must equal blockDim.x (max 512 supported here);
 * dynamic shared memory of blockDim.x * sizeof(int); n is the number of
 * valid input elements. One partial sum per block goes to
 * g_odata[blockIdx.x].
 */
template<unsigned int blockSize>
__global__ void reducekernel7(int *g_idata, int*g_odata, unsigned int n) {
	extern __shared__ int sdata[];
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x;
	unsigned int gridSize = blockSize * 2 * gridDim.x;
	sdata[tid] = 0;
	// Accumulate a grid-stride's worth of elements per thread. The second
	// load is guarded: when n is not a multiple of the stride, i + blockSize
	// could read past the end of the input.
	while (i < n) {
		sdata[tid] += g_idata[i];
		if (i + blockSize < n)
			sdata[tid] += g_idata[i + blockSize];
		i += gridSize;
	}
	__syncthreads();
	// Tree reduction, unrolled at compile time via the template parameter.
	if (blockSize >= 512) {
		if (tid < 256) {
			sdata[tid] += sdata[tid + 256];
		}
		__syncthreads();
	}
	if (blockSize >= 256) {
		if (tid < 128) {
			sdata[tid] += sdata[tid + 128];
		}
		__syncthreads();
	}
	if (blockSize >= 128) {
		if (tid < 64) {
			sdata[tid] += sdata[tid + 64];
		}
		__syncthreads();
	}
	if (tid < 32) {
		// Final warp unroll: must use a volatile pointer so each store is
		// written to shared memory and re-read, instead of being cached in
		// a register while other lanes see stale values.
		volatile int *vsmem = sdata;
		if (blockSize >= 64)
			vsmem[tid] += vsmem[tid + 32];
		if (blockSize >= 32)
			vsmem[tid] += vsmem[tid + 16];
		if (blockSize >= 16)
			vsmem[tid] += vsmem[tid + 8];
		if (blockSize >= 8)
			vsmem[tid] += vsmem[tid + 4];
		if (blockSize >= 4)
			vsmem[tid] += vsmem[tid + 2];
		if (blockSize >= 2)
			vsmem[tid] += vsmem[tid + 1];
	}
	//write result of this block to global memory
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
ReductionKernels.cu
#include "ReductionKernels.h"
/*
 * Kernel 1 — interleaved addressing with divergent branching.
 * Dynamic shared memory: blockDim.x * sizeof(int). Each block reduces
 * blockDim.x elements and writes its partial sum to g_odata[blockIdx.x].
 */
__global__ void reduce0(int *g_idata, int*g_odata) {
	extern __shared__ int sdata[];
	// Stage one element per thread into shared memory.
	unsigned int lane = threadIdx.x;
	unsigned int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
	sdata[lane] = g_idata[globalIdx];
	__syncthreads();
	// Interleaved tree reduction: at stride s, only threads whose index is
	// a multiple of 2*s are active — hence the divergence within warps.
	for (unsigned int stride = 1; stride < blockDim.x; stride <<= 1) {
		if ((lane & (2 * stride - 1)) == 0) {
			sdata[lane] += sdata[lane + stride];
		}
		__syncthreads();
	}
	// Thread 0 publishes this block's partial sum.
	if (lane == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
/*
 * Kernel 2 — interleaved addressing with a strided index. Removes the
 * divergent branch of kernel 1 but produces shared-memory bank conflicts.
 * Dynamic shared memory: blockDim.x * sizeof(int); one partial sum per
 * block is written to g_odata[blockIdx.x].
 */
__global__ void reducekernel2(int *g_idata, int*g_odata) {
	extern __shared__ int sdata[];
	// Stage one element per thread into shared memory.
	unsigned int tid = threadIdx.x;
	sdata[tid] = g_idata[blockDim.x * blockIdx.x + tid];
	__syncthreads();
	// Map consecutive thread ids onto strided pair starts (0, 2s, 4s, ...),
	// keeping active threads contiguous at the cost of bank conflicts.
	for (unsigned int stride = 1; stride < blockDim.x; stride <<= 1) {
		int pairStart = 2 * stride * tid;
		if (pairStart < blockDim.x) {
			sdata[pairStart] += sdata[pairStart + stride];
		}
		__syncthreads();
	}
	// Thread 0 publishes this block's partial sum.
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
/*
 * Kernel 3 — sequential addressing. Active threads stay packed at the low
 * end of the block, avoiding both divergence and bank conflicts.
 * Dynamic shared memory: blockDim.x * sizeof(int); one partial sum per
 * block is written to g_odata[blockIdx.x].
 */
__global__ void reducekernel3(int *g_idata, int*g_odata) {
	extern __shared__ int sdata[];
	// Stage one element per thread into shared memory.
	unsigned int tid = threadIdx.x;
	sdata[tid] = g_idata[blockDim.x * blockIdx.x + tid];
	__syncthreads();
	// Halve the active range each step; thread t folds in its mirror.
	unsigned int active = blockDim.x >> 1;
	while (active > 0) {
		if (tid < active) {
			sdata[tid] += sdata[tid + active];
		}
		__syncthreads();
		active >>= 1;
	}
	// Thread 0 publishes this block's partial sum.
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
/*
 * Kernel 4 — first add during global load. Each thread fetches two input
 * elements and sums them on the way into shared memory, so a block covers
 * 2 * blockDim.x elements. Dynamic shared memory: blockDim.x * sizeof(int);
 * one partial sum per block is written to g_odata[blockIdx.x].
 */
__global__ void reducekernel4(int *g_idata, int*g_odata) {
	extern __shared__ int sdata[];
	// Load two elements per thread and do the first addition immediately.
	unsigned int tid = threadIdx.x;
	unsigned int base = 2 * blockDim.x * blockIdx.x + tid;
	sdata[tid] = g_idata[base] + g_idata[base + blockDim.x];
	__syncthreads();
	// Sequential-addressing tree reduction over the staged values.
	for (unsigned int active = blockDim.x >> 1; active != 0; active >>= 1) {
		if (tid < active) {
			sdata[tid] += sdata[tid + active];
		}
		__syncthreads();
	}
	// Thread 0 publishes this block's partial sum.
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}
/*
 * Kernel 5 — first add during load, with the last warp unrolled.
 * Requires blockDim.x >= 64 (the unrolled tail reads sdata[tid + 32]).
 * Dynamic shared memory: blockDim.x * sizeof(int); each block consumes
 * 2 * blockDim.x elements and writes one partial sum to
 * g_odata[blockIdx.x].
 */
__global__ void reducekernel5(int *g_idata, int *g_odata) {
	extern __shared__ int sdata[];
	// each thread loads two elements from global to shared mem
	// and performs the first step of the reduction
	unsigned int tid = threadIdx.x;
	unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
	sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
	__syncthreads();
	// do reduction in shared mem down to 64 remaining values
	for (unsigned int s = (blockDim.x>>1); s > 32; s >>= 1) {
		if (tid < s)
			sdata[tid] += sdata[tid + s];
		__syncthreads();
	}
	if (tid < 32){
		// Unrolled last warp: the accesses must go through a volatile
		// pointer so every intermediate store is actually written to and
		// re-read from shared memory; without volatile the compiler may
		// keep sdata[tid] in a register and the lanes see stale values.
		volatile int *vsmem = sdata;
		vsmem[tid] += vsmem[tid + 32];
		vsmem[tid] += vsmem[tid + 16];
		vsmem[tid] += vsmem[tid + 8];
		vsmem[tid] += vsmem[tid + 4];
		vsmem[tid] += vsmem[tid + 2];
		vsmem[tid] += vsmem[tid + 1];
	}
	//write result of this block to global memory
	if (tid == 0) {
		g_odata[blockIdx.x] = sdata[0];
	}
}