MaxGPU kernel only works with a single block

Time: 2013-05-18 03:34:26

Tags: cuda

#include <iostream>
#include <stdio.h>
#include <stdlib.h>

using namespace std;

const int threadsPerBlock = 256;
const int N = 40000;

void generateArray(double *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() / ((rand() + rand()) / 2.0 + 1);
}

double maxCPU(double *arr, int count) {

    double max = arr[0];   // double, not int, so the maximum is not truncated

    for (int i = 0; i < count; i++)
        if (arr[i] > max)
            max = arr[i];
    return max;
}

__global__ void MaxGPU(double *a, int count, double *result){

    __shared__ double cache[threadsPerBlock];

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride scan: each thread keeps the maximum of its strided slice of a[].
    double temp = a[tid];
    tid += blockDim.x * gridDim.x;

    while(tid < count){
        if(a[tid] > temp)
            temp = a[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = temp;

    __syncthreads();

    // Tree reduction in shared memory (assumes blockDim.x is a power of two).
    int i = blockDim.x/2;
    while(i!=0){
        if(cacheIndex < i)
            if(cache[cacheIndex + i] > cache[cacheIndex])
                cache[cacheIndex] = cache[cacheIndex + i];
        __syncthreads();
        i/=2;
    }

    // Thread 0 writes this block's maximum into the per-block result array.
    if(cacheIndex == 0)
        result[blockIdx.x] = cache[0];
}

int main(void) {
    double *arr = new double[N], resultGPU;
    generateArray(arr, N);
    double *devA, *dev_partial_result;

    double resultCPU = maxCPU(arr, N);

    cudaMalloc((void**)&devA, N * sizeof(double));
    cudaMalloc((void**)&dev_partial_result, 512 * sizeof(double));

    cudaMemcpy(devA, arr, N * sizeof(double), cudaMemcpyHostToDevice);

    MaxGPU<<<1, 256>>>(devA, N, dev_partial_result);

    cudaMemcpy(&resultGPU, dev_partial_result,sizeof(double), cudaMemcpyDeviceToHost);

    cout << "Max CPU: " << resultCPU << endl;
    cout << "Max GPU: " << resultGPU << endl;

    cudaFree(devA);
    cudaFree(dev_partial_result);

    delete [] arr;
    return 0;
}

I wrote the code above. I don't know why, but it only works with one block. It does not work with 256 or 512 blocks. Why not? What is wrong?

1 answer:

Answer 0 (score: 1)

The kernel writes one partial maximum per block into result[], so the host has to receive all of them, not just a single double. Try changing

double  resultGPU;
to
double* resultGPU = new double[blocks_count];

and

cudaMemcpy(&resultGPU, dev_partial_result, sizeof(double), cudaMemcpyDeviceToHost);
to
cudaMemcpy(resultGPU, dev_partial_result, blocks_count*sizeof(double), cudaMemcpyDeviceToHost);
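
For completeness, a minimal sketch of what the host side could look like with that change. blocks_count is the name used above; the value 128 and the final CPU loop over the partial maxima are assumptions, not part of the original post (128 * 256 threads stays within N, because the kernel reads a[tid] before checking tid < count):

const int blocks_count = 128;   // assumption: 128 * 256 = 32768 <= N, so the first a[tid] read stays in bounds

double *resultGPU = new double[blocks_count];
cudaMalloc((void**)&dev_partial_result, blocks_count * sizeof(double));

MaxGPU<<<blocks_count, threadsPerBlock>>>(devA, N, dev_partial_result);

// One partial maximum per block comes back; finish the reduction on the CPU.
cudaMemcpy(resultGPU, dev_partial_result, blocks_count * sizeof(double), cudaMemcpyDeviceToHost);

double maxGPU = resultGPU[0];
for (int b = 1; b < blocks_count; b++)
    if (resultGPU[b] > maxGPU)
        maxGPU = resultGPU[b];

cout << "Max GPU: " << maxGPU << endl;

delete [] resultGPU;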