从给定的向量中通过CUDA中的简化方法找到最大值及其索引

时间:2014-03-22 04:28:07

标签: cuda parallel-processing

我是CUDA的新手,因为向量找到最大值及其索引我使用CUDA

这是我的代码:

#include < cuda.h >
#include < stdio.h >
#include < time.h >
#include <iostream>
using namespace std;
#define tbp 256
#define nblocks 1
__global__ void kernel_max(int *a, int *d, int *index,int *idx)
{
    __shared__ int sdata[tbp]; //"static" shared memory

    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = a[i];
    index[tid] = i;
    __syncthreads();
    for(int s=tbp/2 ; s >= 1 ; s=s/2)
    {
        if(tid < s)
        {
            if(sdata[tid] < sdata[tid + s])
            {
                sdata[tid] = sdata[tid + s];
                index[tid] = index[tid+s];
             __syncthreads();
            }
            __syncthreads();
        }
        __syncthreads();
    }
    __syncthreads();
    if(tid == 0 ) 
    {
        d[blockIdx.x] = sdata[0];
        idx[blockIdx.x] = index[0];
    }
    __syncthreads();
}

int main()
{
    int i;
    const int N=tbp*nblocks;
    srand(time(NULL));

    int *a;
    a = (int*)malloc(N * sizeof(int));
    int *d;
    d = (int*)malloc(nblocks * sizeof(int));
    int *index;
    index = (int*)malloc(N * sizeof(int));
    int *idx;
    idx = (int*)malloc(nblocks * sizeof(int));

    int *dev_a, *dev_d, *dev_index,*dev_idx;

    cudaMalloc((void **) &dev_a, N*sizeof(int));
    cudaMalloc((void **) &dev_d, nblocks*sizeof(int));
    cudaMalloc((void **) &dev_index, N*sizeof(int));
    cudaMalloc((void **) &dev_idx, nblocks*sizeof(int));

    int mmm=0;
    int ddd=0;
    for( i = 0 ; i < N ; i++)
    {
        a[i] = rand()% 100 + 5;
        index[i]=i;
        //printf("%d\n",a[i]);
        if(mmm<a[i]) 
        {
            mmm=a[i];
            ddd=i;  
        }
    }
    printf("");
    printf("");
    printf("");
    printf("");
    cudaMemcpy(dev_a , a, N*sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(dev_index , index, N*sizeof(int),cudaMemcpyHostToDevice);
    kernel_max <<< nblocks,tbp >>>(dev_a,dev_d,dev_index,dev_idx);

    cudaMemcpy(d, dev_d, nblocks*sizeof(int),cudaMemcpyDeviceToHost);
    cudaMemcpy(index, dev_index, N*sizeof(int),cudaMemcpyDeviceToHost);
    cudaMemcpy(idx, dev_idx, nblocks*sizeof(int),cudaMemcpyDeviceToHost);

    printf("cpu max= %d, gpu_max = %d ,cpu index: %d, gpu index: %d",mmm,d[0],ddd,idx[0]);
    printf("\n");

    if(ddd!=idx[0])
    {
        cout<<"index mismatch!damn!!"<<endl;
    }
    else
    {
        cout<<"congratulations!!"<<endl;
    }
    /*
    for(i=0;i<N;i++)
        cout<<*(index+i)<<endl;
    */
    cudaFree(dev_a);
    cudaFree(dev_d);
    cudaFree(dev_index);
    cudaFree(dev_idx);

    free(a);
    free(d);
    free(index);
    free(idx);

    return 0;
}

问题是对于tbp&lt; 128它可以在值和索引中获得正确的结果 当增加到256,512,1024时,结果有时会出错。 任何人都可以解释这种情况吗?谢谢。

1 个答案:

答案 0 :(得分:1)

使用另一个循环来处理索引,以避免在此计算中具有不同索引问题的相同最大值

int temp=0;
for(i=0;i<tbp;i++)
{ 
    if(d[blockIdx.x]==a[i] && temp==0)
    {temp = i;}
} 
idx[0] = temp;