我是CUDA的新手,因为向量找到最大值及其索引我使用CUDA
这是我的代码:
#include < cuda.h >
#include < stdio.h >
#include < time.h >
#include <iostream>
using namespace std;
#define tbp 256
#define nblocks 1
__global__ void kernel_max(int *a, int *d, int *index,int *idx)
{
__shared__ int sdata[tbp]; //"static" shared memory
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = a[i];
index[tid] = i;
__syncthreads();
for(int s=tbp/2 ; s >= 1 ; s=s/2)
{
if(tid < s)
{
if(sdata[tid] < sdata[tid + s])
{
sdata[tid] = sdata[tid + s];
index[tid] = index[tid+s];
__syncthreads();
}
__syncthreads();
}
__syncthreads();
}
__syncthreads();
if(tid == 0 )
{
d[blockIdx.x] = sdata[0];
idx[blockIdx.x] = index[0];
}
__syncthreads();
}
int main()
{
int i;
const int N=tbp*nblocks;
srand(time(NULL));
int *a;
a = (int*)malloc(N * sizeof(int));
int *d;
d = (int*)malloc(nblocks * sizeof(int));
int *index;
index = (int*)malloc(N * sizeof(int));
int *idx;
idx = (int*)malloc(nblocks * sizeof(int));
int *dev_a, *dev_d, *dev_index,*dev_idx;
cudaMalloc((void **) &dev_a, N*sizeof(int));
cudaMalloc((void **) &dev_d, nblocks*sizeof(int));
cudaMalloc((void **) &dev_index, N*sizeof(int));
cudaMalloc((void **) &dev_idx, nblocks*sizeof(int));
int mmm=0;
int ddd=0;
for( i = 0 ; i < N ; i++)
{
a[i] = rand()% 100 + 5;
index[i]=i;
//printf("%d\n",a[i]);
if(mmm<a[i])
{
mmm=a[i];
ddd=i;
}
}
printf("");
printf("");
printf("");
printf("");
cudaMemcpy(dev_a , a, N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dev_index , index, N*sizeof(int),cudaMemcpyHostToDevice);
kernel_max <<< nblocks,tbp >>>(dev_a,dev_d,dev_index,dev_idx);
cudaMemcpy(d, dev_d, nblocks*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(index, dev_index, N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(idx, dev_idx, nblocks*sizeof(int),cudaMemcpyDeviceToHost);
printf("cpu max= %d, gpu_max = %d ,cpu index: %d, gpu index: %d",mmm,d[0],ddd,idx[0]);
printf("\n");
if(ddd!=idx[0])
{
cout<<"index mismatch!damn!!"<<endl;
}
else
{
cout<<"congratulations!!"<<endl;
}
/*
for(i=0;i<N;i++)
cout<<*(index+i)<<endl;
*/
cudaFree(dev_a);
cudaFree(dev_d);
cudaFree(dev_index);
cudaFree(dev_idx);
free(a);
free(d);
free(index);
free(idx);
return 0;
}
问题是对于tbp&lt; 128它可以在值和索引中获得正确的结果 当增加到256,512,1024时,结果有时会出错。 任何人都可以解释这种情况吗?谢谢。
答案 0 :(得分:1)
使用另一个循环来处理索引,以避免在此计算中具有不同索引问题的相同最大值
int temp=0;
for(i=0;i<tbp;i++)
{
if(d[blockIdx.x]==a[i] && temp==0)
{temp = i;}
}
idx[0] = temp;