I wrote some code to find the minimum value with a reduction, but the result is always zero. I can't figure out what the problem is. Please help me.
Here is the kernel code (I adapted Nvidia's reduction code):
#include <limits.h>
#define NumThread 128
#define NumBlock 32
__global__ void min_reduce(int* In, int* Out, int n){
  __shared__ int sdata[NumThread];
  unsigned int i = blockIdx.x * NumThread + threadIdx.x;
  unsigned int tid = threadIdx.x;
  unsigned int gridSize = NumBlock * NumThread;
  int myMin = INT_MAX;
  while (i < n){
    if(In[i] < myMin)
      myMin = In[i];
    i += gridSize;
  }
  sdata[tid] = myMin;
  __syncthreads();
  if (NumThread >= 1024){
    if (tid < 512)
      if(sdata[tid] > sdata[tid + 512] ) sdata[tid] = sdata[tid + 512];
    __syncthreads();
  }
  if (NumThread >= 512){
    if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] !=0) sdata[tid] = sdata[tid + 128];
    __syncthreads();
  }
  if (NumThread >= 128){
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
    __syncthreads();
  }
  //the following practice is deprecated
  if (tid < 32){
    volatile int *smem = sdata;
    if (NumThread >= 64) if(smem[tid] > smem[tid + 32] ) smem[tid] = smem[tid+32];
    if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] = smem[tid+16];
    if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] = smem[tid+8];
    if (NumThread >= 8) if(smem[tid] > smem[tid + 4] ) smem[tid] = smem[tid+4];
    if (NumThread >= 4) if(smem[tid] > smem[tid + 2] ) smem[tid] = smem[tid+2];
    if (NumThread >= 2) if(smem[tid] > smem[tid + 1] ) smem[tid] = smem[tid+1];
  }
  if (tid == 0)
    if(sdata[0] < sdata[1] ) Out[blockIdx.x] = sdata[0];
    else Out[blockIdx.x] = sdata[1];
}
And here is my main code:
#include <stdio.h>
#include <stdlib.h>
#include "min_reduction.cu"
int main(int argc, char* argv[]){
  unsigned int length = 1048576;
  int i, Size, min;
  int *a, *out, *gpuA, *gpuOut;
  cudaSetDevice(0);
  Size = length * sizeof(int);
  a = (int*)malloc(Size);
  out = (int*)malloc(NumBlock*sizeof(int));
  for(i=0;i<length;i++) a[i] = (i + 10);
  cudaMalloc((void**)&gpuA,Size);
  cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
  cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
  min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
  cudaDeviceSynchronize();
  cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);
  min = out[0];
  for(i=1;i<NumBlock;i++) if(min < out[i]) min = out[i];
  return 0;
}
Answer (score: 3):
I'm not sure I agree with everything @HubertApplebaum said, but I do agree with the advice to use proper cuda error checking. And as you note in your code, warp-synchronous programming can be considered deprecated, but I can't support the claim that it is already broken (yet). However, I don't wish to argue about that; it's not central to your question.
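By "proper cuda error checking" I mean something along the lines of the macro below. This is just a sketch; the macro name cudaCheckErrors is my own choice, not anything from your code:

#include <stdio.h>   // for fprintf
#include <stdlib.h>  // for exit

// minimal error-checking macro: call it after kernel launches and runtime API calls
#define cudaCheckErrors(msg) \
  do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
              msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
      exit(1); \
    } \
  } while (0)

// for example, right after the kernel launch in your main code:
//   min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
//   cudaCheckErrors("kernel launch failed");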
Another useful debugging suggestion is to follow the steps described here: compile your code with -lineinfo and run it under cuda-memcheck (the exact commands are shown after the report below). If you do that, you will see many reports like this:
========= Invalid __shared__ read of size 4
========= at 0x000001e0 in /home/bob/misc/t1074.cu:39:min_reduce(int*, int*, int)
========= by thread (64,0,0) in block (24,0,0)
========= Address 0x00000200 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15859d]
========= Host Frame:./t1074 [0x16dc1]
========= Host Frame:./t1074 [0x315d3]
========= Host Frame:./t1074 [0x28f5]
========= Host Frame:./t1074 [0x2623]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
========= Host Frame:./t1074 [0x271d]
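For reference, the report above came from a compile-and-run sequence along these lines; the -lineinfo flag is what lets cuda-memcheck attribute the error to a source line, and t1074.cu is just my local copy of your code:
$ nvcc -lineinfo -o t1074 t1074.cu
$ cuda-memcheck ./t1074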
This is telling you that the main problem in your code is that you are incorrectly indexing into a __shared__ memory array, and it identifies the specific line of code where that is happening. Neat! (It's line 39 in my case, but it may be a different line in your case.) If you then dig into that line, you'll want to study this section of code:
#define NumThread 128
...
__shared__ int sdata[NumThread];
...
if (NumThread >= 128){
  if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64]; //line 39 in my case
  __syncthreads();
}
You have defined NumThread to be 128, and you have statically allocated a shared memory array of that many int quantities. That's all fine. What about the code in the if statement? The if-condition will be satisfied, which means that all 128 threads in the block will execute the body of that if statement. However, you are reading sdata[tid + 64] from shared memory, and for threads whose tid is greater than 63 (i.e. half of the threads in each block) this generates a shared memory index greater than 127, which is out of bounds, i.e. illegal.
The fix (for the specific code you have shown) is fairly simple: just add another if-test:
if (NumThread >= 128){
  if (tid < 64)
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
  __syncthreads();
}
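As an aside, the same guard falls out naturally if the whole shared-memory phase is written as a single loop, as in NVIDIA's reduction sample. A minimal sketch of that general pattern (not your code, and not required for the fix above) would be:

for (unsigned int s = NumThread / 2; s > 0; s >>= 1){
  if (tid < s)                      // guard keeps every index within the NumThread-element array
    if (sdata[tid] > sdata[tid + s]) sdata[tid] = sdata[tid + s];
  __syncthreads();                  // reached by all threads, whether or not they took the guard
}
if (tid == 0) Out[blockIdx.x] = sdata[0];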
If you make that modification to your code and re-run the cuda-memcheck test, you will see that all of the runtime-reported errors disappear. Yay!
But the code still doesn't produce the correct answer. You've made another mistake here:
for(i=1;i<NumBlock;i++) if(min < out[i]) min = out[i];
If you want to find the minimum value, and you think carefully about that logic, you'll realize you should be doing this instead:
for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
^
|
greater than
With those two changes, your code produces the correct result for me:
$ cat t1074.cu
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#define NumThread 128
#define NumBlock 32
__global__ void min_reduce(int* In, int* Out, int n){
  __shared__ int sdata[NumThread];
  unsigned int i = blockIdx.x * NumThread + threadIdx.x;
  unsigned int tid = threadIdx.x;
  unsigned int gridSize = NumBlock * NumThread;
  int myMin = INT_MAX;
  while (i < n){
    if(In[i] < myMin)
      myMin = In[i];
    i += gridSize;
  }
  sdata[tid] = myMin;
  __syncthreads();
  if (NumThread >= 1024){
    if (tid < 512)
      if(sdata[tid] > sdata[tid + 512] ) sdata[tid] = sdata[tid + 512];
    __syncthreads();
  }
  if (NumThread >= 512){
    if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] !=0) sdata[tid] = sdata[tid + 128];
    __syncthreads();
  }
  if (NumThread >= 128){
    if (tid < 64)
      if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
    __syncthreads();
  }
  //the following practice is deprecated
  if (tid < 32){
    volatile int *smem = sdata;
    if (NumThread >= 64) if(smem[tid] > smem[tid + 32] ) smem[tid] = smem[tid+32];
    if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] = smem[tid+16];
    if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] = smem[tid+8];
    if (NumThread >= 8) if(smem[tid] > smem[tid + 4] ) smem[tid] = smem[tid+4];
    if (NumThread >= 4) if(smem[tid] > smem[tid + 2] ) smem[tid] = smem[tid+2];
    if (NumThread >= 2) if(smem[tid] > smem[tid + 1] ) smem[tid] = smem[tid+1];
  }
  if (tid == 0)
    if(sdata[0] < sdata[1] ) Out[blockIdx.x] = sdata[0];
    else Out[blockIdx.x] = sdata[1];
}
int main(int argc, char* argv[]){
  unsigned int length = 1048576;
  int i, Size, min;
  int *a, *out, *gpuA, *gpuOut;
  cudaSetDevice(0);
  Size = length * sizeof(int);
  a = (int*)malloc(Size);
  out = (int*)malloc(NumBlock*sizeof(int));
  for(i=0;i<length;i++) a[i] = (i + 10);
  a[10]=5;
  cudaMalloc((void**)&gpuA,Size);
  cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
  cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
  min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
  cudaDeviceSynchronize();
  cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);
  min = out[0];
  for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
  printf("min = %d\n", min);
  return 0;
}
$ nvcc -o t1074 t1074.cu
$ cuda-memcheck ./t1074
========= CUDA-MEMCHECK
min = 5
========= ERROR SUMMARY: 0 errors
$
Note that you already have the necessary if-check in place for the 1024-thread case; you will probably want to add the appropriate if-checks for the 512- and 256-thread cases as well, just as I added one for the 128-thread case above.
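Those guarded blocks would look something like the following sketch, which follows the same pattern as the 128-thread fix. They only matter if NumThread is ever raised to 256 or more, and I've dropped the stray != 0 test from the 256-thread line since it doesn't belong in a min reduction:

  if (NumThread >= 512){
    if (tid < 256)
      if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    if (tid < 128)
      if(sdata[tid] > sdata[tid + 128] ) sdata[tid] = sdata[tid + 128];
    __syncthreads();
  }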