我编写了一个CUDA函数,用于计算2D中一组点中的凸包络。但它非常比CPU代码慢!
我正在使用warp vote函数和__syncronisation();很多次。那么这会使代码变慢吗?
由于
添加代码:
__global__ void find_edges_on_device(TYPE * h_x, TYPE * h_y, int *h_edges){
int tidX = threadIdx.x;
int tidY = threadIdx.y;
int tid = tidY*blockSizeX + tidX;
int i = threadIdx.x+blockIdx.x*blockDim.x;
int j = threadIdx.y+blockIdx.y*blockDim.y;
int hxi = h_x[i];
int hxj = h_x[j];
int hyi = h_y[i];
int hyj = h_y[j];
long scalarProduct = 0;
TYPE nx;
TYPE ny;
bool isValid = true;
__shared__ int shared_X[blockSizeX*blockSizeY];
__shared__ int shared_Y[blockSizeX*blockSizeY];
__shared__ bool iswarpvalid[32];
__shared__ bool isBlockValid;
if (tid==0)
{
isBlockValid=true;
}
if (tid<(blockSizeX*blockSizeY-1)/32+1)
{
iswarpvalid[tid]=true;
}
else if (tid<32)
{
iswarpvalid[tid]=false;
}
//all the others points should be on the same side of the edge i,j
//normal to the edge (unnormalized)
nx = - ( hyj- hyi);
ny = hxj- hxi;
int k=0;
while ((k==i)||(k==j))
{
k++;
} //k will be 0,1,or 2, but different from i and j to avoid
scalarProduct=nx* (h_x[k]-hxi)+ny* (h_y[k]-hyi);
if (scalarProduct<0)
{
nx*=-1;
ny*=-1;
}
for(int count = 0; count < ((NPOINTS/blockSizeX*blockSizeY) + 1); count++ ){
int globalIndex = tidY*blockSizeX + tidX + count*blockSizeX*blockSizeY;
if (NPOINTS <= globalIndex){
shared_X[tidY*blockSizeX + tidX] = -1;
shared_Y[tidY*blockSizeX + tidX] = -1;
}
else {
shared_X[tidY*blockSizeX + tidX]= h_x[globalIndex];
shared_Y[tidY*blockSizeX + tidX]= h_y[globalIndex];
}
__syncthreads();
//we have now at least one point with scalarProduct>0
//all the other points should comply with the same condition for
//the edge to be valid
//loop on all the points
if(i < j){
for (int k=0; k < blockSizeX*blockSizeY; k++)
{
if((count * blockSizeX*blockSizeY + k < NPOINTS )&&(isValid)) {
scalarProduct=nx* (shared_X[k]-hxi)+ny* (shared_Y[k]-hyi);
if(__all(scalarProduct) < 0){
iswarpvalid[(tidY*blockSizeX + tidX)/32] = false;
break;
}
else if(0 > (scalarProduct) ){
isValid = false;
break;
}
}
}
}
__syncthreads();
if (tid<32)
{
isBlockValid=__any(iswarpvalid[tid]);
}
__syncthreads();
if(!isBlockValid) break;
}
if ((i<j) && (true == isValid )){
int tmp_i = i;
int tmp_j = j;
if( -1 != atomicCAS(&h_edges[2*i], -1, tmp_j) )
h_edges[2*i+1]=j;
if( -1 != atomicCAS(&h_edges[2*j], -1, tmp_i) )
h_edges[2*j+1]=i;
}
}
答案 0 :(得分:2)
您正在寻找的答案可以在NVIDIA CUDA C编程指南中找到。
第5.4.3节规定:
__syncthreads()的吞吐量是每个时钟周期8次操作 计算能力的设备1.x和每个时钟周期16个操作 对于计算能力2.x的设备。
Warp投票功能在PTX ISA手册的B.12节和表109中进行了解决。后者表示执行经线投票需要两条指令。但是,我无法在参考文档中找出warp vote函数的任何时钟周期数字。