Question

我编写了一个CUDA函数，用于计算2D中一组点中的凸包络。但它非常比CPU代码慢！

我正在使用warp vote函数和__syncronisation（）;很多次。那么这会使代码变慢吗？

由于

添加代码：

__global__ void find_edges_on_device(TYPE * h_x, TYPE * h_y, int *h_edges){

int tidX = threadIdx.x;
int tidY = threadIdx.y;
int tid = tidY*blockSizeX + tidX;
int i = threadIdx.x+blockIdx.x*blockDim.x;
int j = threadIdx.y+blockIdx.y*blockDim.y;

int hxi = h_x[i];
int hxj = h_x[j];
int hyi = h_y[i];
int hyj = h_y[j];

long scalarProduct = 0;
TYPE nx;
TYPE ny;

bool isValid = true;

__shared__ int shared_X[blockSizeX*blockSizeY];
__shared__ int shared_Y[blockSizeX*blockSizeY];
__shared__ bool iswarpvalid[32];
__shared__ bool isBlockValid;

if (tid==0)
{
    isBlockValid=true;
}
if (tid<(blockSizeX*blockSizeY-1)/32+1)
{
    iswarpvalid[tid]=true;
}
else if (tid<32)
{
    iswarpvalid[tid]=false;
}

//all the others points should be on the same side of the edge i,j
//normal to the edge (unnormalized)
nx = - ( hyj- hyi);
ny = hxj- hxi;
int k=0;
while ((k==i)||(k==j))
{
    k++;
} //k will be 0,1,or 2, but different from i and j to avoid 
scalarProduct=nx* (h_x[k]-hxi)+ny* (h_y[k]-hyi);
if (scalarProduct<0)
{
    nx*=-1;
    ny*=-1;
}

for(int count = 0; count < ((NPOINTS/blockSizeX*blockSizeY) + 1); count++ ){

    int globalIndex = tidY*blockSizeX + tidX + count*blockSizeX*blockSizeY;

    if (NPOINTS <= globalIndex){
        shared_X[tidY*blockSizeX + tidX] = -1;
        shared_Y[tidY*blockSizeX + tidX] = -1;
    }
    else {
        shared_X[tidY*blockSizeX + tidX]= h_x[globalIndex];
        shared_Y[tidY*blockSizeX + tidX]= h_y[globalIndex];
    }
    __syncthreads();

    //we have now at least one point with scalarProduct>0
    //all the other points should comply with the same condition for
    //the edge to be valid
    //loop on all the points 

    if(i < j){
        for (int k=0; k < blockSizeX*blockSizeY; k++)
        {   
            if((count * blockSizeX*blockSizeY + k < NPOINTS )&&(isValid)) {
                scalarProduct=nx* (shared_X[k]-hxi)+ny* (shared_Y[k]-hyi);
                if(__all(scalarProduct) < 0){
                    iswarpvalid[(tidY*blockSizeX + tidX)/32] = false;
                    break;
                }
                else if(0 > (scalarProduct) ){
                    isValid = false;
                    break;
                }
            }
        }
    }

    __syncthreads();
    if (tid<32)
    {
        isBlockValid=__any(iswarpvalid[tid]);
    }
    __syncthreads();
    if(!isBlockValid) break;
}

if ((i<j) && (true == isValid )){
            int tmp_i = i;
            int tmp_j = j;

            if( -1 != atomicCAS(&h_edges[2*i], -1, tmp_j) )
                h_edges[2*i+1]=j;

            if( -1 != atomicCAS(&h_edges[2*j], -1, tmp_i) )
                h_edges[2*j+1]=i;

}
}

Answer 1

您正在寻找的答案可以在NVIDIA CUDA C编程指南中找到。

第5.4.3节规定：

__syncthreads（）的吞吐量是每个时钟周期8次操作计算能力的设备1.x和每个时钟周期16个操作对于计算能力2.x的设备。

Warp投票功能在PTX ISA手册的B.12节和表109中进行了解决。后者表示执行经线投票需要两条指令。但是，我无法在参考文档中找出warp vote函数的任何时钟周期数字。

CUDA warp vote函数会使代码变慢吗？

1 个答案: