Question

我想在 `__global__` 核函数中调用一个 `__device__` 函数。该函数只负责填充一个供所有线程共用的数组。但打印该数组时，我发现其中元素的取值与我赋值的顺序不一致。这是因为所有线程都在各自重新填充这个数组吗？我对线程的执行方式有些困惑。如果确实如此，我能否在核函数中得知哪个线程最先运行，从而只让这个线程为其他线程初始化该数组？谢谢。下面是我创建数组的函数：

// Filter mask storage: a single 20x20 float array in device global memory
// (one copy for the whole device, not one per thread). calculation() writes
// into its top-left corner.
__device__ float myArray[20][20];

// Builds a diamond-shaped 0/1 mask with side length (3 + 2*no) in the top-left
// corner of myArray, counting every cell set to 1 into sum, then stores
// 1.0f / sum in filterFactor.
//
// no: filter level; the mask side length is 3 + 2*no.
//
// Writes filterWidth, filterHeight, myArray, sum and filterFactor, none of
// which are declared inside this function.
// NOTE(review): sum is only incremented here and never reset -- presumably it
// is zeroed elsewhere before each call; confirm.
// NOTE(review): nothing here checks that 3 + 2*no fits the 20x20 myArray.
__device__ void calculation(int no){
    filterWidth = 3+(2*no);
    filterHeight = 3+(2*no);
    const int side = filterWidth;
    const int centre = (side - 1) / 2;

    // Pass 1: clear the side x side region.
    for (int row = 0; row < side; ++row) {
        for (int col = 0; col < side; ++col) {
            myArray[row][col] = 0;
        }
    }

    // Pass 2: walk the rows top to bottom and mark the diamond.
    // Rows above the centre widen by one cell on each side per row, the centre
    // row is filled across the full width, rows below narrow again.
    for (int row = 0; row < side; ++row) {
        // Number of cells marked on each side of the centre column.
        const int reach = (row <= centre) ? row : (2 * centre - row);
        const int first = centre - reach;
        const int last = (row == centre) ? (side - 1) : (centre + reach);
        for (int col = first; col <= last; ++col) {
            myArray[row][col] = 1;
            sum += 1;
        }
    }

    // Normalisation factor: reciprocal of the accumulated count.
    filterFactor = 1.0f / sum;
}

全局（`__global__`）核函数：

// Per-pixel filter kernel (excerpt: part of the loop body is elided with "...").
// Each thread works on the element at (tidX, tidY).
//
// imageData: pixel buffer, indexed as tidX + tidY*imageWidth.
//
// imageWidth, depthData and focusDepth are used but not declared in this excerpt.
// NOTE(review): no check that (tidX, tidY) lies inside the image is visible
// here -- confirm the launch grid matches the image size exactly.
__global__ void FilterKernel(Format24bppRgb* imageData)
  {
// Global 2D coordinates of this thread across the whole grid.
int tidX = threadIdx.x + blockIdx.x * blockDim.x;
int tidY = threadIdx.y + blockIdx.y * blockDim.y;

// Colour of this pixel, and the Red channel of the matching depthData entry.
Colour Cpixel = Colour (imageData[tidX + tidY*imageWidth] );
float depthPixel =  Colour(depthData[tidX + tidY*imageWidth]).Red;
float absoluteDistanceFromFocus = fabs (depthPixel - focusDepth);


// Threads whose depth sample is exactly 0 leave the kernel here.
if(depthPixel == 0)
    return;

Colour Cresult = Cpixel;
for (int i=0;i<8;i++)
{
    // Every thread that reaches this point calls calculation(i), which
    // rewrites the single device-global myArray. Threads running concurrently
    // therefore overwrite each other's values (data race).
    calculation(i);
     ...
     ...
    }
 }

Answer 1

如果你确实想只让一个线程调用该函数、其余线程等待它执行完毕，可以把设备函数填充的数组放在 `__shared__` 内存中，这样同一线程块内的所有线程看到的是同一个数组，然后按如下方式调用：

for (int i=0;i<8;i++)
{
    // Only thread (0,0) of each block builds the filter for level i.
    if (threadIdx.x == 0 && threadIdx.y == 0)
        calculation(i);
    // Barrier: the other threads of the block wait here until the filter has
    // been written. Every thread of the block must reach this call, so no
    // thread may return from the kernel before the loop.
    __syncthreads();
    ...
    // Second barrier: keep thread (0,0) from overwriting the filter for level
    // i+1 while other threads of the block are still reading level i.
    __syncthreads();
}

当然，这种做法无法跨线程块生效——在 `__global__` 核函数中，你无法控制各线程块的执行顺序。

更好的做法是：如果可以，在 CPU 上完成这些初始化计算（只需一个线程执行一次），并在启动核函数之前把结果拷贝到 GPU。这样看起来 myArray 需要占用 8 倍的内存（8 个滤波器各存一份），但会大大加快计算速度。

CUDA：从 `__global__` 函数调用 `__device__` 函数

1 个答案: