Question

我是并行编程的新手，感谢您的帮助，了解它是如何工作的。这是一个人为的例子，我希望在矩阵的每个单元格中操作的结果为50。

结果取决于[index + 1]处数组中的值。这在并行编程中不能很好地工作，因为值不是按顺序计算的，并且每隔几个单元就会得到不正确的结果。我有乐队的帮助将功能分成多个，但我认为应该有更好的解决方案，但我并不确定要搜索什么。谢谢。

CUDA代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>

#include <iostream>

#define TILE_WIDTH 16

using namespace std;

__global__ void cuda_arithmetic(int height, int width, float *B, float *C, float *initial_array, float *result_array){

    int             w                   =   blockIdx.x * blockDim.x + threadIdx.x; // Col // width
    int             h                   =   blockIdx.y * blockDim.y + threadIdx.y; // Row // height
    int             index               =   h * width + w;

    if ((w < width) && h < (height))                    //initial=20, B=2, C=10, result = 17;
        initial_array[index] = powf(C[index],2);

    if ((w < (width-1)) && h < (height))
        result_array[index] = initial_array[index+1] / B[index];
}

__global__ void cuda_arithmetic_step_1(int height, int width, float *B, float *C, float *initial_array, float *result_array){

    int             w                   =   blockIdx.x * blockDim.x + threadIdx.x; // Col // width
    int             h                   =   blockIdx.y * blockDim.y + threadIdx.y; // Row // height
    int             index               =   h * width + w;

    if ((w < width) && h < (height))
        initial_array[index] = powf(C[index],2);
}

__global__ void cuda_arithmetic_step_2(int height, int width, float *B, float *C, float *initial_array, float *result_array){

    int             w                   =   blockIdx.x * blockDim.x + threadIdx.x; // Col // width
    int             h                   =   blockIdx.y * blockDim.y + threadIdx.y; // Row // height
    int             index               =   h * width + w;

    if ((w < (width-1)) && h < (height))
        result_array[index] = initial_array[index+1] / B[index];
}

int main(){

    int             height              =   800;
    int             width               =   8192;

    float           *A                  =   new float[height * width];
    float           *B                  =   new float[height * width];
    float           *C                  =   new float[height * width];
    float           *result             =   new float[height * width];

    for (int i = 0; i < height; i++){
        for (int j = 0; j < width; j++){
            A[i*width+j] = 20;
            B[i*width+j] = 2;
            C[i*width+j] = 10;
            result[i*width+j] = 17;
        }
    }

    float           *gpu_A;
    float           *gpu_B;
    float           *gpu_C;
    float           *gpu_result;

    cudaMalloc((void **)&gpu_A,         (height * width * sizeof(float)));
    cudaMalloc((void **)&gpu_B,         (height * width * sizeof(float)));
    cudaMalloc((void **)&gpu_C,         (height * width * sizeof(float)));
    cudaMalloc((void **)&gpu_result,    (height * width * sizeof(float)));

    cudaMemcpy(gpu_A,       A,          (height * width * sizeof(float)), cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_B,       B,          (height * width * sizeof(float)), cudaMemcpyHostToDevice); 
    cudaMemcpy(gpu_C,       C,          (height * width * sizeof(float)), cudaMemcpyHostToDevice); 
    cudaMemcpy(gpu_result,  result,     (height * width * sizeof(float)), cudaMemcpyHostToDevice);

    dim3            dimGrid((width - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
    dim3            dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    // CODE OPTION

    // incorrect result
    cuda_arithmetic<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);

    // correct result
    //cuda_arithmetic_step_1<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);
    //cuda_arithmetic_step_2<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);

    cudaMemcpy(result, gpu_result, (height * width * sizeof(float)), cudaMemcpyDeviceToHost);

    for (int i = 0; i < height; i++){
        for (int j = 0; j < (width-1); j++){
            if (abs((result[i*(width-1)+j] - 50)) > 0.001){
                cout << "error: ";
                cout << i << " * " << width-1 << " + " << j << ": " << result[i*(width-1)+j] << endl;
                system("pause");
            }
        }
        cout << endl;
    }
    cout << endl;

    cudaFree(gpu_A);
    cudaFree(gpu_B);
    cudaFree(gpu_C);
    cudaFree(gpu_result);

    delete[] A;
    delete[] B;
    delete[] C;
    delete[] result;

    system("pause");
}

Answer 1

由于你的例子是人为的，我的回答会有点笼统。

通常，您正在处理全局同步问题。

正如您所发现的那样，唯一干净的全局同步点是内核启动，因此在必要的同步点之前和之后将代码分成几部分会因内核启动而插入全局同步（ ES）。
另一种方法是考虑必要的同步是否可以本地化。如果是这样，您可以考虑安排算法/数据，以便在线程块内处理必要的同步（共享内存和__syncthreads()为我们提供内置的协调/同步功能。）这可能会遇到一些挑战数据边界（例如，线程间边界）。处理边界数据的一种方法是使相邻的线程块在边界区域中执行冗余计算，以便保证每个线程块在计算任何最终结果之前产生所有必要的中间结果。在这种情况下，您可以使用__syncthreads()安全地将中间结果的计算与最终结果分开，initial_array[index+1] = powf(C[index+1],2);是一个内部线程块barrier。
在某些情况下，您可以减少对单个线程的依赖性。例如，在您的代码中，您可以使单个线程执行必要的计算：
```
result_array[index] = initial_array[index+1] / B[index];
```
和依赖结果计算：
```
index+1
```
由于保证在计算必要的中间结果之后执行从属计算，因此不需要其他同步。可能你的实际代码可能不适合这种微不足道的重写。

顺便说一句，请注意您对if (abs((result[i*(width-1)+j] - 50)) > 0.001){的使用将超出内核中最后一个threadblock的范围（w = width -1，h = height-1）。另外，我不认为这个索引是你想要的：

        if (abs((result[i*(width)+j] - 50)) > 0.001){

我想你可能就是这个意思：

cuda_arithmetic

通过这些更改，您的{{1}}内核可以正常运行（即使它有一个轻微的越界问题。）

Answer 2

使用__syncthreads();。 __syncthreads之前的所有代码将在此内核启动的所有线程中执行，然后执行任何代码。

另请注意，分离读写操作以避免冲突（当多个线程从同一地址读写时）是个好主意。例如

array[i] = array2[i]

应该重新编写为

   temp = array2[i]; 
   __syncthreads();
   array[i] = temp;
   __syncthreads();

CUDA：有没有办法在继续之前强制完成每一行？

2 个答案: