Question

我想计算一个 8 * 8 数组的块平均值，并把结果存入一个 4 * 4 数组（每个 2 * 2 的块对应结果中的一个元素）。类似这样：

我目前遇到了一个类似竞态条件（race condition）的问题：每次运行程序时，各个线程读到的值都不一样。现在我只关心把每个块内的所有元素加在一起，之后再用得到的总和做除法求平均值。这是我的代码。

#include <stdio.h>
#include<math.h>

// Side length of the square input matrix.
const int MAIN_SIZE = 8;
// Side length of the square result matrix.
const int RESULT_SIZE = 4;

// One row of the input matrix; a `mainArray *` is indexed as [row][column].
typedef int mainArray[MAIN_SIZE];
// One row of the result matrix; a `resultArray *` is indexed as [row][column].
typedef int resultArray[RESULT_SIZE];

// Kernel: each thread handles one input element main[mRow][mColumn] and adds it
// into the result cell at [mRow / 2][mColumn / 2].
//
// NOTE(review): several threads map to the same result cell, and the update
// below is a plain (non-atomic) read-modify-write, so those threads race with
// each other and the values they observe can differ from run to run.
__global__ void computeMean(mainArray *main, resultArray *result) {
    // Global 2D coordinates of this thread within the launch grid.
    int mColumn = blockIdx.x * blockDim.x + threadIdx.x;
    int mRow = blockIdx.y * blockDim.y + threadIdx.y;
    // Threads that fall outside the input matrix have nothing to do.
    if (mRow >= MAIN_SIZE || mColumn >= MAIN_SIZE)
        return;

    // Map the input coordinates to the result cell. mRow / 2 and mColumn / 2
    // are integer divisions, so the float cast and floor do not change the value.
    int rRow = std::floor(static_cast<float>(mRow / 2)),
        rColumn = std::floor(static_cast<float>(mColumn / 2));
    // x already contains the current cell value, so the += below adds that
    // value a second time (the cell becomes 2 * old + input).
    int x = result[rRow][rColumn] + main[mRow][mColumn];
    // Unsynchronised read-modify-write on a cell shared with other threads.
    result[rRow][rColumn] += x;
    // Debug trace; it re-reads the cell, which other threads may have changed.
    printf("Adding %d on %d %d at location %d %d; new value: %d\n", main[mRow][mColumn], mRow, mColumn, rRow, rColumn, result[rRow][rColumn]);
}

// Host driver: fills the input matrix with 1..MAIN_SIZE*MAIN_SIZE, prints the
// input and the zeroed result, runs computeMean with one thread per input
// element, and prints the result matrix copied back from the device.
//
// Returns 0 on success and -1 if a host allocation or any CUDA call fails.
int main() {
    mainArray *hMain = NULL, *dMain = NULL;
    resultArray *hResult = NULL, *dResult = NULL;

    // Buffer sizes in bytes. Each matrix is a contiguous run of int rows, so
    // the size is (number of rows) * (size of one row); the element type is
    // int, not a pointer.
    const size_t mSize = MAIN_SIZE * sizeof(mainArray);
    const size_t rSize = RESULT_SIZE * sizeof(resultArray);

    hMain = (mainArray *) malloc(mSize);
    hResult = (resultArray *) malloc(rSize);
    if (hMain == NULL || hResult == NULL) {
        printf("host allocation failed\n");
        free(hMain);    // free(NULL) is a no-op
        free(hResult);
        return -1;
    }

    // populate arrays
    int k = 0;
    for (int i = 0; i < MAIN_SIZE; i++) {
        for (int j = 0; j < MAIN_SIZE; j++) {
            hMain[i][j] = ++k;
        }
    }
    memset(hResult, 0, rSize);

    printf("main\n");
    for (int i = 0; i < MAIN_SIZE; i++) {
        for (int j = 0; j < MAIN_SIZE; j++) {
            printf("%d  ", hMain[i][j]);
        }
        printf("\n");
    }

    printf("result\n");
    for (int i = 0; i < RESULT_SIZE; i++) {
        for (int j = 0; j < RESULT_SIZE; j++) {
            printf("%d  ", hResult[i][j]);
        }
        printf("\n");
    }

    // Every CUDA step runs only if all previous steps succeeded, so the first
    // failure is the one that gets reported and cleanup happens in one place.
    cudaError_t status = cudaMalloc(&dMain, mSize);
    if (status == cudaSuccess)
        status = cudaMalloc(&dResult, rSize);

    // Do memcopies to GPU
    if (status == cudaSuccess)
        status = cudaMemcpy(dMain, hMain, mSize, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpy(dResult, hResult, rSize, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        // One thread per block, one block per input element.
        dim3 block(1, 1);
        dim3 grid((MAIN_SIZE + block.x - 1) / block.x, (MAIN_SIZE + block.y - 1) / block.y);
        computeMean<<<grid, block>>>(dMain, dResult);
        // A kernel launch returns nothing; launch errors are picked up here.
        status = cudaGetLastError();
    }

    // Copy the result back to the host. This blocking copy waits for the
    // kernel and also reports errors raised while it was running. The kernel
    // only reads the input matrix, so that one is not copied back.
    if (status == cudaSuccess)
        status = cudaMemcpy(hResult, dResult, rSize, cudaMemcpyDeviceToHost);

    int exitCode = 0;
    if (status != cudaSuccess) {
        printf("cuda error: %s\n", cudaGetErrorString(status));
        exitCode = -1;
    } else {
        printf("success!\n");
        for (int i = 0; i < RESULT_SIZE; i++) {
            for (int j = 0; j < RESULT_SIZE; j++) {
                printf("%d  ", hResult[i][j]);
            }
            printf("\n");
        }
    }

    // Release everything on both the success and the failure path.
    free(hMain);
    free(hResult);
    cudaFree(dMain);    // cudaFree on a null pointer is a no-op
    cudaFree(dResult);
    return exitCode;
}

我是 CUDA 新手，所以如果我从一开始就用错了方法，请告诉我（我觉得我的数组写法都有问题，但我不知道如何为二维数组动态分配空间）。提前谢谢。

Answer 1

There are a few issues with this section of code:

int rRow = std::floor(static_cast<float>(mRow / 2)),
    rColumn = std::floor(static_cast<float>(mColumn / 2));
int x = result[rRow][rColumn] + main[mRow][mColumn];
result[rRow][rColumn] += x;

Because rRow and rColumn are found using integer division from mRow and mColumn, it should be evident that threads whose global column index mColumn is either 0 or 1 will produce the same rColumn result (with the 1x1 thread blocks launched here, threadIdx.x is always 0 and it is blockIdx.x that varies), and there are many other instances of duplication. As a result, you have threads that are:

writing to the same location without any order or control
reading from a location that other threads may be writing to

CUDA doesn't sort these hazards out for you. You must take specific programming steps to deal with them. (You also have a logical error in that you are adding the main data to the existing result data, then adding that sum again to your result data; I'm reasonably sure this is not what you want.) The two typical approaches to resolve the thread hazards would be either:

Use atomics
Use a classical parallel reduction method

For simplicity, I'll present a reworked code that demonstrates the first method.

$ cat t1324.cu
#include <stdio.h>
#include<math.h>

// Side length of the square input matrix.
const int MAIN_SIZE = 8;
// Side length of the square result matrix.
const int RESULT_SIZE = 4;

// One row of the input matrix; a `mainArray *` is indexed as [row][column].
typedef int mainArray[MAIN_SIZE];
// One row of the result matrix; a `resultArray *` is indexed as [row][column].
typedef int resultArray[RESULT_SIZE];

// Kernel: each thread takes one element of the MAIN_SIZE x MAIN_SIZE input and
// atomically adds it into the result cell at [mRow / 2][mColumn / 2], so every
// result cell ends up holding the sum of the input elements mapped to it.
__global__ void computeMean(mainArray *main, resultArray *result) {
    // Global 2D coordinates of this thread within the launch grid.
    const int mRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int mColumn = blockIdx.x * blockDim.x + threadIdx.x;

    // Only threads that land inside the input matrix do any work.
    if (mRow < MAIN_SIZE && mColumn < MAIN_SIZE) {
        // mRow / 2 and mColumn / 2 are integer divisions; the float cast and
        // floor do not change the value.
        const int rRow = std::floor(static_cast<float>(mRow / 2));
        const int rColumn = std::floor(static_cast<float>(mColumn / 2));

        // Several threads target the same result cell, so the accumulation
        // has to be atomic.
        atomicAdd(&result[rRow][rColumn], main[mRow][mColumn]);
    }
}

// Host driver: fills the input matrix with ones, prints the input and the
// zeroed result, runs computeMean with one thread per input element, and
// prints the per-block sums copied back from the device.
//
// Returns 0 on success and -1 if a host allocation or any CUDA call fails.
int main() {
    mainArray *hMain = NULL, *dMain = NULL;
    resultArray *hResult = NULL, *dResult = NULL;

    // Buffer sizes in bytes. Each matrix is a contiguous run of int rows, so
    // the size is (number of rows) * (size of one row); the element type is
    // int, not a pointer.
    const size_t mSize = MAIN_SIZE * sizeof(mainArray);
    const size_t rSize = RESULT_SIZE * sizeof(resultArray);

    hMain = (mainArray *) malloc(mSize);
    hResult = (resultArray *) malloc(rSize);
    if (hMain == NULL || hResult == NULL) {
        printf("host allocation failed\n");
        free(hMain);    // free(NULL) is a no-op
        free(hResult);
        return -1;
    }

    // populate arrays: all ones, so every correct block sum is easy to spot
    for (int i = 0; i < MAIN_SIZE; i++) {
        for (int j = 0; j < MAIN_SIZE; j++) {
            hMain[i][j] = 1;
        }
    }
    memset(hResult, 0, rSize);

    printf("main\n");
    for (int i = 0; i < MAIN_SIZE; i++) {
        for (int j = 0; j < MAIN_SIZE; j++) {
            printf("%d  ", hMain[i][j]);
        }
        printf("\n");
    }

    printf("result\n");
    for (int i = 0; i < RESULT_SIZE; i++) {
        for (int j = 0; j < RESULT_SIZE; j++) {
            printf("%d  ", hResult[i][j]);
        }
        printf("\n");
    }

    // Every CUDA step runs only if all previous steps succeeded, so the first
    // failure is the one that gets reported and cleanup happens in one place.
    cudaError_t status = cudaMalloc(&dMain, mSize);
    if (status == cudaSuccess)
        status = cudaMalloc(&dResult, rSize);

    // Do memcopies to GPU
    if (status == cudaSuccess)
        status = cudaMemcpy(dMain, hMain, mSize, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpy(dResult, hResult, rSize, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        // One thread per block, one block per input element.
        dim3 block(1, 1);
        dim3 grid((MAIN_SIZE + block.x - 1) / block.x, (MAIN_SIZE + block.y - 1) / block.y);
        computeMean<<<grid, block>>>(dMain, dResult);
        // A kernel launch returns nothing; launch errors are picked up here.
        status = cudaGetLastError();
    }

    // Copy the result back to the host. This blocking copy waits for the
    // kernel and also reports errors raised while it was running. The kernel
    // only reads the input matrix, so that one is not copied back.
    if (status == cudaSuccess)
        status = cudaMemcpy(hResult, dResult, rSize, cudaMemcpyDeviceToHost);

    int exitCode = 0;
    if (status != cudaSuccess) {
        printf("cuda error: %s\n", cudaGetErrorString(status));
        exitCode = -1;
    } else {
        printf("success!\n");
        for (int i = 0; i < RESULT_SIZE; i++) {
            for (int j = 0; j < RESULT_SIZE; j++) {
                printf("%d  ", hResult[i][j]);
            }
            printf("\n");
        }
    }

    // Release everything on both the success and the failure path.
    free(hMain);
    free(hResult);
    cudaFree(dMain);    // cudaFree on a null pointer is a no-op
    cudaFree(dResult);
    return exitCode;
}
$ nvcc -arch=sm_35 -o t1324 t1324.cu
$ cuda-memcheck ./t1324
========= CUDA-MEMCHECK
main
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
result
0  0  0  0
0  0  0  0
0  0  0  0
0  0  0  0
success!
4  4  4  4
4  4  4  4
4  4  4  4
4  4  4  4
========= ERROR SUMMARY: 0 errors
$

A few other notes:

I changed the initialization data so I could quickly identify correct output.
Your code here:
```
int rRow = std::floor(static_cast<float>(mRow / 2)),
```
I believe this is not doing what you think. mRow / 2 is integer division as you have written it, so the subsequent cast to float and the call to floor have no effect. It's not hurting anything that I can see (I'm pretty sure you want integer division here), so I've left it as-is. If you want floating-point division, you need to start by casting one of your two integer operands to floating point. The code you have written does not do that; it casts the result of the integer division instead.

CUDA Block Mean

1 个答案: