CUDA Block Mean

时间:2017-04-09 16:27:54

标签: multidimensional-array cuda race-condition

我试图在4 * 4阵列上找出8 * 8阵列的块平均值。这样的事情:enter image description here

我目前陷入竞争条件类型难题,每次运行程序时每个线程读取的值都会变化。现在我唯一关心的是将所有块元素添加到一起,稍后我将除以得到的总和。这是我的代码。

#include <stdio.h>
#include<math.h>

const int MAIN_SIZE = 8;
const int RESULT_SIZE = 4;

typedef int mainArray[MAIN_SIZE];
typedef int resultArray[RESULT_SIZE];

__global__ void computeMean(mainArray *main, resultArray *result) {
    int mColumn = blockIdx.x * blockDim.x + threadIdx.x;
    int mRow = blockIdx.y * blockDim.y + threadIdx.y;
    if (mRow >= MAIN_SIZE || mColumn >= MAIN_SIZE)
        return;

    // real calculation
    int rRow = std::floor(static_cast<float>(mRow / 2)),
        rColumn = std::floor(static_cast<float>(mColumn / 2));
    int x = result[rRow][rColumn] + main[mRow][mColumn];
    result[rRow][rColumn] += x;
    printf("Adding %d on %d %d at location %d %d; new value: %d\n", main[mRow][mColumn], mRow, mColumn, rRow, rColumn, result[rRow][rColumn]);
}

int main() {
    mainArray *hMain, *dMain;
    resultArray *hResult, *dResult;
    size_t mSize = MAIN_SIZE * MAIN_SIZE * sizeof(int*);
    size_t rSize = RESULT_SIZE * RESULT_SIZE * sizeof(int*);
    hMain = (mainArray *) malloc (mSize);
    hResult = (resultArray *) malloc (rSize);

    // populate arrays
    int k = 0;
    for(int i = 0; i < MAIN_SIZE; i++) {
        for(int j = 0; j < MAIN_SIZE; j++) {
            hMain[i][j] = ++k;
        }
    }
    memset(hResult, 0, rSize);

    printf("main\n");
    for(int i = 0; i < MAIN_SIZE; i++) {
        for(int j = 0; j < MAIN_SIZE; j++) {
            printf("%d  ", hMain[i][j]);
        }
        printf("\n");
    }

    printf("result\n");
    for(int i = 0; i < RESULT_SIZE; i++) {
        for(int j = 0; j < RESULT_SIZE; j++) {
            printf("%d  ", hResult[i][j]);
        }
        printf("\n");
    }

    // Allocate memory on device
    cudaMalloc(&dMain, mSize);
    cudaMalloc(&dResult, rSize);

    // Do memcopies to GPU
    cudaMemcpy(dMain, hMain, mSize, cudaMemcpyHostToDevice);
    cudaMemcpy(dResult, hResult, rSize, cudaMemcpyHostToDevice);

    dim3 block(1, 1);
    dim3 grid ((MAIN_SIZE + block.x - 1) / block.x, (MAIN_SIZE + block.y - 1) / block.y);
    computeMean<<<grid, block>>>(dMain, dResult);

    // Do memcopies back to host
    cudaMemcpy(hMain, dMain, mSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(hResult, dResult, rSize, cudaMemcpyDeviceToHost);

    // validate
    if (cudaGetLastError() != cudaSuccess) {printf("cuda error\n"); return -1;}

    printf("success!\n");
    for(int i = 0; i < RESULT_SIZE; i++) {
        for(int j = 0; j < RESULT_SIZE; j++) {
            printf("%d  ", hResult[i][j]);
        }
        printf("\n");
    }

    free(hMain);
    free(hResult);
    cudaFree(dMain);
    cudaFree(dResult);
    return 0;
}

我目前是CUDA的新手,所以如果我从一开始就使用错误的方法请告诉我(我认为我的阵列都是错的,但我无法为二维分配动态空间)。提前谢谢。

1 个答案:

答案 0 :(得分:2)

There are a few issues with this section of code:

int rRow = std::floor(static_cast<float>(mRow / 2)),
    rColumn = std::floor(static_cast<float>(mColumn / 2));
int x = result[rRow][rColumn] + main[mRow][mColumn];
result[rRow][rColumn] += x;

Because rRow and rColumn are found using integer division from mRow and mColumn, it should be evident that threads whose threadIdx.x is either 0 or 1 will produce the same rColumn result, and there are many other instances of duplication. As a result, you have threads that are:

  1. writing to the same location without any order or control
  2. reading from a location that other threads may be writing to

CUDA doesn't sort these hazards out for you. You must take specific programming steps to deal with them. (You also have a logical error in that you are adding the main data to the existing result data, then adding that sum again to your result data; I'm reasonably sure this is not what you want.) The two typical approaches to resolve the thread hazards would be either:

  1. Use atomics
  2. Use a classical parallel reduction method

For simplicity, I'll present a reworked code that demonstrates the first method.

$ cat t1324.cu
#include <stdio.h>
#include<math.h>

const int MAIN_SIZE = 8;
const int RESULT_SIZE = 4;

typedef int mainArray[MAIN_SIZE];
typedef int resultArray[RESULT_SIZE];

__global__ void computeMean(mainArray *main, resultArray *result) {
    int mColumn = blockIdx.x * blockDim.x + threadIdx.x;
    int mRow = blockIdx.y * blockDim.y + threadIdx.y;
    if (mRow >= MAIN_SIZE || mColumn >= MAIN_SIZE)
        return;

    // real calculation
    int rRow = std::floor(static_cast<float>(mRow / 2)),
        rColumn = std::floor(static_cast<float>(mColumn / 2));
    //int x = result[rRow][rColumn] + main[mRow][mColumn];
    //result[rRow][rColumn] += x;
    atomicAdd(&(result[rRow][rColumn]), main[mRow][mColumn]);
    //printf("Adding %d on %d %d at location %d %d; new value: %d\n", main[mRow][mColumn], mRow, mColumn, rRow, rColumn, result[rRow][rColumn]);
}

int main() {
    mainArray *hMain, *dMain;
    resultArray *hResult, *dResult;
    size_t mSize = MAIN_SIZE * MAIN_SIZE * sizeof(int*);
    size_t rSize = RESULT_SIZE * RESULT_SIZE * sizeof(int*);
    hMain = (mainArray *) malloc (mSize);
    hResult = (resultArray *) malloc (rSize);

    // populate arrays
    //int k = 0;
    for(int i = 0; i < MAIN_SIZE; i++) {
        for(int j = 0; j < MAIN_SIZE; j++) {
            hMain[i][j] = 1; //++k;
        }
    }
    memset(hResult, 0, rSize);

    printf("main\n");
    for(int i = 0; i < MAIN_SIZE; i++) {
        for(int j = 0; j < MAIN_SIZE; j++) {
            printf("%d  ", hMain[i][j]);
        }
        printf("\n");
    }

    printf("result\n");
    for(int i = 0; i < RESULT_SIZE; i++) {
        for(int j = 0; j < RESULT_SIZE; j++) {
            printf("%d  ", hResult[i][j]);
        }
        printf("\n");
    }

    // Allocate memory on device
    cudaMalloc(&dMain, mSize);
    cudaMalloc(&dResult, rSize);

    // Do memcopies to GPU
    cudaMemcpy(dMain, hMain, mSize, cudaMemcpyHostToDevice);
    cudaMemcpy(dResult, hResult, rSize, cudaMemcpyHostToDevice);

    dim3 block(1, 1);
    dim3 grid ((MAIN_SIZE + block.x - 1) / block.x, (MAIN_SIZE + block.y - 1) / block.y);
    computeMean<<<grid, block>>>(dMain, dResult);

    // Do memcopies back to host
    cudaMemcpy(hMain, dMain, mSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(hResult, dResult, rSize, cudaMemcpyDeviceToHost);

    // validate
    if (cudaGetLastError() != cudaSuccess) {printf("cuda error\n"); return -1;}

    printf("success!\n");
    for(int i = 0; i < RESULT_SIZE; i++) {
        for(int j = 0; j < RESULT_SIZE; j++) {
            printf("%d  ", hResult[i][j]);
        }
        printf("\n");
    }

    free(hMain);
    free(hResult);
    cudaFree(dMain);
    cudaFree(dResult);
    return 0;
}
$ nvcc -arch=sm_35 -o t1324 t1324.cu
$ cuda-memcheck ./t1324
========= CUDA-MEMCHECK
main
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
1  1  1  1  1  1  1  1
result
0  0  0  0
0  0  0  0
0  0  0  0
0  0  0  0
success!
4  4  4  4
4  4  4  4
4  4  4  4
4  4  4  4
========= ERROR SUMMARY: 0 errors
$

A few other notes:

  1. I changed the initialization data so I could quickly identify correct output.
  2. Your code here:

    int rRow = std::floor(static_cast<float>(mRow / 2)),
    

    I believe is not doing what you think. mRow/2 is integer division as you have written it. The subsequent cast to float and then taking the floor has no effect, I don't think. It's not hurting anything that I can see (I'm pretty sure you want integer division here), so I've left it as-is. If you want floating-point division, you need to start by casting one of your two integer operands to floating point. The code you have written does not do that. (It casts the result.)