我想把一个 8×8 数组按 2×2 的块求平均值,并把结果存入一个 4×4 数组。类似这样:
我目前遇到了一个类似竞争条件(race condition)的问题:每次运行程序时,每个线程读到的值都不一样。现在我只关心把每个块内的所有元素加在一起,之后再把得到的和除以元素个数。下面是我的代码。
#include <stdio.h>
#include<math.h>
// Edge length of the square input matrix.
const int MAIN_SIZE = 8;
// Edge length of the square output matrix.
const int RESULT_SIZE = 4;
// One row of the input matrix; a `mainArray *` is indexed as [row][column].
typedef int mainArray[MAIN_SIZE];
// One row of the output matrix; a `resultArray *` is indexed as [row][column].
typedef int resultArray[RESULT_SIZE];
// Kernel: each thread handles one element (mRow, mColumn) of `main` and folds it
// into the `result` cell at (mRow / 2, mColumn / 2).
//   main   - input matrix, indexed [row][column] with MAIN_SIZE columns per row
//   result - output matrix, indexed [row][column] with RESULT_SIZE columns per row
// NOTE(review): four different (mRow, mColumn) pairs map to the same result cell,
// and the update below is a plain read-modify-write, so concurrent threads can
// race on that cell.
__global__ void computeMean(mainArray *main, resultArray *result) {
// Global column / row of this thread's element, from the 2D block and thread indices.
int mColumn = blockIdx.x * blockDim.x + threadIdx.x;
int mRow = blockIdx.y * blockDim.y + threadIdx.y;
// Threads that fall outside the MAIN_SIZE x MAIN_SIZE matrix do nothing.
if (mRow >= MAIN_SIZE || mColumn >= MAIN_SIZE)
return;
// real calculation
// mRow / 2 and mColumn / 2 are integer divisions; the float cast and floor are
// applied to the already-truncated quotient.
int rRow = std::floor(static_cast<float>(mRow / 2)),
rColumn = std::floor(static_cast<float>(mColumn / 2));
// x already contains the current result value, and `+=` adds x on top of it, so
// the cell becomes 2 * old + main[mRow][mColumn].
// NOTE(review): this looks unintended for a plain sum - confirm.
int x = result[rRow][rColumn] + main[mRow][mColumn];
result[rRow][rColumn] += x;
// Debug trace; re-reads the result cell after the update above.
printf("Adding %d on %d %d at location %d %d; new value: %d\n", main[mRow][mColumn], mRow, mColumn, rRow, rColumn, result[rRow][rColumn]);
}
// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    printf("cuda error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills the MAIN_SIZE x MAIN_SIZE input with 1..N, zeroes the
// RESULT_SIZE x RESULT_SIZE output, runs computeMean on the device and prints
// the matrices. Returns 0 on success and -1 on any allocation or CUDA failure.
int main() {
    mainArray *hMain = NULL, *dMain = NULL;
    resultArray *hResult = NULL, *dResult = NULL;

    // The matrices hold int elements, so size them with sizeof(int), not sizeof(int*).
    const size_t mSize = MAIN_SIZE * MAIN_SIZE * sizeof(int);
    const size_t rSize = RESULT_SIZE * RESULT_SIZE * sizeof(int);

    hMain = (mainArray *) malloc(mSize);
    hResult = (resultArray *) malloc(rSize);
    if (hMain == NULL || hResult == NULL) {
        printf("host allocation failed\n");
        free(hMain);
        free(hResult);
        return -1;
    }

    // populate arrays
    int k = 0;
    for (int i = 0; i < MAIN_SIZE; i++) {
        for (int j = 0; j < MAIN_SIZE; j++) {
            hMain[i][j] = ++k;
        }
    }
    memset(hResult, 0, rSize);

    printf("main\n");
    for (int i = 0; i < MAIN_SIZE; i++) {
        for (int j = 0; j < MAIN_SIZE; j++) {
            printf("%d ", hMain[i][j]);
        }
        printf("\n");
    }
    printf("result\n");
    for (int i = 0; i < RESULT_SIZE; i++) {
        for (int j = 0; j < RESULT_SIZE; j++) {
            printf("%d ", hResult[i][j]);
        }
        printf("\n");
    }

    // Allocate device memory and upload both matrices; stop at the first failure.
    bool ok = cudaOk(cudaMalloc(&dMain, mSize), "cudaMalloc dMain")
           && cudaOk(cudaMalloc(&dResult, rSize), "cudaMalloc dResult")
           && cudaOk(cudaMemcpy(dMain, hMain, mSize, cudaMemcpyHostToDevice), "copy main to device")
           && cudaOk(cudaMemcpy(dResult, hResult, rSize, cudaMemcpyHostToDevice), "copy result to device");

    if (ok) {
        // One thread per block, one block per input element.
        dim3 block(1, 1);
        dim3 grid((MAIN_SIZE + block.x - 1) / block.x, (MAIN_SIZE + block.y - 1) / block.y);
        computeMean<<<grid, block>>>(dMain, dResult);
        // Launch-configuration errors show up in cudaGetLastError(); faults during
        // execution show up at the next synchronizing call.
        ok = cudaOk(cudaGetLastError(), "computeMean launch")
          && cudaOk(cudaDeviceSynchronize(), "computeMean execution")
          // The kernel only reads dMain, so only the result is copied back.
          && cudaOk(cudaMemcpy(hResult, dResult, rSize, cudaMemcpyDeviceToHost), "copy result to host");
    }

    if (ok) {
        printf("success!\n");
        for (int i = 0; i < RESULT_SIZE; i++) {
            for (int j = 0; j < RESULT_SIZE; j++) {
                printf("%d ", hResult[i][j]);
            }
            printf("\n");
        }
    }

    // Release everything on both the success and the failure path.
    free(hMain);
    free(hResult);
    cudaFree(dMain);   // cudaFree on a null pointer performs no operation
    cudaFree(dResult);
    return ok ? 0 : -1;
}
我刚开始学习 CUDA,所以如果我从一开始就用错了方法,请告诉我(我觉得我的数组用法全都不对,但我不知道如何为二维数组动态分配空间)。提前谢谢。
答案 0 :(得分:2)
There are a few issues with this section of code:
int rRow = std::floor(static_cast<float>(mRow / 2)),
rColumn = std::floor(static_cast<float>(mColumn / 2));
int x = result[rRow][rColumn] + main[mRow][mColumn];
result[rRow][rColumn] += x;
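Because `rRow` and `rColumn` are found using integer division from `mRow` and `mColumn`, it should be evident that threads whose `mColumn` is either 0 or 1 will produce the same `rColumn` result (and likewise for `mRow` and `rRow`), and there are many other instances of duplication. As a result, you have threads that are:
- reading and writing the same `result` location in no defined order
- reading and writing the same `result` location at potentially the same time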
CUDA doesn't sort these hazards out for you. You must take specific programming steps to deal with them. (You also have a logical error in that you are adding the main data to the existing result data, then adding that sum again to your result data; I'm reasonably sure this is not what you want.) The two typical approaches to resolve the thread hazards would be either:
1. use atomic operations (e.g. `atomicAdd`) for the updates to `result`, or
2. use a classical parallel reduction, so that each output element is produced by a single, well-defined sequence of operations.
For simplicity, I'll present a reworked code that demonstrates the first method (atomics).
$ cat t1324.cu
#include <stdio.h>
#include<math.h>
// Edge length of the square input matrix.
const int MAIN_SIZE = 8;
// Edge length of the square output matrix.
const int RESULT_SIZE = 4;
// One row of the input matrix; a `mainArray *` is indexed as [row][column].
typedef int mainArray[MAIN_SIZE];
// One row of the output matrix; a `resultArray *` is indexed as [row][column].
typedef int resultArray[RESULT_SIZE];
// Kernel: one thread per input element. Each thread atomically adds its element
// of `main` into the `result` cell at (row / 2, column / 2), so every result
// cell ends up holding the sum of the input elements that map to it.
//   main   - input matrix, indexed [row][column] with MAIN_SIZE columns per row
//   result - output matrix, indexed [row][column] with RESULT_SIZE columns per row
__global__ void computeMean(mainArray *main, resultArray *result) {
    const int srcCol = blockIdx.x * blockDim.x + threadIdx.x;
    const int srcRow = blockIdx.y * blockDim.y + threadIdx.y;

    // Only threads that land inside the MAIN_SIZE x MAIN_SIZE matrix contribute.
    if (srcRow < MAIN_SIZE && srcCol < MAIN_SIZE) {
        // Integer division picks the output cell; the cast and floor are kept
        // from the original code and act on the already-truncated quotient.
        const int dstRow = std::floor(static_cast<float>(srcRow / 2));
        const int dstCol = std::floor(static_cast<float>(srcCol / 2));

        // atomicAdd replaces the earlier plain read-modify-write
        // (`int x = result + main; result += x;`), because several threads
        // update the same result cell.
        atomicAdd(&(result[dstRow][dstCol]), main[srcRow][srcCol]);
    }
}
// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    printf("cuda error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills the MAIN_SIZE x MAIN_SIZE input with ones, zeroes the
// RESULT_SIZE x RESULT_SIZE output, runs computeMean on the device and prints
// the matrices. Returns 0 on success and -1 on any allocation or CUDA failure.
int main() {
    mainArray *hMain = NULL, *dMain = NULL;
    resultArray *hResult = NULL, *dResult = NULL;

    // The matrices hold int elements, so size them with sizeof(int), not sizeof(int*).
    const size_t mSize = MAIN_SIZE * MAIN_SIZE * sizeof(int);
    const size_t rSize = RESULT_SIZE * RESULT_SIZE * sizeof(int);

    hMain = (mainArray *) malloc(mSize);
    hResult = (resultArray *) malloc(rSize);
    if (hMain == NULL || hResult == NULL) {
        printf("host allocation failed\n");
        free(hMain);
        free(hResult);
        return -1;
    }

    // populate arrays: every input element is 1 (the question's code used a
    // running counter here), which makes the expected block sums easy to check.
    for (int i = 0; i < MAIN_SIZE; i++) {
        for (int j = 0; j < MAIN_SIZE; j++) {
            hMain[i][j] = 1;
        }
    }
    memset(hResult, 0, rSize);

    printf("main\n");
    for (int i = 0; i < MAIN_SIZE; i++) {
        for (int j = 0; j < MAIN_SIZE; j++) {
            printf("%d ", hMain[i][j]);
        }
        printf("\n");
    }
    printf("result\n");
    for (int i = 0; i < RESULT_SIZE; i++) {
        for (int j = 0; j < RESULT_SIZE; j++) {
            printf("%d ", hResult[i][j]);
        }
        printf("\n");
    }

    // Allocate device memory and upload both matrices; stop at the first failure.
    bool ok = cudaOk(cudaMalloc(&dMain, mSize), "cudaMalloc dMain")
           && cudaOk(cudaMalloc(&dResult, rSize), "cudaMalloc dResult")
           && cudaOk(cudaMemcpy(dMain, hMain, mSize, cudaMemcpyHostToDevice), "copy main to device")
           && cudaOk(cudaMemcpy(dResult, hResult, rSize, cudaMemcpyHostToDevice), "copy result to device");

    if (ok) {
        // One thread per block, one block per input element.
        dim3 block(1, 1);
        dim3 grid((MAIN_SIZE + block.x - 1) / block.x, (MAIN_SIZE + block.y - 1) / block.y);
        computeMean<<<grid, block>>>(dMain, dResult);
        // Launch-configuration errors show up in cudaGetLastError(); faults during
        // execution show up at the next synchronizing call.
        ok = cudaOk(cudaGetLastError(), "computeMean launch")
          && cudaOk(cudaDeviceSynchronize(), "computeMean execution")
          // The kernel only reads dMain, so only the result is copied back.
          && cudaOk(cudaMemcpy(hResult, dResult, rSize, cudaMemcpyDeviceToHost), "copy result to host");
    }

    if (ok) {
        printf("success!\n");
        for (int i = 0; i < RESULT_SIZE; i++) {
            for (int j = 0; j < RESULT_SIZE; j++) {
                printf("%d ", hResult[i][j]);
            }
            printf("\n");
        }
    }

    // Release everything on both the success and the failure path.
    free(hMain);
    free(hResult);
    cudaFree(dMain);   // cudaFree on a null pointer performs no operation
    cudaFree(dResult);
    return ok ? 0 : -1;
}
$ nvcc -arch=sm_35 -o t1324 t1324.cu
$ cuda-memcheck ./t1324
========= CUDA-MEMCHECK
main
1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1
result
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
success!
4 4 4 4
4 4 4 4
4 4 4 4
4 4 4 4
========= ERROR SUMMARY: 0 errors
$
A few other notes:
Your code here:
int rRow = std::floor(static_cast<float>(mRow / 2)),
I believe this is not doing what you think. `mRow/2` is integer division as you have written it. The subsequent cast to `float` and then taking the `floor` has no effect, I don't think. It's not hurting anything that I can see (I'm pretty sure you want integer division here), so I've left it as-is. If you want floating-point division, you need to start by casting one of your two integer operands to floating point. The code you have written does not do that. (It casts the result.)