我刚刚学习了GPU编程,现在我有一项任务是通过在CUDA上并行来从100x100矩阵中找到最小值。我试过这段代码,但它没有显示答案,而是显示我的初始值hmin = 9999999
。有人能给我正确的代码吗?哦,代码是C语言。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define size (100*100)
//Kernel Functions & Variable
__global__ void FindMin(int* mat[100][100],int* kmin){
int b=blockIdx.x+threadIdx.x*blockDim.x;
int k=blockIdx.y+threadIdx.y*blockDim.y;
if(mat[b][k] < kmin){
kmin = mat[b][k];
}
}
int main(int argc, char *argv[]) {
//Declare Variabel
int i,j,hmaks=0,hmin=9999999,hsumin,hsumax; //Host Variable
int *da[100][100],*dmin,*dmaks,*dsumin,*dsumax; // Device Variable
FILE *baca; //for opening txt file
char buf[4]; //used for fscanf
int ha[100][100],b; //matrix shall be filled by "b"
//1: Read txt File
baca=fopen("MatrixTubes1.txt","r");
if (!baca){
printf("Hey, it's not even exist"); //Checking File, is it there?
}
i=0;j=0; //Matrix index initialization
if(!feof(baca)){ //if not end of file then do
for(i = 0; i < 100; i++){
for(j = 0; j < 100; j++){
fscanf(baca,"%s",buf); //read max 4 char
b=atoi(buf); //parsing from string to integer
ha[i][j]=b; //save it to my matrix
}
}
}
fclose(baca);
//all file has been read
//time to close the file
//Sesi 2: Allocation data di GPU
cudaMalloc((void **)&da, size*sizeof(int));
cudaMalloc((void **)&dmin, sizeof(int));
cudaMalloc((void **)&dmaks, sizeof(int));
cudaMalloc((void **)&dsumin, sizeof(int));
cudaMalloc((void **)&dsumax, sizeof(int));
//Sesi 3: Copy data to Device
cudaMemcpy(da, &ha, size*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dmin, &hmin, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dmaks, &hmaks, sizeof(int), cudaMemcpyHostToDevice);
//Sesi 4: Call Kernel
FindMin<<<100,100,1>>>(da,dmin);
//5: Copy from Device to Host
cudaMemcpy(&hmin, dmin, sizeof(int), cudaMemcpyDeviceToHost);
//6: Print that value
printf("Minimum Value = %i \n",hmin);
system("pause"); return 0;
}
这是我的结果
Minimum Value = 9999999
Press any key to continue . . .
答案 0 :(得分:-1)
我在您的代码中看到了一些问题。
b
和k
的范围仅为0到99,因为threadIdx.x
始终为零。kmin = mat[b][k]
中的竞争条件(顺便说一下,它应该是*kmin
)。修复索引问题后,同一块中的所有线程将同时写入全局内存中的位置。您应该使用atomicMin()
来并行查找最小值。