我正在尝试创建一个双元素数组,其中每个元素都是元素的总和。但是,添加新值后,该值将丢失,并且向量vet再次用零填充,好像什么也没发生。
我尝试使用共享向量无济于事。还尝试同步该过程中涉及的线程。
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
__global__ void calcula_media(double *matriz, double *vet, int lin, int col) {
int i=blockDim.x*blockIdx.x+threadIdx.x;
int j=blockDim.y*blockIdx.y+threadIdx.y;
if(i<lin && j<col) {
vet[j]+=matriz[j*lin+i];
}
}
int main(int argc, char **argv) {
int i, j, lin, col;
double *matriz, *media;
double *matriz_d, *media_d;
// Reads the dimensions of the matrix
fscanf(stdin, "%d ", &lin);
fscanf(stdin, "%d\n", &col);
// Host variables
cudaMallocHost((void **)&matriz, lin*col*sizeof(double));
cudaMallocHost((void **)&media, col*sizeof(double));
// Device variables
cudaMalloc((void **)&matriz_d, lin*col*sizeof(double));
cudaMalloc((void **)&media_d, col*sizeof(double));
// Reads data from the input file
for(i=0; i<lin; i++) {
for(j=0; j<col; j++) {
fscanf(stdin, "%lf ", &(matriz[j*lin+i]));
}
}
// Copies content from matriz to matriz_d
cudaMemcpy(matriz_d, matriz, lin*col*sizeof(double), cudaMemcpyHostToDevice);
// Threads per block
dim3 tpb(16, 16);
// Grids per block
dim3 bpg((lin*col+tpb.x-1)/tpb.x, (lin*col+tpb.y-1)/tpb.y);
// Initializes media_d with zeros
cudaMemset(media_d, 0, col*sizeof(double));
// Launches the kernel
calcula_media<<<bpg, tpb>>>(matriz_d, media_d, lin, col);
// Copies the result back to media
cudaMemcpy(media, media_d, col*sizeof(double), cudaMemcpyDeviceToHost);
// Prints the result
for(i=0; i<col; i++)
printf("%.1lf ", media[i]);
// Frees the allocated memory
cudaFree(matriz);
cudaFree(media);
// Desaloca memória do device
cudaFree(matriz_d);
cudaFree(media_d);
return 0;
}
对于此输入:
6 4
1 7 13 19
2 8 14 20
3 9 15 21
4 10 16 22
5 11 17 23
6 12 18 24
我希望输出为:
21.0 (the result of 1+2+...+6) 57.0 (the result of 7+8+...+12) 93.0 (the result of 13+14+...+18) 129.0 (the result of 19+20+...24)
,
但实际输出为6.0 12.0 18.0 24.0
。