Cuda内核函数只改变矩阵的第一行

时间:2013-10-07 19:20:56

标签: matrix cuda parallel-processing

我试图总结两个矩阵a_h_1和a_h_2,并将结果写回a_h_1。但由于某种原因,我的内核函数不会更改除前N个元素之外的数组成员。例如,即使我写了[8] = 45,当它被复制回主机时也会打印为8。有什么问题?

#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda.h>
// Kernel that executes on the CUDA device
__global__ void matrix_summation(float *a, float *b, int M, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx<M*N)
    {
        a[idx] = blockIdx.x;
    }
}
// main routine that executes on the host
int main(void)
{
    float *a_h_1,*a_h_2, *a_d_1,*a_d_2; // Pointer to host & device arrays
    const int N = 5;
    const int M = 5;
    // Number of elements in arrays
    size_t size = (N * M) * sizeof(float);
    a_h_1 = (float *)malloc(size); // Allocate array1 on host
    a_h_2 = (float *)malloc(size); // Allocate array2 on host
    cudaMalloc((void **) &a_d_1, size); // Allocate array1 on device
    cudaMalloc((void **) &a_d_2, size); // Allocate array2 on device
    // Initialize host array and copy it to CUDA device
    for (int i=0; i<N*M; i++){
        a_h_1[i] = (float)i;
        a_h_2[i] = (float)i;
    }
    cudaMemcpy(a_d_1, a_h_1, size, cudaMemcpyHostToDevice);
    cudaMemcpy(a_d_2, a_h_2, size, cudaMemcpyHostToDevice);
    // Do calculation on device:
    int block_size = M;
    int n_blocks = (M*N)/block_size;
    matrix_summation <<< n_blocks, block_size >>> ( a_d_1,a_d_2, M, N));
    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h_1, a_d_1, sizeof(float)*N, cudaMemcpyDeviceToHost);
    // Print results
    printf("\n\nROW 1 \n");
    for (int i=0; i<(M*N); i++)
    {
        printf(" %f ", a_h_1[i]);       
        if((i+1)%N == 0)
        {
            printf("\nROW %d \n", ((i+1)/N)+1);
        }
    }
    // Cleanup
    free(a_h_1);
    free(a_h_2);
    cudaFree(a_d_1);
    cudaFree(a_d_2);
    system("pause");
}

这是输出:

ROW 1
0.0 2.0 4.0 6.0 8.0    < this line is correct but others are not
ROW 2
5.0 6.0 7.0 8.0 9.0
ROW 3
10.0 11.0 12.0 13.0 14.0
ROW 4
15.0 16.0 17.0 18.0 19.0
ROW 5
20.0 21.0 22.0 23.0 24.0

1 个答案:

答案 0 :(得分:1)

您好像没有将所有设备阵列复制到主机阵列。在这一行:

cudaMemcpy(a_h_1, a_d_1, sizeof(float)*N, cudaMemcpyDeviceToHost);

我认为您打算复制sizeof(float)*N*M