我正在尝试Cuda编程。作为其中的一部分,我试图开发一种在GPU上运行的矩阵乘法算法。该算法适用于方形矩阵,但对于非方形矩阵则失败。 这是我的内核
float* multiply_gpu(float* matrix1 , float* matrix2);
// Kernel: computes C = A * B for dense row-major matrices, one thread per
// output element. Expects at least rowsA*columnsB total threads.
// NOTE(review): this is the buggy version under discussion -- see the two
// flagged lines.
__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a,
float *b, float *result) {
// Flat global thread index doubles as the linear index into the result.
int index = blockIdx.x * blockDim.x + threadIdx.x;
int result_size = rowsA*columnsB;
// NOTE(review): accumulating float products into an int truncates every
// partial sum; only coincidentally correct for integer-valued test data.
int value = 0;//the final result
//indices of values from input matrices
if (index < result_size) {
// BUG: the base of the current row of A should be
// (index/columnsB)*columnsA. Dividing by rowsA and multiplying by rowsA
// is only equivalent when the matrices are square.
int index1 = (index/rowsA)*rowsA; //get nearest row
int index2 = index%columnsB; //get start column
int k = 0;
while (k<columnsA) { //columnsA == rowsB
value += a[index1]*b[index2]; //v = sum a_ik * b_kj
index1 ++;
index2 += columnsB;
k++;
}
result[index] = value;
}
}
在与我的主管进行简短的健全检查后,他没有看到任何问题。 我认为问题在于这个功能:
// Host wrapper: multiplies two row-major matrices on the GPU and returns a
// newly malloc'd result array (caller frees). Inputs carry their dimensions
// inline: [0] = height (rows), [1] = width (cols); data starts at index 2.
// Returns NULL when the inner dimensions do not match.
float* multiply_gpu(float* matrix1 , float* matrix2) {
//the dimensions of the matrices
size_t available, total;
cudaError_t error;
// Query free/total device memory; values are only used for the error check.
cudaError err = cudaMemGetInfo(&available, &total);
if(err != cudaSuccess){
printf("There was an error: %s\n", cudaGetErrorString(err));
}
// Dimensions are stored as floats inside the data arrays; these implicit
// float->int conversions are what produce the compiler warnings.
int height1 = matrix1[0];
int width1 = matrix1[1];
int height2 = matrix2[0];
int width2 = matrix2[1];
// (h1 x w1) * (h2 x w2) requires w1 == h2.
if (width1!=height2) {
return NULL;
}
//this array contains the result of the operation
// NOTE(review): malloc return value is not checked for NULL.
float* result = (float *) malloc(height1*width2*sizeof(float));
//pointers for device matrices
float *d_matrix1;
float *d_matrix2;
float *d_result;
//allocate memory for matrices
error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//now copy matrices onto device -- note the offset of 2
error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//launch multiplication kernel
//note I have tried adjusting the kernel values between <<< , >>> to no avail
// Grid: height1 blocks of width2 threads = one thread per output element.
// NOTE(review): no cudaGetLastError() after the launch, so a bad launch
// configuration (e.g. width2 > max threads per block) would go unnoticed.
mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result);
printf("%d %d %d %d\n",height1,width1,height2,width2);
//make the host block until mult is finished running
//printf("finished multiplying\n");
// NOTE(review): cudaDeviceSynchronize()'s return value is discarded, so a
// kernel-execution fault only surfaces at the next checked API call.
cudaDeviceSynchronize();
//copy result back
error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//free now unneeded cuda memory
cudaFree(d_matrix1);
cudaFree(d_matrix2);
cudaFree(d_result);
printf("GOT RESULT\n");
for (int i=0;i<height1*width2;i++) {
printf("%f ",result[i]);
}
printf("\n");
//result ready to be returned
return result;
}
请注意,作为multiply_gpu参数的矩阵的高度为索引0,宽度为索引1.结果矩阵没有此信息。
错误计算的一个例子: 当我将以下数组提供给multiply_gpu - {2,3, 1,2,3,4,5,6 },{3,2, 1,2,3,4 ,5,6 }答案应该是{22,28,49,64},而是我的单元测试生成{22,28,40,52}。很近!请注意,对于(1,2,3)*(1,2,3)(不是正方形)的点积,算法很高兴......这里的错误是什么?谢谢你的帮助。如果我独立找到一个解决方案,我会发布解决方案。
答案 0 :(得分:1)
这一行错了:
int index1 = (index/rowsA)*rowsA; //get nearest row
它应该是这样的:
int index1 = (index/columnsB)*columnsA; //get nearest row
为什么这个公式是正确的?index1 用于在 A 中索引与当前计算的输出矩阵位置对应的那一行的元素,而输出矩阵位置就是线程索引。如果我们把线程索引(整数)除以输出矩阵 C 的列数,就得到所求的行号;然后,要定位 A 中该行的第一个元素,再乘以 A 的列数。这样就正确地索引到了 A 中相关行的第一个元素。
这是一个完整的应用程序以及我的测试用例 - 我对您的代码所做的唯一更改是上面指出的更改。
$ cat t290.cu
#include <stdio.h>
// Kernel: computes C = A * B for dense row-major matrices, one thread per
// output element. Launch with at least rowsA*columnsB total threads; the
// bounds check makes extra threads harmless.
// Precondition: columnsA == rowsB.
__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a, float *b, float *result) {
    // Flat global index doubles as the linear (row-major) index into C.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int result_size = rowsA*columnsB;
    // Accumulate in float: the original 'int value' truncated every partial
    // sum and only happened to work for integer-valued test data.
    float value = 0.0f; //the final result
    //indices of values from input matrices
    if (index < result_size) {
        // Row of C is index/columnsB; that row of A begins at
        // row * columnsA, since A has columnsA entries per row.
        int index1 = (index/columnsB)*columnsA; //get nearest row
        int index2 = index%columnsB; //get start column
        int k = 0;
        while (k<columnsA) { //columnsA == rowsB
            value += a[index1]*b[index2]; //v = sum a_ik * b_kj
            index1 ++;
            index2 += columnsB; // step down column j of B, one row at a time
            k++;
        }
        result[index] = value;
    }
}
// Multiplies two row-major matrices on the GPU and returns a freshly
// malloc'd result array (caller owns and must free it).
//
// Input format: matrixN[0] = height (rows), matrixN[1] = width (cols);
// the matrix data starts at matrixN[2]. The result carries no such header.
// Returns NULL if the inner dimensions disagree (width1 != height2);
// exits the process on any CUDA or host allocation failure.
float* multiply_gpu(float* matrix1 , float* matrix2) {
    //the dimensions of the matrices
    size_t available, total;
    cudaError_t error;
    cudaError err = cudaMemGetInfo(&available, &total);
    if(err != cudaSuccess){
        printf("There was an error: %s\n", cudaGetErrorString(err));
    }
    // Dimensions travel as floats inside the data arrays; the implicit
    // float->int conversions are intentional (and warned about by nvcc).
    int height1 = matrix1[0];
    int width1 = matrix1[1];
    int height2 = matrix2[0];
    int width2 = matrix2[1];
    if (width1!=height2) {
        printf("fail!\n");
        return NULL;
    }
    //this array contains the result of the operation
    float* result = (float *) malloc(height1*width2*sizeof(float));
    if (result == NULL) {
        fprintf(stderr, "Failed to allocate host memory!\n");
        exit(EXIT_FAILURE);
    }
    //pointers for device matrices
    float *d_matrix1;
    float *d_matrix2;
    float *d_result;
    //allocate memory for matrices
    error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //now copy matrices onto device -- note the offset of 2 (skips the header)
    error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //launch multiplication kernel: height1 blocks of width2 threads each,
    //i.e. one thread per output element
    mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result);
    printf("%d %d %d %d\n",height1,width1,height2,width2);
    // The launch itself is asynchronous; this catches configuration errors.
    error = cudaGetLastError();
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //make the host block until mult is finished running
    //printf("finished multiplying\n");
    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) {
        fprintf(stderr, "kernel fail (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //copy result back
    error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //free now unneeded cuda memory
    cudaFree(d_matrix1);
    cudaFree(d_matrix2);
    cudaFree(d_result);
    printf("GOT RESULT\n");
    for (int i=0;i<height1*width2;i++) {
        printf("%f ",result[i]);
    }
    printf("\n");
    //result ready to be returned
    return result;
}
// Exercises multiply_gpu() on several shapes, including the question's
// failing non-square case (2x3)*(3x2) and its transpose.
// Results are malloc'd by multiply_gpu, so each one is freed here
// (free(NULL) is a no-op, covering the dimension-mismatch path).
int main(){
    // First two entries of every array are {height, width}; data follows.
    float m1[8] = {2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
    float m2[6] = {2.0, 2.0, 1.0, 1.0, 2.0, 2.0};
    float *my_result1 = multiply_gpu(m2, m1);   // (2x2)*(2x3) -> 2x3
    free(my_result1);
    float m3[8] = {2,3,1,2,3,4,5,6};
    float m4[8] = {3,2,1,2,3,4,5,6};
    float *my_result2 = multiply_gpu(m3, m4);   // (2x3)*(3x2) -> 2x2
    free(my_result2);
    float *my_result3 = multiply_gpu(m4, m3);   // (3x2)*(2x3) -> 3x3
    free(my_result3);
    float m5[12] = {2,5,1,1,1,1,1,1,1,1,1,1};
    float m6[22] = {5,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
    float *my_result4 = multiply_gpu(m5, m6);   // (2x5)*(5x4) -> 2x4
    free(my_result4);
    return 0;
}
$ nvcc -arch=sm_20 -o t290 t290.cu
t290.cu: In function 'float* multiply_gpu(float*, float*)':
t290.cu:30: warning: converting to 'int' from 'float'
t290.cu:31: warning: converting to 'int' from 'float'
t290.cu:32: warning: converting to 'int' from 'float'
t290.cu:33: warning: converting to 'int' from 'float'
$ cuda-memcheck ./t290
========= CUDA-MEMCHECK
2 2 2 3
GOT RESULT
5.000000 7.000000 9.000000 10.000000 14.000000 18.000000
2 3 3 2
GOT RESULT
22.000000 28.000000 49.000000 64.000000
3 2 2 3
GOT RESULT
9.000000 12.000000 15.000000 19.000000 26.000000 33.000000 29.000000 40.000000 51.000000
2 5 5 4
GOT RESULT
5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000
========= ERROR SUMMARY: 0 errors
$
答案 1 :(得分:0)
因此,在仔细检查我的矩阵代码后,我发现了所用数学公式中的一个简单错误。
这条线路确实是错误的
int index1 = (index/rowsA)*rowsA; //get nearest row
我注意到由于我的矩阵是按行排序的,因此从(i,j)元素获取正确索引的公式是
index = i*rowLength + j
因此,对index1的赋值应为
int index1 = (index/columnsB)*columnsA; //行号 = index/columnsB(输出矩阵每行有 columnsB 个元素)
为什么呢?很明显,要导航到行 n 的索引,我们必须移动 n 行长度(这是矩阵中的列数)。我的代码适用于方形矩阵,但不适用于其他矩形矩阵,因为列数与这种矩阵中的行数不匹配。