CUDA Matrix Multiplication Using Parallel Reduction

Time: 2018-02-07 06:18:28

Tags: c++ c cuda nvidia

I am very new to CUDA programming and wanted to try implementing matrix multiplication using parallel reduction. I came up with this code and would like clarification on the following:

  1. Why the code returns incorrect results.
  2. Why it takes much longer to run than the shared-memory approach described on page 25 of Chapter 3 of the CUDA C Programming Guide.
  3. For reference, I am running it on an NVIDIA GeForce GTX 675M, which has compute capability 2.1.

    #include <cuda_runtime_api.h>
    #include "device_launch_parameters.h"
    
    
    #include <cuda.h>
    #include <device_functions.h>
    
    #include "cuda_runtime.h"
    
    #include <stdlib.h>
    #include <stdio.h>
    #include <time.h>
    #include <math.h>
    #define BLOCK_OPT 1024
    typedef struct {
        int width;
        int height;
        int stride;
        float* elements;
    }Matrix;
    
    
    
    
    
    __global__ void MatMulOPT(Matrix A, Matrix B, Matrix C)
    {
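        // One thread block per element of C: thread e multiplies A[row][e] by B[e][col],
        // and the block then reduces the 1024 partial products in shared memory.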
        __shared__ float result[BLOCK_OPT];
        int e = threadIdx.x;
        int row = blockIdx.x;
        int col = blockIdx.y;
    
        result[e] = A.elements[row*A.stride + e] * B.elements[e*B.stride + col];
        __syncthreads();
        if (e < 512)
        {
            result[e] += result[e + 512];
        }
        __syncthreads();
        if (e < 256)
        {
            result[e] += result[e + 256];
        }
        __syncthreads();
        if (e < 128)
        {
            result[e] += result[e + 128];
        }
        __syncthreads();
        if (e < 64)
        {
            result[e] += result[e + 64];
        }
        __syncthreads();
    
        if (e < 32)
        {
            result[e] += result[e + 32];
            result[e] += result[e + 16];
            result[e] += result[e + 8];
            result[e] += result[e + 4];
            result[e] += result[e + 2];
            result[e] += result[e + 1];
        }
    
        if (e == 0)C.elements[row*C.stride + col] = result[0];
    
    
    }
    void MatMulCPU(Matrix A, Matrix B, Matrix C)
    {
        for (int i = 0; i < A.height; i++)
        {
            for (int j = 0; j < B.width; j++)
            {
                for (int k = 0; k < B.height; k++)
                {
                    C.elements[i*C.stride + j] += A.elements[i*A.stride+k] * B.elements[k*B.stride + j];
                }
            }
        }
    
    }
    
    float randomFloat()
    {
        return (float)rand() / (float)RAND_MAX;
    }
    
    int main()
    {
        clock_t begin, end;
    
        srand(time(NULL));
    
        //Common Setup
        float cpu_t = 0, gpu_t = 0;
        int x = 1024;
        int N = x * x;
        size_t size = N * sizeof(float);
        Matrix A;
        A.width = x;
        A.stride = x;
        A.height = x;
        A.elements = (float*)malloc(size);
    
        for (int i = 0; i < N; i++)
            A.elements[i] = randomFloat();
    
        Matrix B;
        B.width = x;
        B.stride = x;
        B.height = x;
        B.elements = (float*)malloc(size);
    
        for (int j = 0; j < N; j++)
            B.elements[j] = randomFloat();
    
        Matrix C;
        C.width = x;
        C.stride = x;
        C.height = x;
        C.elements = (float*)malloc(size);
        for (int k = 0; k < N; k++)
            C.elements[k] = 0;
    
        Matrix D;
        D.width = x;
        D.stride = x;
        D.height = x;
        D.elements = (float*)malloc(size);
        for (int l = 0; l < N; l++)
            D.elements[l] = 0;
    
        //Execute CPU code & time it
        begin = clock();
        MatMulCPU(A, B, D);
        end = clock();
        cpu_t = (float)end - begin;
    
    
        // GPU setup
    
        Matrix d_A, d_B, d_C;
        d_A.width = x;
        d_A.stride = x;
        d_A.height = x;
        d_B.width = x;
        d_B.stride = x;
        d_B.height = x;
        d_C.width = x;
        d_C.stride = x;
        d_C.height = x;
    
    
        cudaMalloc(&d_A.elements, size);
        cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
        cudaMalloc(&d_B.elements, size);
        cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
        cudaMalloc(&d_C.elements, size);
        cudaMemcpy(d_C.elements, C.elements, size, cudaMemcpyHostToDevice);
    
    
        //Special Parameters
        int optBlockSize = BLOCK_OPT;
        dim3 optDimGrid(x, x);
    
        //Execute GPU Kernel and time it
        begin = clock();
        cudaThreadSynchronize();
        MatMulOPT<<<optDimGrid, optBlockSize>>>(d_A, d_B, d_C);
        cudaThreadSynchronize();
        end = clock();
        gpu_t = (float)end - begin;
    
        cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
    
    
    
    
        //Do a memcheck 
        bool passed = true;
        for (int k = 0; k < N; k++)
        {
            //printf("%f ",C.elements[k]);
            if (fabs(C.elements[k] -D.elements[k] ) > 1e-6)
            {
    
                passed = false;
                printf("\nFAIL\n");
                printf("C[%d] = %f, D[%d] = %f\n",k,C.elements[k],k,D.elements[k]);
                break;
            }
    
        }
        printf("\n");
        if (passed)
            printf("PASSED\n");
    
        printf("CPU Elapsed Time: %f\n", cpu_t);
        printf("GPU Elapsed Time: %f\n", gpu_t);
    
        //Clear all GPU memory
        cudaFree(d_A.elements);
        cudaFree(d_B.elements);
        cudaFree(d_C.elements);
        //Clear all CPU memory
        free(A.elements);
        free(B.elements);
        free(C.elements);
        free(D.elements);
    
    }
    

1 Answer:

Answer (score: 3)

  
      
  1. Why the code returns incorrect results.

In the last stage of the reduction (e < 32) you break with your own approach: there is no synchronization between the steps any more. This leads to race conditions on the same result elements, for example:

    result[e] += result[e + 16];

For e == 0 this statement means "read result[16]", while for e == 16 the same statement in the same step, at the same time, means "write result[16]".

To avoid the race condition you have two options:

  • Use a volatile pointer for the shared memory, similar to your documentation link (page 78) [edited]; a sketch of this option follows the snippet below.
  • Keep the if (e < ...) steps as before, or convert all of the ifs into a loop:

    for (int size = blockDim.x / 2; size > 0; size /= 2)
      if (e < size)
        ...
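
A minimal sketch of the first option (not part of the original answer): factor the unsynchronized 32-thread tail into a helper that works through a volatile pointer, so every access really goes to shared memory. The name warpReduce is only illustrative, and the pattern relies on the lockstep warp execution of the asker's compute capability 2.1 GPU:

    __device__ void warpReduce(volatile float *res, int e)
    {
        // volatile forces each read and write to go to shared memory instead of
        // staying in a register, so every lane sees the updates the other lanes
        // made in the previous line.
        res[e] += res[e + 32];
        res[e] += res[e + 16];
        res[e] += res[e + 8];
        res[e] += res[e + 4];
        res[e] += res[e + 2];
        res[e] += res[e + 1];
    }

The tail of MatMulOPT would then become if (e < 32) warpReduce(result, e); followed by the existing write of result[0]. If you prefer the loop form instead, keeping a __syncthreads() after every iteration (outside the if) preserves correctness for the steps above warp size, just like the original chain of if blocks does.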
    
  
      
  2. Why it takes much longer to run than the shared-memory approach described on page 25 of Chapter 3 of the CUDA C Programming Guide.

Accessing shared memory is much faster than accessing global memory. You store your intermediate results in shared memory, whereas the example you refer to stores the parts of the input matrices that are about to be read. There this is combined with loop tiling: each thread loads only a single element of the whole tile from global memory, but later reads TILE_WIDTH * 2 elements from shared memory.

The higher performance of the example comes directly from the reduced time spent waiting for data to be loaded from global memory.
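
For illustration only (this sketch is not part of the original answer), a tiled shared-memory kernel in the spirit of the Guide's example could look as follows. It assumes a hypothetical TILE_WIDTH of 16 that evenly divides the 1024x1024 matrices and reuses the Matrix struct from the question:

    #define TILE_WIDTH 16

    // Each thread computes one element of C. Per loop iteration the block
    // cooperatively stages one tile of A and one tile of B in shared memory.
    __global__ void MatMulTiled(Matrix A, Matrix B, Matrix C)
    {
        __shared__ float As[TILE_WIDTH][TILE_WIDTH];
        __shared__ float Bs[TILE_WIDTH][TILE_WIDTH];

        int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
        int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
        float acc = 0.0f;

        for (int t = 0; t < A.width / TILE_WIDTH; ++t)
        {
            // Each thread loads exactly one element of each tile from global memory...
            As[threadIdx.y][threadIdx.x] = A.elements[row * A.stride + t * TILE_WIDTH + threadIdx.x];
            Bs[threadIdx.y][threadIdx.x] = B.elements[(t * TILE_WIDTH + threadIdx.y) * B.stride + col];
            __syncthreads();

            // ...but then reads 2 * TILE_WIDTH elements back from fast shared memory.
            for (int k = 0; k < TILE_WIDTH; ++k)
                acc += As[threadIdx.y][k] * Bs[k][threadIdx.x];
            __syncthreads();
        }

        C.elements[row * C.stride + col] = acc;
    }

A launch for the question's matrices would use dim3 dimBlock(TILE_WIDTH, TILE_WIDTH); dim3 dimGrid(x / TILE_WIDTH, x / TILE_WIDTH); so that each block produces a 16x16 tile of C and no per-element reduction is needed at all.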