CUDA Zero Copy vs. cudaMemcpy on the Jetson TK1

Posted: 2016-04-21 02:38:06

Tags: c++ mobile cuda gpu

My question: I am looking for someone to either point out a mistake in the way I am trying to implement zero copy in CUDA, or to offer a more "behind the scenes" perspective on why the zero-copy approach is not faster than the memcpy approach. By the way, I am running my tests on NVIDIA's TK1 processor, under Ubuntu.

My problem has to do with using CUDA to make efficient use of the NVIDIA TK1's physically unified memory architecture. NVIDIA provides two methods for abstracting GPU/CPU memory transfers:

  1. Unified memory abstraction (using cudaHostAlloc & cudaHostGetDevicePointer)
  2. Explicit copies to and from the device (using cudaMalloc() & cudaMemcpy)

A short description of my test code: I tested the same CUDA kernel with both methods 1 and 2. Given that there is no copy of the source data to the device and no copy of the result data back from the device, I expected method 1 to be faster. However, the results went against my assumption (method #1 is 50% slower). Here is the code for my test:

    #include <libfreenect/libfreenect.hpp>
    #include <iostream>
    #include <vector>
    #include <cmath>
    #include <pthread.h>
    #include <cxcore.h>
    #include <time.h>
    #include <sys/time.h>
    #include <memory.h>
    ///CUDA///
    #include <cuda.h>
    #include <cuda_runtime.h>
    
     ///OpenCV 2.4
    #include <highgui.h>
    #include <cv.h>
    #include <opencv2/gpu/gpu.hpp>
    
    using namespace cv;
    using namespace std;
    
    ///The Test Kernel///
    __global__ void cudaCalcXYZ( float *dst, float *src, float *M, int height, int width, float scaleFactor, int minDistance)
    {
        float nx,ny,nz, nzpminD, jFactor;
        int heightCenter = height / 2;
        int widthCenter = width / 2;
        //int j = blockIdx.x;   //Represents which row we are in
        int index = blockIdx.x*width;
        jFactor = (blockIdx.x - heightCenter)*scaleFactor;
        for(int i= 0; i < width; i++)
        {
            nz = src[index];
            nzpminD = nz + minDistance;
            nx = (i - widthCenter )*(nzpminD)*scaleFactor;      
            ny = (jFactor)*(nzpminD);   
            //Solve for only Y matrix (height values)
             dst[index++] = nx*M[4] + ny*M[5] + nz*M[6];
            //dst[index++] = 1 + 2 + 3;
        }
    }
    
    //Function fwd declarations
    double getMillis();
    double getMicros();
    void runCudaTestZeroCopy(int iter, int cols, int rows);
    void runCudaTestDeviceCopy(int iter, int cols, int rows);
    
    int main(int argc, char **argv) {
    
        //ZERO COPY FLAG (allows runCudaTestZeroCopy to run without fail)
        cudaSetDeviceFlags(cudaDeviceMapHost);
    
        //Runs kernel using explicit data copy to 'device' and back from 'device'
        runCudaTestDeviceCopy(20, 640,480);
        //Uses 'unified memory' cuda abstraction so device can directly work from host data
        runCudaTestZeroCopy(20,640, 480);
    
        std::cout << "Stopping test" << std::endl;
    
        return 0;
    }
    
    void runCudaTestZeroCopy(int iter, int cols, int rows)
    {
        cout << "CUDA Test::ZEROCOPY" << endl;
            int src_rows = rows;
            int src_cols = cols;
            int m_rows = 4;
            int m_cols = 4;
            int dst_rows = src_rows;
            int dst_cols = src_cols;
            //Create and allocate memory for host mats pointers
            float *psrcMat;
            float *pmMat;
            float *pdstMat;
            cudaHostAlloc((void **)&psrcMat, src_rows*src_cols*sizeof(float), cudaHostAllocMapped);
            cudaHostAlloc((void **)&pmMat, m_rows*m_cols*sizeof(float), cudaHostAllocMapped);
            cudaHostAlloc((void **)&pdstMat, dst_rows*dst_cols*sizeof(float), cudaHostAllocMapped);
            //Create mats using host pointers
            Mat src_mat = Mat(cvSize(src_cols, src_rows), CV_32FC1, psrcMat);
            Mat m_mat   = Mat(cvSize(m_cols, m_rows), CV_32FC1, pmMat);
            Mat dst_mat = Mat(cvSize(dst_cols, dst_rows), CV_32FC1, pdstMat);
    
            //configure src and m mats
            for(int i = 0; i < src_rows*src_cols; i++)
            {
                psrcMat[i] = (float)i;
            }
            for(int i = 0; i < m_rows*m_cols; i++)
            {
                pmMat[i] = 0.1234;
            }
            //Create pointers to dev mats
            float *d_psrcMat;
            float *d_pmMat;
            float *d_pdstMat;
            //Map device to host pointers
            cudaHostGetDevicePointer((void **)&d_psrcMat, (void *)psrcMat, 0);
            //cudaHostGetDevicePointer((void **)&d_pmMat, (void *)pmMat, 0);
            cudaHostGetDevicePointer((void **)&d_pdstMat, (void *)pdstMat, 0);
            //Copy matrix M to device
            cudaMalloc( (void **)&d_pmMat, sizeof(float)*4*4 ); //4x4 matrix
            cudaMemcpy( d_pmMat, pmMat, sizeof(float)*m_rows*m_cols, cudaMemcpyHostToDevice);
    
            //Additional Variables for kernels
            float scaleFactor = 0.0021;
            int minDistance = -10;
    
            //Run kernel! //cudaSimpleMult( float *dst, float *src, float *M, int width, int height)
            int blocks = src_rows;
            const int numTests = iter;
            double perfStart = getMillis();
    
            for(int i = 0; i < numTests; i++)
            {           
                //cudaSimpleMult<<<blocks,1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_cols, src_rows);
                cudaCalcXYZ<<<blocks,1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_rows, src_cols, scaleFactor, minDistance);
                cudaDeviceSynchronize();
            }
            double perfStop = getMillis();
            double perfDelta = perfStop - perfStart;
            cout << "Ran " << numTests << " iterations totaling " << perfDelta << "ms" << endl;
            cout << " Average time per iteration: " << (perfDelta/(float)numTests) << "ms" << endl;
    
            //Copy result back to host
            //cudaMemcpy(pdstMat, d_pdstMat, sizeof(float)*src_rows*src_cols, cudaMemcpyDeviceToHost);
            //cout << "Printing results" << endl;
            //for(int i = 0; i < 16*16; i++)
            //{
            //  cout << "src[" << i << "]= " << psrcMat[i] << " dst[" << i << "]= " << pdstMat[i] << endl;
            //}
    
            cudaFree(d_psrcMat);
            cudaFree(d_pmMat);
            cudaFree(d_pdstMat);
            cudaFreeHost(psrcMat);
            cudaFreeHost(pmMat);
            cudaFreeHost(pdstMat);
    }
    
    void runCudaTestDeviceCopy(int iter, int cols, int rows)
    {
            cout << "CUDA Test::DEVICE COPY" << endl;
            int src_rows = rows;
            int src_cols = cols;
            int m_rows = 4;
            int m_cols = 4;
            int dst_rows = src_rows;
            int dst_cols = src_cols;
            //Create and allocate memory for host mats pointers
            float *psrcMat;
            float *pmMat;
            float *pdstMat;
            cudaHostAlloc((void **)&psrcMat, src_rows*src_cols*sizeof(float), cudaHostAllocMapped);
            cudaHostAlloc((void **)&pmMat, m_rows*m_cols*sizeof(float), cudaHostAllocMapped);
            cudaHostAlloc((void **)&pdstMat, dst_rows*dst_cols*sizeof(float), cudaHostAllocMapped);
            //Create pointers to dev mats
            float *d_psrcMat;
            float *d_pmMat;
            float *d_pdstMat;
            cudaMalloc( (void **)&d_psrcMat, sizeof(float)*src_rows*src_cols ); 
            cudaMalloc( (void **)&d_pdstMat, sizeof(float)*src_rows*src_cols );
            cudaMalloc( (void **)&d_pmMat, sizeof(float)*4*4 ); //4x4 matrix
            //Create mats using host pointers
            Mat src_mat = Mat(cvSize(src_cols, src_rows), CV_32FC1, psrcMat);
            Mat m_mat   = Mat(cvSize(m_cols, m_rows), CV_32FC1, pmMat);
            Mat dst_mat = Mat(cvSize(dst_cols, dst_rows), CV_32FC1, pdstMat);
    
            //configure src and m mats
            for(int i = 0; i < src_rows*src_cols; i++)
            {
                psrcMat[i] = (float)i;
            }
            for(int i = 0; i < m_rows*m_cols; i++)
            {
                pmMat[i] = 0.1234;
            }
    
            //Additional Variables for kernels
            float scaleFactor = 0.0021;
            int minDistance = -10;
    
            //Run kernel! //cudaSimpleMult( float *dst, float *src, float *M, int width, int height)
            int blocks = src_rows;
    
            double perfStart = getMillis();
            for(int i = 0; i < iter; i++)
            {           
                //Copy from host to device
                cudaMemcpy( d_psrcMat, psrcMat, sizeof(float)*src_rows*src_cols, cudaMemcpyHostToDevice);
                cudaMemcpy( d_pmMat, pmMat, sizeof(float)*m_rows*m_cols, cudaMemcpyHostToDevice);
                //Run Kernel
                //cudaSimpleMult<<<blocks,1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_cols, src_rows);
                cudaCalcXYZ<<<blocks,1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_rows, src_cols, scaleFactor, minDistance);
                //Copy from device to host
                cudaMemcpy( pdstMat, d_pdstMat, sizeof(float)*src_rows*src_cols, cudaMemcpyDeviceToHost);
            }
            double perfStop = getMillis();
            double perfDelta = perfStop - perfStart;
            cout << "Ran " << iter << " iterations totaling " << perfDelta << "ms" << endl;
            cout << " Average time per iteration: " << (perfDelta/(float)iter) << "ms" << endl;
    
            cudaFree(d_psrcMat);
            cudaFree(d_pmMat);
            cudaFree(d_pdstMat);
            cudaFreeHost(psrcMat);
            cudaFreeHost(pmMat);
            cudaFreeHost(pdstMat);
    }
    
    //Timing functions for performance measurements
    double getMicros()
    {
        timespec ts;
        //double t_ns, t_s;
        long t_ns;
        double t_s;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        t_s = (double)ts.tv_sec;
        t_ns = ts.tv_nsec;
        //return( (t_s *1000.0 * 1000.0) + (double)(t_ns / 1000.0) );
        return ((double)t_ns / 1000.0);
    }
    
    double getMillis()
    {
        timespec ts;
        double t_ns, t_s;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        t_s = (double)ts.tv_sec;
        t_ns = (double)ts.tv_nsec;
        return( (t_s * 1000.0) + (t_ns / 1000000.0) );
    }
    

I have already seen the post Cuda zero-copy performance, but I feel it does not apply here, for the following reason: the GPU and CPU share a physically unified memory architecture.

Thanks

2 Answers:

Answer 0 (score: 1)

When you use zero copy, a read goes through a path where the memory unit fetches the data from system memory. This operation has some latency.

When using direct access to device memory, the memory unit gathers the data from global memory, which has a different access pattern and different latency.

Actually seeing this difference would require some form of profiling.
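
For example, a minimal sketch of timing just the kernel with CUDA events (reusing the variable names from the test code above; error checking omitted) could look like this:

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    cudaCalcXYZ<<<blocks, 1>>>(d_pdstMat, d_psrcMat, d_pmMat, src_rows, src_cols, scaleFactor, minDistance);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float kernelMs = 0.0f;
    cudaEventElapsedTime(&kernelMs, start, stop);  //Elapsed GPU time in milliseconds
    cudaEventDestroy(start);
    cudaEventDestroy(stop);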

Nonetheless, your call to the global function uses a single thread per block:

    cudaCalcXYZ<<< blocks,1 >>> (...

In that case the GPU has little opportunity to hide latency when the data is gathered from system memory (or global memory). I would recommend using more threads (some multiple of 64, at least 128 in total) and running the profiler on it to measure the cost of the memory accesses. Your algorithm appears to be separable, so changing the loop from

    for(int i= 0; i < width; i++)

to

    for (int i = threadIdx.x ; i < width ; i += blockDim.x)

will probably improve overall performance. With an image width of 640, that becomes 5 iterations of 128 threads:

    cudaCalcXYZ<<< blocks,128 >>> (...

I believe this would result in some performance gain.
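
To make the suggestion concrete, here is a minimal sketch of the kernel rewritten for 128 threads per block (one block per row, threads striding across the columns). It mirrors the original kernel body but computes a per-thread index instead of incrementing a shared one; treat it as illustrative rather than a tested drop-in fix:

    __global__ void cudaCalcXYZ_mt( float *dst, float *src, float *M, int height, int width, float scaleFactor, int minDistance)
    {
        int heightCenter = height / 2;
        int widthCenter = width / 2;
        int row = blockIdx.x;                      //One block still handles one row
        float jFactor = (row - heightCenter)*scaleFactor;
        //Threads cooperate on the row: thread t handles columns t, t+blockDim.x, t+2*blockDim.x, ...
        for(int i = threadIdx.x; i < width; i += blockDim.x)
        {
            int index = row*width + i;             //Per-thread element index
            float nz = src[index];
            float nzpminD = nz + minDistance;
            float nx = (i - widthCenter)*(nzpminD)*scaleFactor;
            float ny = (jFactor)*(nzpminD);
            dst[index] = nx*M[4] + ny*M[5] + nz*M[6];
        }
    }

The launch would then become, for example:

    cudaCalcXYZ_mt<<<src_rows, 128>>>(d_pdstMat, d_psrcMat, d_pmMat, src_rows, src_cols, scaleFactor, minDistance);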

Answer 1 (score: 1)

The zero-copy feature lets the device work on data without manually copying it into device memory with cudaMemcpy. Zero-copy memory only passes a host address to the device, which the kernel then reads from and writes to directly. So the more thread blocks you declare for the kernel, the more data is read and written by the kernel and the more host addresses are passed to the device. In the end, you get a better performance gain than if you declare only a few thread blocks for the kernel.
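
As a reference point, a minimal sketch of the zero-copy setup described above (device flag, mapped host allocation, device alias pointer); the kernel name is illustrative and error checking is omitted:

    #include <cuda_runtime.h>

    int main()
    {
        const int n = 640*480;
        float *h_buf = 0;   //Host pointer (CPU side)
        float *d_buf = 0;   //Device alias of the same physical allocation

        cudaSetDeviceFlags(cudaDeviceMapHost);   //Must be set before the CUDA context is created
        cudaHostAlloc((void **)&h_buf, n*sizeof(float), cudaHostAllocMapped);
        cudaHostGetDevicePointer((void **)&d_buf, (void *)h_buf, 0);  //No cudaMemcpy needed

        //someKernel<<<grid, block>>>(d_buf, ...);  //The kernel reads/writes the mapped host memory directly
        cudaDeviceSynchronize();                    //Results are then visible through h_buf

        cudaFreeHost(h_buf);
        return 0;
    }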