Disappointing results in pyCUDA benchmark for distance computing between N points

Time: 2012-12-30 19:32:03

Tags: python cuda scipy pycuda

The following script was set up for benchmarking purposes. It computes the distances between N points using the Euclidean L2 norm. Three different routines are implemented:

  1. A high-level solution using the scipy.spatial.distance.pdist function.
  2. A fairly low-level, OpenMP-powered scipy.weave.inline solution.
  3. A GPGPU solution powered by pyCUDA.

Here are the benchmark results on an i5-3470 (16 GB RAM) with a GTX 660 (2 GB RAM):

        ------------
        Scipy Pdist
        Execution time: 3.01975 s
        First five elements: [ 0.74968684  0.71457213  0.833188    0.48084545  0.86407363]
        Last five elements: [ 0.65717077  0.76850474  0.29652017  0.856179    0.56074625]
        ------------
        Weave Inline
        Execution time: 2.48705 s
        First five elements: [ 0.74968684  0.71457213  0.83318806  0.48084542  0.86407363]
        Last five elements: [ 0.65717083  0.76850474  0.29652017  0.856179    0.56074625]
        ------------
        pyCUDA
        CUDA clock timing:  0.713028930664
        Execution time: 2.04364 s
        First five elements: [ 0.74968684  0.71457213  0.83318806  0.48084542  0.86407363]
        Last five elements: [ 0.65717083  0.76850468  0.29652017  0.856179    0.56074625]
        ------------
    

I am a bit disappointed with the pyCUDA performance. Since I am new to CUDA, there is probably something I am missing here. So where is the crux of the matter? Am I hitting the limit of the global memory bandwidth? Is my choice of block and grid sizes a poor one?

    import numpy,time,math
    import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule
    from scipy.spatial.distance import pdist
    from scipy import weave
    
    def weave_solution(x):
        """
        OpenMP powered weave inline.
        """
        N,DIM     = numpy.shape(x)
        L         = ((N-1)**2+(N-1))/2
        solution  = numpy.zeros(L).astype(numpy.float32)
        ncpu      = 4
        weave_omp = {'headers'           : ['<omp.h>'],
                     'extra_compile_args': ['-fopenmp'],
                     'extra_link_args'   : ['-lgomp']}
        code = \
             r'''
             omp_set_num_threads(ncpu);
             #pragma omp parallel
             {            
                int j,d,pos;
                float r=0.0;
    
                #pragma omp for
                   for (int i=0; i<(N-1); i++){
                      for (j=(i+1); j<N; j++){
                         r = 0.0;
                         for (d=0; d<DIM; d++){
                            r += (x[i*DIM+d]-x[j*DIM+d])*(x[i*DIM+d]-x[j*DIM+d]);
                         }
                         pos = (i*N+j)-(i*(i+1)/2)-i-1;
                         solution[pos] = sqrt(r);
                      }
                   }
    
             }
             '''
        weave.inline(code,['x','N','DIM','solution','ncpu'],**weave_omp)
        return numpy.array(solution)
    
    def scipy_solution(x):
        """
        SciPy High-level function
        """
        return pdist(x).astype(numpy.float32)
    
    def cuda_solution(x):
        """
        pyCUDA
        """
        N,DIM     = numpy.shape(x)
        N         = numpy.int32(N)
        DIM       = numpy.int32(DIM)    
        L         = ((N-1)**2+(N-1))/2
        solution  = numpy.zeros(L).astype(numpy.float32)
    
        start = drv.Event()
        end   = drv.Event()       
    
        mod = SourceModule("""
        __global__ void distance(float *x,int N,int DIM,float *solution){
    
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
    
        int j,d,pos;
        float r=0.0;
    
        if ( i < (N-1) ){
    
           for (j=(i+1); j<N; j++){
    
              r = 0.0;
              for (d=0; d<DIM; d++){
                 r += (x[i*DIM+d]-x[j*DIM+d])*(x[i*DIM+d]-x[j*DIM+d]);
              }
              pos = (i*N+j)-(i*(i+1)/2)-i-1;
              solution[pos] = sqrt(r);
           }
    
        }
        }
        """)
    
    
        func = mod.get_function("distance")
    
        start.record()
        func(drv.In(x),N,DIM,drv.Out(solution),block=(192,1,1),grid=(192,1))
        end.record()
        end.synchronize()
        secs = start.time_till(end)*1e-3
    
        print "CUDA clock timing: ",secs
        return solution
    
    if __name__ == '__main__':
    
        # Set up data points
        N   = 25000
        DIM = 3
        x   = numpy.random.rand(N,DIM).astype(numpy.float32)
    
        print "-"*12
        # Scipy solution
        print "Scipy Pdist"
        stime = time.time()
        spsolution = scipy_solution(x)
        stime = time.time()-stime
        print "Execution time: {0:.5f} s".format(stime)
        print "Frist five elements:", spsolution[:5]
        print "Last five elements:", spsolution[-5:]    
        print "-"*12
    
        # Weave solution
        print "Weave Inline"
        wtime = time.time()
        wsolution = weave_solution(x)
        wtime = time.time()-wtime
        print "Execution time: {0:.5f} s".format(wtime)
        print "Frist five elements:", wsolution[:5]
        print "Last five elements:", wsolution[-5:]
        print "-"*12
    
        # pyCUDA solution
        print "pyCUDA"
        ctime = time.time()
        csolution = cuda_solution(x)
        ctime = time.time()-ctime
        print "Execution time: {0:.5f} s".format(ctime)
        print "Frist five elements:", csolution[:5]
        print "Last five elements:", csolution[-5:]    
        print "-"*12
    

Edit:

I added the shebang line

    #!/usr/bin/env python

at the top of the file and made it executable. With the weave.inline and scipy.spatial.distance.pdist computations commented out, the NVIDIA Visual Profiler gives the following results:

[screenshot: NVIDIA Visual Profiler results]

1 Answer:

Answer 0 (score: 4)

Right now you launch 192×192 threads, and each thread updates up to N-1 positions of the result; you can easily launch more blocks/threads so that each thread does far less work.

What you want is not to run this loop, for (j=(i+1); j<N; j++){, inside each thread, but to replace the inner loop with N-1 threads, one per value of j (see the sketch below).
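
As a sketch of that suggestion: the kernel below (a hypothetical distance_pairs variant of my own, not code from the question) assigns one thread to each (i, j) pair via a 2D launch instead of looping over j inside the thread. The block/grid shape in the comment is only an example.

    // Hypothetical one-thread-per-pair variant of the original kernel.
    // Example launch from pyCUDA:
    //   block = (16, 16, 1), grid = ((N + 15) / 16, (N + 15) / 16)
    __global__ void distance_pairs(const float *x, int N, int DIM, float *solution)
    {
        const int j = blockDim.x * blockIdx.x + threadIdx.x;
        const int i = blockDim.y * blockIdx.y + threadIdx.y;

        // Only the strict upper triangle (j > i) does any work.
        if (i < N - 1 && j < N && j > i) {
            float r = 0.0f;
            for (int d = 0; d < DIM; d++) {
                const float diff = x[i * DIM + d] - x[j * DIM + d];
                r += diff * diff;
            }
            // Same condensed-index formula as in the question's kernel.
            const int pos = (i * N + j) - (i * (i + 1) / 2) - i - 1;
            solution[pos] = sqrtf(r);
        }
    }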

If you want to take this even further, you can have N-1 * DIM threads execute the statement of the innermost loop, store the partial results in shared memory, and finally perform a reduction on them. See Optimizing Parallel Reduction in CUDA; a generic sketch of that pattern follows.
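
For reference, a minimal sketch of the shared-memory reduction pattern that whitepaper describes (generic, not wired into the distance kernel; with DIM = 3 a per-pair reduction is hardly worth it, but the pattern is the same):

    // Each block sums blockDim.x partial values into one output element.
    // Assumes blockDim.x is a power of two and that the shared-memory size
    // (blockDim.x floats) is passed at launch.
    __global__ void block_reduce_sum(const float *in, float *out, int n)
    {
        extern __shared__ float sdata[];

        const unsigned int tid = threadIdx.x;
        const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

        sdata[tid] = (idx < n) ? in[idx] : 0.0f;   // load one partial value per thread
        __syncthreads();

        // Halve the number of active threads each step.
        for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
            if (tid < s)
                sdata[tid] += sdata[tid + s];
            __syncthreads();
        }

        if (tid == 0)
            out[blockIdx.x] = sdata[0];            // one partial sum per block
    }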

Take a look at this line:

    r += (x[i*DIM+d]-x[j*DIM+d])*(x[i*DIM+d]-x[j*DIM+d]);

The memory transactions and access pattern here are neither uniform nor coalesced. Also, I do not know whether nvcc will optimize your expression down to only two memory transactions instead of the four shown here, because I do not know whether pyCUDA passes -O3 to nvcc. Put (x[i*DIM+d]-x[j*DIM+d]) into a register variable to make sure, and square it yourself.
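
A minimal rewrite of that statement along those lines (the same arithmetic, just with the difference held in a local variable so each element of x is read once per iteration):

    const float diff = x[i * DIM + d] - x[j * DIM + d];
    r += diff * diff;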

Otherwise, you can also try placing #pragma unroll in front of each for loop to unroll them where possible.
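
For example (a sketch only; since DIM is a runtime argument here, nvcc can only unroll speculatively, so passing DIM as a compile-time constant would help):

    #pragma unroll
    for (int d = 0; d < DIM; d++) {
        const float diff = x[i * DIM + d] - x[j * DIM + d];
        r += diff * diff;
    }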