Thrust complex inner product on GPU slower than STL on CPU

Posted: 2014-07-18 20:58:02

Tags: c++ stl cuda gpu thrust

I have the following two implementations of a complex inner product, one using the STL running on the CPU, and one using Thrust running on the GPU:

CPU implementation

#include <vector>
#include <numeric>    // std::inner_product
#include <algorithm>  // std::generate
#include <complex>
#include <cstdlib>    // atoi, rand

int main(int argc, char **argv)
{
    int vec_size = atoi(argv[1]);

    std::vector< std::complex<float> > host_x( vec_size );
    std::generate(host_x.begin(), host_x.end(), std::rand);

    std::vector< std::complex<float> > host_y( vec_size );
    std::generate(host_y.begin(), host_y.end(), std::rand);

    std::complex<float> z = std::inner_product(host_x.begin(), host_x.end(), host_y.begin(), std::complex<float>(0.0f,0.0f) );

    return 0;
}

GPU implementation

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/inner_product.h>
#include <thrust/complex.h>
#include <cstdlib>  // atoi, rand

int main(int argc, char **argv)
{
    int vec_size = atoi(argv[1]);

    thrust::host_vector< thrust::complex<float> > host_x( vec_size );
    thrust::generate(host_x.begin(), host_x.end(), rand);

    thrust::host_vector< thrust::complex<float> > host_y( vec_size );
    thrust::generate(host_y.begin(), host_y.end(), rand);

    thrust::device_vector< thrust::complex<float> > device_x = host_x;
    thrust::device_vector< thrust::complex<float> > device_y = host_y;

    thrust::complex<float> z = thrust::inner_product(device_x.begin(), device_x.end(), device_y.begin(), thrust::complex<float>(0.0f,0.0f) );

    return 0;
}

I compiled the CPU implementation with g++ and the GPU implementation with nvcc, both with -O3 optimization.
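The compile-and-time commands were roughly as follows (the file names here are illustrative placeholders):

$ g++ -O3 -o cpu_ip cpu_ip.cpp
$ time ./cpu_ip 3000000
$ nvcc -O3 -o gpu_ip gpu_ip.cu
$ time ./gpu_ip 3000000

Running both with 3,000,000 elements in the vectors gave the following timing results: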

CPU:
real    0m0.159s
user    0m0.100s
sys     0m0.048s

GPU:
real    0m0.284s
user    0m0.190s
sys     0m0.083s

I am using the following software:

$ gcc -v
Configured with: --prefix=/Applications/Xcode.app/Contents/Developer/usr --with-gxx-include-dir=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/usr/include/c++/4.2.1
Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn)
Target: x86_64-apple-darwin13.3.0
Thread model: posix

$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2013 NVIDIA Corporation
Built on Thu_Sep__5_10:17:14_PDT_2013
Cuda compilation tools, release 5.5, V5.5.0

I am also using the latest version of Thrust from the GitHub repository.

My CPU is a 2.4 GHz Intel Core 2 Duo and my GPU is an NVIDIA GeForce 320M with 256 MB.

Question: I'm new to using Thrust, but shouldn't my GPU implementation be much faster than my CPU implementation? I realize there are memory transfer costs with a GPU, but I'm trying to figure out whether I'm using Thrust correctly to perform the inner product on the GPU, because the timing results are unexpectedly reversed.

EDIT: Based on everyone's suggestions, I made the number of iterations configurable and changed the granularity of the timing, as follows:

#include <stdio.h>
#include <stdlib.h>  // atoi

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/inner_product.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>              // thrust::host
#include <thrust/system/cuda/execution_policy.h>  // thrust::cuda::par

int main(int argc, char **argv)
{
    int vec_size = atoi(argv[1]);
    int iterations = atoi(argv[2]);

    float milliseconds = 0;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    thrust::host_vector< thrust::complex<float> > host_x( vec_size );
    thrust::generate(host_x.begin(), host_x.end(), rand);

    thrust::host_vector< thrust::complex<float> > host_y( vec_size );
    thrust::generate(host_y.begin(), host_y.end(), rand);

    printf("vector size = %lu bytes\n", vec_size * sizeof(thrust::complex<float>)); 

    cudaEventRecord(start);

    thrust::device_vector< thrust::complex<float> > device_x = host_x;
    thrust::device_vector< thrust::complex<float> > device_y = host_y;
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("copy (device)\t\t%f ms\n", milliseconds);

    cudaEventRecord(start);

    for(int i = 0; i < iterations; ++i)
    {
        thrust::inner_product(thrust::cuda::par, device_x.begin(), device_x.end(), device_y.begin(), thrust::complex<float>(0.0f,0.0f) );
    }

    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("inner_product (device)\t%f ms\n", milliseconds/iterations); 

    cudaEventRecord(start);

    for(int i = 0; i < iterations; ++i)
    {
        thrust::inner_product(thrust::host, host_x.begin(), host_x.end(), host_y.begin(), thrust::complex<float>(0.0f,0.0f) );
    }

    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("inner_product (host)\t%f ms\n", milliseconds/iterations);   

    return 0;
}

On a Tegra K1, I got the following:

$ nvcc complex_inner_product.cu -O3 -arch=sm_32 -o cip
$ ./cip 3100000 1000
vector size = 24800000 bytes
copy (device)       45.741653 ms
inner_product (device)  10.595121 ms
inner_product (host)    1.807912 ms

On an Intel Core 2 Duo 2.4 GHz and a GeForce 320M, I got the following results:

$ nvcc complex_inner_product.cu -O3 -arch=sm_12 -o cip
$ ./cip 3100000 1000
vector size = 24800000 bytes
copy (device)       227.765213 ms
inner_product (device)  42.180416 ms
inner_product (host)    0.000018 ms

On an Intel Core i5 3.3 GHz and a GeForce GT 755M:

$ nvcc complex_inner_product.cu -O3 -arch=sm_30 -o cip
$ ./cip 3100000 1000
vector size = 24800000 bytes
copy (device)       22.930016 ms
inner_product (device)  6.249663 ms
inner_product (host)    0.000003 ms

So regardless of the compute capability or hardware I use, the host processor is at least 10x faster than the GPU. Any thoughts?
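One caveat I should note about the host numbers (just a guess): the result of each host-side inner_product call above is discarded, so an optimizing compiler may eliminate the loop entirely, which would explain times like 0.000003 ms. A variant that keeps the result live would look like this:

// Hypothetical check: accumulate and print the host-side results so the
// optimizer cannot discard the computation being timed.
thrust::complex<float> acc(0.0f, 0.0f);
for(int i = 0; i < iterations; ++i)
{
    acc += thrust::inner_product(thrust::host, host_x.begin(), host_x.end(), host_y.begin(), thrust::complex<float>(0.0f,0.0f) );
}
printf("host checksum: %f,%f\n", acc.real(), acc.imag());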

1 Answer:

Answer 0 (score: 4)

There are quite a few things to consider about your benchmarking methodology. I'm not debating whether your results are valid; that's a matter of opinion, depending on what you consider important. But here are some things to consider:

  1. CUDA start-up time is included in your measurement (a short warm-up sketch for this follows at the end of this answer).
  2. Data transfer time is included in your measurement.
  3. You only make a single measurement pass.
  4. You are using a very low-end GPU.
  5. The function you chose to test is not very compute-intensive (only a few floating-point operations per element).
  6. If you time just the computation part, I expect you'll find the GPU looks a bit better. Here is a fully worked example:

    $ cat t489.cu
    #include <vector>
    #include <numeric>
    #include <algorithm>
    #include <complex>
    #include <cstdlib>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/generate.h>
    #include <thrust/inner_product.h>
    #include <thrust/complex.h>
    #include <time.h>
    #include <sys/time.h>
    #include <iostream>
    
    int main(int argc, char **argv)
    {
        timeval tv1, tv2;
        int vec_size = atoi(argv[1]);
    
        std::vector< std::complex<float> > cpu_x( vec_size );
        std::generate(cpu_x.begin(), cpu_x.end(), std::rand);
    
        std::vector< std::complex<float> > cpu_y( vec_size );
        std::generate(cpu_y.begin(), cpu_y.end(), std::rand);
    
        gettimeofday(&tv1, 0);
        std::complex<float> cpu_z = std::inner_product(cpu_x.begin(), cpu_x.end(), cpu_y.begin(), std::complex<float>(0.0f,0.0f) );
        gettimeofday(&tv2, 0);
        std::cout <<"CPU result: " << cpu_z.real() << "," << cpu_z.imag() << std::endl;
        unsigned t2 = (tv2.tv_sec*1000000) + tv2.tv_usec;
        unsigned t1 = (tv1.tv_sec*1000000) + tv1.tv_usec;
        float et = (t2-t1)/(float) 1000;
        std::cout << "CPU elapsed time: " << et << "ms" << std::endl;
        thrust::host_vector< thrust::complex<float> > host_x( vec_size );
        thrust::generate(host_x.begin(), host_x.end(), rand);
    
        thrust::host_vector< thrust::complex<float> > host_y( vec_size );
        thrust::generate(host_y.begin(), host_y.end(), rand);
    
        thrust::device_vector< thrust::complex<float> > device_x = host_x;
        thrust::device_vector< thrust::complex<float> > device_y = host_y;
    
        gettimeofday(&tv1, 0);
        thrust::complex<float> z = thrust::inner_product(device_x.begin(), device_x.end(), device_y.begin(), thrust::complex<float>(0.0f,0.0f) );
        gettimeofday(&tv2, 0);
        std::cout <<"GPU result: " << z.real() << "," << z.imag() << std::endl;
        t2 = (tv2.tv_sec*1000000) + tv2.tv_usec;
        t1 = (tv1.tv_sec*1000000) + tv1.tv_usec;
        et = (t2-t1)/(float) 1000;
        std::cout << "GPU elapsed time: " << et << "ms" << std::endl;
    
        return 0;
    }
    $ nvcc -arch=sm_20 -O3 -o t489 t489.cu
    $ ./t489 3000000
    CPU result: 3.45238e+24,0
    CPU elapsed time: 19.294ms
    GPU result: 3.46041e+24,0
    GPU elapsed time: 3.426ms
    $
    

    This was run using a Quadro 5000 GPU (considerably more powerful than the GT 320M), RHEL 5.5, CUDA 6.5RC, and Thrust 1.8 (master branch).

    So which numbers matter? That's up to you. If all you intend to do on the GPU is this single inner product, with no other computation or activity on the GPU, it would be pointless to use it. But in the context of a larger problem, where the inner product is just one piece, the GPU may well be faster than the CPU.
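    As a sketch of that last point (my own illustration; scale_by and scaled_inner_product are hypothetical names, not from any library): if the vectors are produced and consumed on the device by other steps, the host-device copy is paid once, outside the measured region, and each additional device-side operation is comparatively cheap:

    #include <thrust/device_vector.h>
    #include <thrust/transform.h>
    #include <thrust/inner_product.h>
    #include <thrust/complex.h>

    // Scales a complex value by a fixed factor; usable on host or device.
    struct scale_by
    {
        thrust::complex<float> alpha;
        scale_by(thrust::complex<float> a) : alpha(a) {}
        __host__ __device__
        thrust::complex<float> operator()(const thrust::complex<float> &v) const
        {
            return alpha * v;
        }
    };

    // Hypothetical device-resident pipeline: scale x in place, then take its
    // inner product with y. Both steps run on data already on the GPU, so
    // only the scalar result crosses back to the host.
    thrust::complex<float> scaled_inner_product(
        thrust::device_vector< thrust::complex<float> > &x,
        const thrust::device_vector< thrust::complex<float> > &y,
        thrust::complex<float> alpha)
    {
        thrust::transform(x.begin(), x.end(), x.begin(), scale_by(alpha));
        return thrust::inner_product(x.begin(), x.end(), y.begin(),
                                     thrust::complex<float>(0.0f,0.0f));
    }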

    (The results don't match because the program generates different starting values in each case.)
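Regarding point 1 above, here is a short warm-up sketch (my own addition, not part of the code above): forcing CUDA context creation once, before any timed region, keeps one-time start-up cost out of the first measurement.

    // Hypothetical warm-up, run once before timing anything.
    // cudaFree(0) triggers lazy CUDA context initialization; an untimed
    // first call of the operation under test absorbs any first-call overhead.
    cudaFree(0);
    thrust::inner_product(device_x.begin(), device_x.end(), device_y.begin(),
                          thrust::complex<float>(0.0f,0.0f));
    cudaDeviceSynchronize();  // ensure all warm-up work has completed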