I have the following two implementations of a complex inner product, one using the STL running on the CPU and the other using Thrust running on the GPU:
CPU implementation
#include <vector>
#include <numeric>
#include <complex>
#include <algorithm>  // std::generate
#include <cstdlib>    // atoi, std::rand

int main(int argc, char **argv)
{
    int vec_size = atoi(argv[1]);

    std::vector< std::complex<float> > host_x( vec_size );
    std::generate(host_x.begin(), host_x.end(), std::rand);

    std::vector< std::complex<float> > host_y( vec_size );
    std::generate(host_y.begin(), host_y.end(), std::rand);

    std::complex<float> z = std::inner_product(host_x.begin(), host_x.end(), host_y.begin(), std::complex<float>(0.0f,0.0f) );

    return 0;
}
GPU implementation
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/inner_product.h>
#include <thrust/complex.h>
#include <cstdlib>  // atoi, rand

int main(int argc, char **argv)
{
    int vec_size = atoi(argv[1]);

    thrust::host_vector< thrust::complex<float> > host_x( vec_size );
    thrust::generate(host_x.begin(), host_x.end(), rand);

    thrust::host_vector< thrust::complex<float> > host_y( vec_size );
    thrust::generate(host_y.begin(), host_y.end(), rand);

    thrust::device_vector< thrust::complex<float> > device_x = host_x;
    thrust::device_vector< thrust::complex<float> > device_y = host_y;

    thrust::complex<float> z = thrust::inner_product(device_x.begin(), device_x.end(), device_y.begin(), thrust::complex<float>(0.0f,0.0f) );

    return 0;
}
I compile the CPU implementation with g++ and the GPU implementation with nvcc, both with -O3 optimization. I ran both implementations on vectors of 3,000,000 elements and got the following timing results:
CPU:
real    0m0.159s
user    0m0.100s
sys     0m0.048s
GPU:
real    0m0.284s
user    0m0.190s
sys     0m0.083s
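For reference, the compile and run commands were along these lines (the source file names here are placeholders I've chosen for illustration):

# CPU version (STL), built with g++ at -O3
g++ -O3 cpu_inner_product.cpp -o cpu_ip
time ./cpu_ip 3000000

# GPU version (Thrust), built with nvcc at -O3
nvcc -O3 gpu_inner_product.cu -o gpu_ip
time ./gpu_ip 3000000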
I am using the following software:
$ gcc -v
Configured with: --prefix=/Applications/Xcode.app/Contents/Developer/usr --with-gxx-include-dir=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/usr/include/c++/4.2.1
Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn)
Target: x86_64-apple-darwin13.3.0
Thread model: posix
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2013 NVIDIA Corporation
Built on Thu_Sep__5_10:17:14_PDT_2013
Cuda compilation tools, release 5.5, V5.5.0
along with the latest version of Thrust from the GitHub repository.
My CPU is a 2.4 GHz Intel Core 2 Duo, and my GPU is an NVIDIA GeForce 320M with 256 MB.
Question: I am new to using Thrust, but shouldn't my GPU implementation be much faster than my CPU implementation? I realize there are memory-transfer costs with the GPU, but I'm trying to figure out whether I am using Thrust correctly to perform the inner product on the GPU, because the timing results seem unexpectedly reversed.
Edit: Following everyone's suggestions, I made the number of iterations configurable and changed the granularity of the timing, as follows:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/inner_product.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>

int main(int argc, char **argv)
{
    int vec_size = atoi(argv[1]);
    int iterations = atoi(argv[2]);

    float milliseconds = 0;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // fill the host vectors with random data
    thrust::host_vector< thrust::complex<float> > host_x( vec_size );
    thrust::generate(host_x.begin(), host_x.end(), rand);
    thrust::host_vector< thrust::complex<float> > host_y( vec_size );
    thrust::generate(host_y.begin(), host_y.end(), rand);

    printf("vector size = %lu bytes\n", vec_size * sizeof(thrust::complex<float>));

    // time the host-to-device copies
    cudaEventRecord(start);
    thrust::device_vector< thrust::complex<float> > device_x = host_x;
    thrust::device_vector< thrust::complex<float> > device_y = host_y;
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("copy (device)\t\t%f ms\n", milliseconds);

    // time the inner product on the device (result discarded)
    cudaEventRecord(start);
    for(int i = 0; i < iterations; ++i)
    {
        thrust::inner_product(thrust::cuda::par, device_x.begin(), device_x.end(), device_y.begin(), thrust::complex<float>(0.0f,0.0f) );
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("inner_product (device)\t%f ms\n", milliseconds/iterations);

    // time the inner product on the host (result discarded)
    cudaEventRecord(start);
    for(int i = 0; i < iterations; ++i)
    {
        thrust::inner_product(thrust::host, host_x.begin(), host_x.end(), host_y.begin(), thrust::complex<float>(0.0f,0.0f) );
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("inner_product (host)\t%f ms\n", milliseconds/iterations);

    return 0;
}
On a Tegra K1, I get the following:
$ nvcc complex_inner_product.cu -O3 -arch=sm_32 -o cip
$ ./cip 3100000 1000
vector size = 24800000 bytes
copy (device) 45.741653 ms
inner_product (device) 10.595121 ms
inner_product (host) 1.807912 ms
On the 2.4 GHz Intel Core 2 Duo with the GeForce 320M, I get the following:
$ nvcc complex_inner_product.cu -O3 -arch=sm_12 -o cip
$ ./cip 3100000 1000
vector size = 24800000 bytes
copy (device) 227.765213 ms
inner_product (device) 42.180416 ms
inner_product (host) 0.000018 ms
On a 3.3 GHz Intel Core i5 with a GeForce GT 755M:
$ nvcc complex_inner_product.cu -O3 -arch=sm_30 -o cip
$ ./cip 3100000 1000
vector size = 24800000 bytes
copy (device) 22.930016 ms
inner_product (device) 6.249663 ms
inner_product (host) 0.000003 ms
So regardless of the compute capability or hardware I use, the host processor is at least 10x faster than the GPU. Any ideas?
Answer 0 (score: 4)
There is a lot to consider in your benchmarking methodology. I'm not arguing about whether your results are valid; that is a matter of opinion, depending on what you consider important. But here are some things to consider:
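One such consideration: the very first CUDA calls in a process pay one-time costs (context creation, and so on), so it is common practice to run an untimed warm-up pass before measuring anything. A minimal, self-contained sketch of that idea (the vector size here is my own choice, purely for illustration):

#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/complex.h>
#include <cstdio>

int main()
{
    const int n = 1 << 20;  // arbitrary size for illustration
    thrust::device_vector< thrust::complex<float> > x(n), y(n);  // zero-initialized on the device

    // warm-up: absorb context creation and other first-call overhead outside the timed region
    thrust::inner_product(x.begin(), x.end(), y.begin(), thrust::complex<float>(0.0f, 0.0f));
    cudaDeviceSynchronize();

    // timed region: measure only the steady-state call
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    thrust::complex<float> z = thrust::inner_product(x.begin(), x.end(), y.begin(),
                                                     thrust::complex<float>(0.0f, 0.0f));
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    printf("steady-state inner_product: %f ms (result %f,%f)\n", ms, z.real(), z.imag());
    return 0;
}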
If you time only the computation portion, I expect you'll find that the GPU looks a bit better. Here is a fully worked example:
$ cat t489.cu
#include <vector>
#include <numeric>
#include <complex>
#include <algorithm>
#include <cstdlib>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/inner_product.h>
#include <thrust/complex.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>

int main(int argc, char **argv)
{
    timeval tv1, tv2;
    int vec_size = atoi(argv[1]);

    // CPU version: fill the vectors, then time only the inner product itself
    std::vector< std::complex<float> > cpu_x( vec_size );
    std::generate(cpu_x.begin(), cpu_x.end(), std::rand);
    std::vector< std::complex<float> > cpu_y( vec_size );
    std::generate(cpu_y.begin(), cpu_y.end(), std::rand);

    gettimeofday(&tv1, 0);
    std::complex<float> cpu_z = std::inner_product(cpu_x.begin(), cpu_x.end(), cpu_y.begin(), std::complex<float>(0.0f,0.0f) );
    gettimeofday(&tv2, 0);
    std::cout << "CPU result: " << cpu_z.real() << "," << cpu_z.imag() << std::endl;
    unsigned t2 = (tv2.tv_sec*1000000) + tv2.tv_usec;
    unsigned t1 = (tv1.tv_sec*1000000) + tv1.tv_usec;
    float et = (t2-t1)/(float) 1000;
    std::cout << "CPU elapsed time: " << et << "ms" << std::endl;

    // GPU version: fill on the host, copy to the device, then time only the inner product
    thrust::host_vector< thrust::complex<float> > host_x( vec_size );
    thrust::generate(host_x.begin(), host_x.end(), rand);
    thrust::host_vector< thrust::complex<float> > host_y( vec_size );
    thrust::generate(host_y.begin(), host_y.end(), rand);
    thrust::device_vector< thrust::complex<float> > device_x = host_x;
    thrust::device_vector< thrust::complex<float> > device_y = host_y;

    gettimeofday(&tv1, 0);
    thrust::complex<float> z = thrust::inner_product(device_x.begin(), device_x.end(), device_y.begin(), thrust::complex<float>(0.0f,0.0f) );
    gettimeofday(&tv2, 0);
    std::cout << "GPU result: " << z.real() << "," << z.imag() << std::endl;
    t2 = (tv2.tv_sec*1000000) + tv2.tv_usec;
    t1 = (tv1.tv_sec*1000000) + tv1.tv_usec;
    et = (t2-t1)/(float) 1000;
    std::cout << "GPU elapsed time: " << et << "ms" << std::endl;

    return 0;
}
$ nvcc -arch=sm_20 -O3 -o t489 t489.cu
$ ./t489 3000000
CPU result: 3.45238e+24,0
CPU elapsed time: 19.294ms
GPU result: 3.46041e+24,0
GPU elapsed time: 3.426ms
$
This was run with a Quadro 5000 GPU (considerably more powerful than the GT 320M), RHEL 5.5, CUDA 6.5 RC, and Thrust 1.8 (master branch).
So which numbers matter? That's up to you. If all you ever intend to do on the GPU is this single inner product, with no other computation or activity on the GPU, then it would be pointless to use the GPU. But in the context of a larger problem, where the inner product is just one piece, the GPU may well be faster than the CPU.
(The results don't match because the program generates different starting values in each case.)
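To illustrate that last point, here is a minimal sketch of my own (the element count and number of repeats are arbitrary choices for illustration) showing how the one-time transfer cost is amortized when the data stays resident on the device across many operations, which is the situation where using the GPU tends to pay off:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/inner_product.h>
#include <thrust/complex.h>
#include <cstdlib>
#include <cstdio>

int main()
{
    const int n = 3000000;     // same order of size as in the question
    const int repeats = 1000;  // arbitrary number of device-side operations

    // fill on the host
    thrust::host_vector< thrust::complex<float> > h_x(n), h_y(n);
    thrust::generate(h_x.begin(), h_x.end(), rand);
    thrust::generate(h_y.begin(), h_y.end(), rand);

    // pay the host-to-device transfer once...
    thrust::device_vector< thrust::complex<float> > d_x = h_x;
    thrust::device_vector< thrust::complex<float> > d_y = h_y;

    // ...then keep the data on the GPU and run many operations against it,
    // so the copy cost is spread over all of them
    thrust::complex<float> acc(0.0f, 0.0f);
    for (int i = 0; i < repeats; ++i)
        acc += thrust::inner_product(d_x.begin(), d_x.end(), d_y.begin(),
                                     thrust::complex<float>(0.0f, 0.0f));

    printf("accumulated result: %f,%f\n", acc.real(), acc.imag());
    return 0;
}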