我试图在一份代码中比较 Thrust、CUB 和 ModernGPU(MGPU)库中的 reduce_by_key,思路沿用之前一篇只涉及 Thrust 和 CUB 的帖子。目的不是做基准测试,而是确保我能正确使用它们。
thrust::reduce_by_key
和cub::DeviceReduce::ReduceByKey
调用可以很好地协同工作,而 thrust::reduce_by_key
和ModernGPU ReduceByKey
调用可以很好地协同工作,但是当我在调用MGPU后调用CUB时,CUB停止工作。 cuda-memcheck说我的MGPU代码有错误,但很难找到,因为我只有一个MGPU函数调用,错误是非致命的,并且MGPU调用继续以获得正确的结果!
完整的可编译代码和 cuda-memcheck 输出附在下文,但先说明:
按照以下顺序调用时,所有结果都是正确的(尽管 cuda-memcheck 会报警)
test_thrust(Nkey,Nseg,x,key,output,keysout);
test_cub(Nkey,Nseg,x,key,output,keysout);
test_mgpu(argc,argv,Nkey,Nseg,x,key,output,keysout);
但是按照这个顺序的语句,CUB结果是错误的:
test_thrust(Nkey,Nseg,x,key,output,keysout);
test_mgpu(argc,argv,Nkey,Nseg,x,key,output,keysout);
test_cub(Nkey,Nseg,x,key,output,keysout);
其中“CUB结果错误”意味着cub::DeviceReduce::ReduceByKey
例程根本不更新其输出参数。
这是使用CUDA 6.5,CUB 1.3.2,以及通过按下github上的“download zip”按钮下载的最新ModernGPU - 也许是版本1.1。可以使用以下库编译以下代码:
nvcc $MGPU/src/mgpucontext.cu $MGPU/src/mmio.cpp $MGPU/src/mgpuutil.cpp test_reduce_thrust_MGPU.cu -I $CUB/ -I $MGPU/include
以下是代码:
#include <iostream>
#include <string>
// THRUST:
#include <thrust/sort.h>
#include <thrust/device_vector.h>
// CUB:
#include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>
// Modern GPU (MGPU)
#include "kernels/reducebykey.cuh"
//========================================
// for CUB:
// Binary reduction functor handed to cub::DeviceReduce::ReduceByKey.
// Simply sums its two operands; equivalent to cub::Sum, kept custom
// here to demonstrate passing a user-defined reduction operator.
struct CustomSum
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __device__ __forceinline__
    T operator()(const T &a, const T &b) const
    {
        return b + a;
    }
};
//========================================
// Print the test vectors under `title`: the raw values `x`, the keys,
// the reduced keys, and the reduced sums. When `check` is true, also
// print the grand total of `output` next to the grand total of `x`;
// after a correct reduce-by-key the two totals must agree.
void show_vecs(const std::string& title,
int Nkey,int Nseg,
const thrust::device_vector<float>& x,
const thrust::device_vector<int>& key,
thrust::device_vector<float>& output,
thrust::device_vector<int>& keysout,
bool check){
std::cout << title << std::endl;
for (int i = 0; i < Nkey; ++i) { std::cout << x[i] << " "; }
std::cout << std::endl;
for (int i = 0; i < Nkey; ++i) { std::cout << key[i] << " "; }
std::cout << std::endl;
for (int i = 0; i < Nkey; ++i) { std::cout << keysout[i] << " "; }
std::cout << std::endl;
// only Nseg entries of the reduced output are meaningful
for (int i = 0; i < Nseg; ++i) { std::cout << output[i] << " "; }
std::cout << std::endl;
if (check){
std::cout << "total=" << thrust::reduce(output.begin(),output.end())
<<", should be " << thrust::reduce(x.begin(),x.end()) <<std::endl;
}
}
// Run thrust::reduce_by_key on the prepared (key-sorted) arrays.
// The output vectors are zeroed first so that results left over from
// an earlier test cannot masquerade as a successful run.
void test_thrust(int Nkey,int Nseg,
const thrust::device_vector<float>& x,
const thrust::device_vector<int>& key,
thrust::device_vector<float>& output,
thrust::device_vector<int>& keysout){
std::cout<<"=================================================================="<<std::endl
<<" THRUST reduce_by_key:"<<std::endl
<<"=================================================================="<<std::endl;
// clear previous results
thrust::fill(keysout.begin(), keysout.end(), 0);
thrust::fill(output.begin(), output.end(), 0.0f);
show_vecs("Thrust input",Nkey,Nseg,x,key,output,keysout,0);
// one fused call yields both the unique keys and the per-key sums
thrust::reduce_by_key(key.begin(), key.end(), x.begin(),
keysout.begin(), output.begin());
show_vecs("Thrust output",Nkey,Nseg,x,key,output,keysout,1);
}
// Run cub::DeviceReduce::ReduceByKey on the prepared (key-sorted) arrays.
// Fixes vs. the original:
//  * clear any error state left by a previous library call at entry — a
//    stale last-error makes subsequent CUB calls appear to do nothing,
//    which is exactly the symptom seen after test_mgpu;
//  * check the cudaError_t returned by every CUDA/CUB call instead of
//    discarding it, so a failure is reported rather than silent;
//  * free d_num_segments (it was leaked);
//  * drop the pointless cudaMemcpy into d_num_segments — it is an
//    output-only parameter that CUB writes.
void test_cub(int Nkey,int Nseg,
thrust::device_vector<float>& x,
thrust::device_vector<int>& key,
thrust::device_vector<float>& output,
thrust::device_vector<int>& keysout){
std::cout<<"=================================================================="<<std::endl
<<" CUB ReduceByKey:"<<std::endl
<<"=================================================================="<<std::endl;
// Fetch-and-clear any error left behind by earlier tests (e.g. the
// invalid cudaMemcpyAsync cuda-memcheck reports inside MGPU) so it is
// not misattributed to CUB.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
  std::cout << "cleared pre-existing CUDA error: "
            << cudaGetErrorString(err) << std::endl;
// reset output:
thrust::fill(keysout.begin(), keysout.end(), 0);
thrust::fill(output.begin(), output.end(), 0.0f);
show_vecs("CUB input",Nkey,Nseg,x,key,output,keysout,0);
int *cub_keys_in =thrust::raw_pointer_cast(&key[0]);
int *cub_keys_out=thrust::raw_pointer_cast(&keysout[0]);
float *cub_val_in =thrust::raw_pointer_cast(&x[0]);
float *cub_val_out =thrust::raw_pointer_cast(&output[0]);
// device location where CUB writes the number of segments it produced
int *d_num_segments = NULL;
err = cudaMalloc(&d_num_segments, sizeof(int));
if (err != cudaSuccess)
  std::cout << "cudaMalloc(d_num_segments): "
            << cudaGetErrorString(err) << std::endl;
CustomSum reduction_op;
// First call with NULL temp storage only queries the required size.
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
err = cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
                                     cub_keys_in, cub_keys_out,
                                     cub_val_in, cub_val_out,
                                     d_num_segments, reduction_op, Nkey);
if (err != cudaSuccess)
  std::cout << "CUB size query failed: "
            << cudaGetErrorString(err) << std::endl;
cudaDeviceSynchronize();
err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
if (err != cudaSuccess)
  std::cout << "cudaMalloc(d_temp_storage): "
            << cudaGetErrorString(err) << std::endl;
std::cout << "temp_storage_bytes = " << temp_storage_bytes << std::endl;
// Second call performs the actual reduction.
err = cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
                                     cub_keys_in, cub_keys_out,
                                     cub_val_in, cub_val_out,
                                     d_num_segments, reduction_op, Nkey);
if (err != cudaSuccess)
  std::cout << "CUB ReduceByKey failed: "
            << cudaGetErrorString(err) << std::endl;
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
  std::cout << "cudaDeviceSynchronize after CUB: "
            << cudaGetErrorString(err) << std::endl;
cudaFree(d_temp_storage);
cudaFree(d_num_segments);   // was leaked in the original
show_vecs("CUB output",Nkey,Nseg,x,key,output,keysout,1);
}
// Run ModernGPU's ReduceByKey on the prepared (key-sorted) arrays.
// Fixes vs. the original:
//  * after the MGPU call, fetch-and-clear the CUDA last-error state —
//    MGPU 1.1's ReduceByKey can trigger a non-fatal cudaErrorInvalidValue
//    (the cudaMemcpyAsync complaint cuda-memcheck shows) that would
//    otherwise surface in, and break, the next CUB/CUDA call;
//  * check cudaDeviceSynchronize's result instead of discarding it;
//  * removed the dead, commented-out device-allocation experiment.
void test_mgpu(int argc, char** argv,
int Nkey,int Nseg,
thrust::device_vector<float>& x,
thrust::device_vector<int>& key,
thrust::device_vector<float>& output,
thrust::device_vector<int>& keysout){
std::cout<<"=================================================================="<<std::endl
<<" MGPU ReduceByKey:"<<std::endl
<<"=================================================================="<<std::endl;
// reset output:
thrust::fill(keysout.begin(), keysout.end(), 0);
thrust::fill(output.begin(), output.end(), 0.0f);
show_vecs("MGPU input",Nkey,Nseg,x,key,output,keysout,0);
// NOTE(review): creating a fresh context on every call is expensive;
// acceptable for a one-shot test, but reuse the context in real code.
mgpu::ContextPtr context=mgpu::CreateCudaDevice(argc, argv, false);
int *mgpu_keys_in =thrust::raw_pointer_cast(&key[0]);
int *mgpu_keys_out=thrust::raw_pointer_cast(&keysout[0]);
float *mgpu_vals_in =thrust::raw_pointer_cast(&x[0]);
float *mgpu_vals_out=thrust::raw_pointer_cast(&output[0]);
float identity=0.0f;
int count=key.size();
// host-side outputs: MGPU copies the segment counts back for us
int numSegments=0, numOut=0;
ReduceByKey(mgpu_keys_in, mgpu_vals_in, count, identity,
            mgpu::plus<float>(), mgpu::equal_to<int>(),
            mgpu_keys_out,
            mgpu_vals_out,
            &numSegments,
            &numOut, *context);
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess)
  std::cout << "cudaDeviceSynchronize after MGPU: "
            << cudaGetErrorString(err) << std::endl;
// Fetch-and-clear the last-error state so a non-fatal error raised
// inside MGPU does not poison whichever library runs next.
err = cudaGetLastError();
if (err != cudaSuccess)
  std::cout << "MGPU left a CUDA error pending (cleared): "
            << cudaGetErrorString(err) << std::endl;
show_vecs("MGPU output",Nkey,Nseg,x,key,output,keysout,1);
}
// Build a small sorted key/value data set and run the same reduce-by-key
// through MGPU, Thrust, CUB, and MGPU again, to expose cross-library
// interference. Fix vs. the original: return 0 on success — returning 1
// reports failure to the shell/CI even when every test passed.
int main(int argc, char** argv){
// an array of unsorted keys, 9 distinct values total
const int Nkey=20;
int Nseg=9;
int ikey[Nkey] = {0, 0, 0, 6, 8, 0, 2, 4, 6, 8, 1, 3, 5, 7, 8, 1, 3, 5, 7, 8};
thrust::device_vector<int> key(ikey,ikey+Nkey);
thrust::device_vector<int> keysout(Nkey);
// a data vector, x, to be reduced by key
float xval[Nkey];
for (int i=0; i<Nkey; i++) xval[i]=ikey[i]+0.1f;
thrust::device_vector<float> x(xval,xval+Nkey);
// an output for reduced x:
thrust::device_vector<float> output(Nseg,0.0f);
// All three libraries require keys to be pre-sorted (segments contiguous).
thrust::sort_by_key(key.begin(),key.end(),x.begin());
// Now, the tests — MGPU first, then Thrust, CUB, and MGPU again:
test_mgpu(argc,argv,Nkey,Nseg,x,key,output,keysout);
test_thrust(Nkey,Nseg,x,key,output,keysout);
test_cub(Nkey,Nseg,x,key,output,keysout);
test_mgpu(argc,argv,Nkey,Nseg,x,key,output,keysout);
return 0;  // was `return 1`, which signals failure to the caller
}
cuda-memcheck输出的前几行是:
========= CUDA-MEMCHECK
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpyAsync.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib64/libcuda.so.1 [0x2ef673]
========= Host Frame:./a.out [0x74793]
========= Host Frame:./a.out [0x1c87c]
========= Host Frame:./a.out [0x1ae7e]
========= Host Frame:./a.out [0x190b0]
========= Host Frame:./a.out [0x10974]
========= Host Frame:./a.out [0x10c8c]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:./a.out [0x43b9]
这对我来说都很神秘。
最终我希望尝试使用MGPU ReduceByKeyPreprocess
/ ReduceByKeyApply
这对组合调用,但目前看来 ReduceByKey
本身的调用就在某个地方出了问题。
非常感谢任何帮助,谢谢!