我正在尝试将一些CUB引入我的“旧”Thrust代码,因此我们开始使用一个小例子来比较thrust::reduce_by_key
和cub::DeviceReduce::ReduceByKey
,两者都应用于thrust::device_vectors
。
代码的推力部分很好,但CUB部分,天真地使用通过thrust :: raw_pointer_cast获得的原始指针,在CUB调用后崩溃。我输入cudaDeviceSynchronize()
试图解决这个问题,但没有帮助。代码的CUB部分来自CUB网页。
在OSX上,运行时错误是:
libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
在Linux上,运行时错误是:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): an illegal memory access was encountered
cuda-memcheck的前几行是:
========= CUDA-MEMCHECK
========= Invalid __global__ write of size 4
========= at 0x00127010 in /home/sdettrick/codes/MCthrust/tests/../cub-1.3.2/cub/device/dispatch/../../block_range/block_range_reduce_by_key.cuh:1017:void cub::ReduceByKeyRegionKernel<cub::DeviceReduceByKeyDispatch<unsigned int*, unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int>::PtxReduceByKeyPolicy, unsigned int*, unsigned int*, float*, float*, int*, cub::ReduceByKeyScanTileState<float, int, bool=1>, cub::Equality, CustomSum, int>(unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int, cub::DeviceReduceByKeyDispatch<unsigned int*, unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int>::PtxReduceByKeyPolicy, unsigned int*, int, cub::GridQueue<int>)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x7fff7dbb3e88 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
不幸的是,我不太清楚该怎么办。
非常感谢任何帮助。我在NVIDIA开发者专区试过这个,但没有得到任何回复。完整的示例代码如下。它应该用CUDA 6.5和cub 1.3.2编译:
#include <iostream>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>
//========================================
// for CUB:
struct CustomSum
{
template <typename T>
CUB_RUNTIME_FUNCTION __host__ __device__ __forceinline__
//__host__ __device__ __forceinline__
T operator()(const T &a, const T &b) const {
return b+a;
}
};
//========================================
int main()
{
const int Nkey=20;
int Nseg=9;
int ikey[Nkey] = {0, 0, 0, 6, 8, 0, 2, 4, 6, 8, 1, 3, 5, 7, 8, 1, 3, 5, 7, 8};
thrust::device_vector<unsigned int> key(ikey,ikey+Nkey);
thrust::device_vector<unsigned int> keysout(Nkey);
// Let's reduce x, by key:
float xval[Nkey];
for (int i=0; i<Nkey; i++) xval[i]=ikey[i]+0.1f;
thrust::device_vector<float> x(xval,xval+Nkey);
// First, sort x by key:
thrust::sort_by_key(key.begin(),key.end(),x.begin());
//---------------------------------------------------------------------
std::cout<<"=================================================================="<<std::endl
<<" THRUST reduce_by_key:"<<std::endl
<<"=================================================================="<<std::endl;
thrust::device_vector<float> output(Nseg,0.0f);
thrust::reduce_by_key(key.begin(),
key.end(),
x.begin(),
keysout.begin(),
output.begin());
for (int i=0;i<Nkey;i++) std::cout << x[i] <<" "; std::cout<<std::endl;
for (int i=0;i<Nkey;i++) std::cout << key[i] <<" "; std::cout<<std::endl;
for (int i=0;i<Nseg;i++) std::cout << output[i] <<" "; std::cout<<std::endl;
float ototal=thrust::reduce(output.begin(),output.end());
float xtotal=thrust::reduce(x.begin(),x.end());
std::cout << "total="<< ototal <<", should be "<<xtotal<<std::endl;
//---------------------------------------------------------------------
std::cout<<"=================================================================="<<std::endl
<<" CUB ReduceByKey:"<<std::endl
<<"=================================================================="<<std::endl;
unsigned int *d_keys_in =thrust::raw_pointer_cast(&key[0]);
float *d_values_in =thrust::raw_pointer_cast(&x[0]);
unsigned int *d_keys_out =thrust::raw_pointer_cast(&keysout[0]);
float *d_values_out=thrust::raw_pointer_cast(&output[0]);
int *d_num_segments=&Nseg;
CustomSum reduction_op;
std::cout << "CUB input" << std::endl;
for (int i=0; i<Nkey; ++i) std::cout << key[i] << " "; std::cout<<std::endl;
for (int i=0; i<Nkey; ++i) std::cout << x[i] << " "; std::cout<< std::endl;
for (int i=0; i<Nkey; ++i) std::cout << keysout[i] << " "; std::cout<< std::endl;
for (int i=0; i<Nseg; ++i) std::cout << output[i] << " "; std::cout<< std::endl;
// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);
// Allocate temporary storage
cudaMalloc(&d_temp_storage, temp_storage_bytes);
std::cout << "temp_storage_bytes = " << temp_storage_bytes << std::endl;
// Run reduce-by-key
cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);
cudaDeviceSynchronize();
std::cout << "CUB output" << std::endl;
std::cout<<Nkey<<" "<<Nseg<<std::endl;
std::cout<<key.size() << " "<<x.size() << " "<<keysout.size() << " "<<output.size() << std::endl;
// At this point onward it dies:
//libc++abi.dylib: terminate called throwing an exception
//Abort trap: 6
// If the next line is uncommented, it crashes the Mac!
for (int i=0; i<Nkey; ++i) std::cout << key[i] << " "; std::cout<<std::endl;
// for (int i=0; i<Nkey; ++i) std::cout << x[i] << " "; std::cout<< std::endl;
// for (int i=0; i<Nkey; ++i) std::cout << keysout[i] << " "; std::cout<< std::endl;
// for (int i=0; i<Nseg; ++i) std::cout << output[i] << " "; std::cout<< std::endl;
cudaFree(d_temp_storage);
ototal=thrust::reduce(output.begin(),output.end());
xtotal=thrust::reduce(x.begin(),x.end());
std::cout << "total="<< ototal <<", should be "<<xtotal<<std::endl;
return 1;
}
答案 0 :(得分:2)
这不合适:
int *d_num_segments=&Nseg;
您无法获取主机变量的地址并将其用作设备指针。
而是这样做:
int *d_num_segments;
cudaMalloc(&d_num_segments, sizeof(int));
这会在设备上为数据大小(cub将写入的单个整数)分配空间,并将该分配的地址分配给d_num_segments
变量。然后,这将成为有效的设备指针。
在(*普通,非UM)CUDA中,非法取消引用设备代码中的主机地址或主机代码中的设备地址。