我有一个设备浮点数组,我尝试用THRUST用键来对它进行排序:
#include <thrust/sort.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
template <typename T>
__host__ T* deepCopyDeviceArray(T* dev_array, int arraysize)
{
// performs a deep copy of a device array and returns the copy's device pointer
cudaError_t cudaStatus;
T* dev_copiedArray;
cudaStatus = cudaMalloc((void**)&dev_copiedArray, (arraysize * sizeof(T)));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "deep copy cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_copiedArray, dev_array, (arraysize * sizeof(T)), cudaMemcpyDeviceToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "deep copy cudaMemcpy failed!");
}
return dev_copiedArray;
}
template <typename T>
int* sortByKeyOnDevice(T* dev_keys, int len, const int* valuesarray)
{
// sorts keysarray and returns the sorted indices
T* dev_keys2 = deepCopyDeviceArray(dev_keys, len); // make deep copy to evade change of original keys
// make deep copy of values and copy it to device
int* dev_values;
cudaMalloc((void **) &dev_values, len);
cudaMemcpy(dev_values, valuesarray, len * sizeof(int), cudaMemcpyHostToDevice);
// create device pointers
thrust::device_ptr<T> dev_ptr_keys = thrust::device_pointer_cast(dev_keys2);
thrust::device_ptr<int> dev_ptr_values = thrust::device_pointer_cast(dev_values);
thrust::sort_by_key(dev_ptr_keys, dev_ptr_keys + len, dev_ptr_values);
//thrust::device_free(dev_ptr_keys);
cudaFree(dev_keys2);
return dev_values; // return only indices of sorted array
}
int main()
{
int len = 10;
float* array1 = new float[len]; for (int i=0;i<len;i++) array1[i] = rand();
float* dev_array1;
cudaMalloc(&dev_array1, len * sizeof(float));
cudaMemcpy(dev_array1, array1, (len * sizeof(float)), cudaMemcpyHostToDevice);
int* valuesarray = new int[len]; for (int i=0; i<len; i++) valuesarray[i] = i;
int* dev_values;
dev_values = sortByKeyOnDevice(dev_array1, len, valuesarray);
int* values = new int[len];
cudaMemcpy(values, dev_values, (len * sizeof(int)), cudaMemcpyDeviceToHost); // or use dev_values in a kernel for further calculations
}
执行此main,THRUST抛出异常:“thrust :: system :: system_error at memory location 0x00DAF5D4。”
我不想使用推力装置矢量,因为这里不需要它们。根据THRUST文档,device_ptr可以按上述方式使用(我引用this question)。
我做错了什么?
答案 0 :(得分:1)
每当您遇到CUDA代码时遇到问题,都应该在每个 CUDA API调用和内核调用上放置proper cuda error checking(对于推力调用,您不需要这样做,他们有他们自己的错误报告机制)。您还可以使用cuda-memcheck
运行代码,即使您没有明确检查它们,也会显示API错误。
如果您已经这样做了,您会发现这行代码报告了API错误(无效参数):
cudaMemcpy(dev_values, valuesarray, len * sizeof(int), cudaMemcpyHostToDevice);
查看之前的行,您的尺寸参数不正确:
cudaMalloc((void **) &dev_values, len);
应该是:
cudaMalloc((void **) &dev_values, len*sizeof(int));
通过这项更改,您的代码可以编译并为我运行而不会出现任何错误。
顺便说一句,在发布代码时,请正确格式化(缩进)以便其他人阅读。