Question

我有一个设备浮点数组，我尝试用THRUST用键来对它进行排序：

#include <thrust/sort.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>    

template <typename T>
__host__ T* deepCopyDeviceArray(T* dev_array, int arraysize)
{
// performs a deep copy of a device array and returns the copy's device pointer

cudaError_t cudaStatus;

T* dev_copiedArray;

cudaStatus = cudaMalloc((void**)&dev_copiedArray, (arraysize * sizeof(T)));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "deep copy cudaMalloc failed!");
}

cudaStatus = cudaMemcpy(dev_copiedArray, dev_array, (arraysize * sizeof(T)), cudaMemcpyDeviceToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "deep copy cudaMemcpy failed!");
}

return dev_copiedArray;
}



template <typename T>
int* sortByKeyOnDevice(T* dev_keys, int len, const int* valuesarray)
{
// sorts keysarray and returns the sorted indices
T* dev_keys2 = deepCopyDeviceArray(dev_keys, len); // make deep copy to evade change of original keys

// make deep copy of values and copy it to device
int* dev_values;
cudaMalloc((void **) &dev_values, len);
cudaMemcpy(dev_values, valuesarray, len * sizeof(int), cudaMemcpyHostToDevice);

// create device pointers
thrust::device_ptr<T> dev_ptr_keys = thrust::device_pointer_cast(dev_keys2);
thrust::device_ptr<int> dev_ptr_values = thrust::device_pointer_cast(dev_values);

thrust::sort_by_key(dev_ptr_keys, dev_ptr_keys + len, dev_ptr_values);

//thrust::device_free(dev_ptr_keys);
cudaFree(dev_keys2);
return dev_values; // return only indices of sorted array
}

int main()
{
int len = 10;
float* array1 = new float[len]; for (int i=0;i<len;i++) array1[i] = rand();

float* dev_array1;
cudaMalloc(&dev_array1, len * sizeof(float));
cudaMemcpy(dev_array1, array1, (len * sizeof(float)), cudaMemcpyHostToDevice);

int* valuesarray = new int[len]; for (int i=0; i<len; i++) valuesarray[i] = i;
int* dev_values;

dev_values = sortByKeyOnDevice(dev_array1, len, valuesarray);

int* values = new int[len];
cudaMemcpy(values, dev_values, (len * sizeof(int)), cudaMemcpyDeviceToHost); // or use dev_values in a kernel for further calculations
}

执行此main，THRUST抛出异常：“thrust :: system :: system_error at memory location 0x00DAF5D4。”

我不想使用推力装置矢量，因为这里不需要它们。根据THRUST文档，device_ptr可以按上述方式使用（我引用this question）。

我做错了什么？

Answer 1

每当您遇到CUDA代码时遇到问题，都应该在每个 CUDA API调用和内核调用上放置proper cuda error checking（对于推力调用，您不需要这样做，他们有他们自己的错误报告机制）。您还可以使用cuda-memcheck运行代码，即使您没有明确检查它们，也会显示API错误。

如果您已经这样做了，您会发现这行代码报告了API错误（无效参数）：

cudaMemcpy(dev_values, valuesarray, len * sizeof(int), cudaMemcpyHostToDevice);

查看之前的行，您的尺寸参数不正确：

cudaMalloc((void **) &dev_values, len);

应该是：

cudaMalloc((void **) &dev_values, len*sizeof(int));

通过这项更改，您的代码可以编译并为我运行而不会出现任何错误。

顺便说一句，在发布代码时，请正确格式化（缩进）以便其他人阅读。

按设备上的按键排序引发错误

1 个答案: