我的程序使用了CUDA Radix排序类。从CUDA 4.0升级到4.2之后,该类的辅助初始化函数崩溃,并显示消息“堆栈变量'devprop'已损坏”。我通过逐步注释掉部分功能代码来定位问题,发现是cudaGetDeviceProperties破坏了devprop。我不知道为什么会这样,也不知道该如何解决。我的环境是CUDA 4.2、开发驱动程序301.32、Nsight 2.2、Windows 7 64位,编译目标为Win32。以下代码段包含发生崩溃的initDeviceParameters()辅助函数:
namespace nvRadixSort
{
#include "radixsort.h"
#include "cudpp/cudpp.h"
#include <stdio.h>
#include <assert.h>
// Tuning flag written by initDeviceParameters(): set to true when the current
// device's smVersion (major * 10 + minor) is below 12.
bool bManualCoalesce = false;
// Tuning flag written by initDeviceParameters(): set to true when the current
// device's smVersion (major * 10 + minor) is below 20.
bool bUsePersistentCTAs = false;
// Chooses the radix-sort tuning flags for the CUDA device that is current on
// the calling host thread.
//
// keysOnly: not used by the code shown here.
//
// On success this writes the file-level flags bManualCoalesce and
// bUsePersistentCTAs. If either runtime query fails, both flags keep their
// previous values, so no decision is ever derived from an uninitialized
// cudaDeviceProp.
//
// NOTE(review): per the answer accompanying this snippet, the "stack around
// 'devprop' corrupted" crash was caused by building with CUDA 4.0 build rules
// while CUDA 4.2 was installed; keep the toolkit used for compilation and the
// installed runtime at the same version.
void initDeviceParameters(bool keysOnly)
{
    (void)keysOnly;

    int deviceID = -1;
    if (cudaGetDevice(&deviceID) != cudaSuccess)
    {
        return;
    }

    cudaDeviceProp devprop;
    // The status of this call used to be ignored; a failure would have left
    // devprop uninitialized and the flags below would be computed from garbage.
    const cudaError_t propStatus = cudaGetDeviceProperties(&devprop, deviceID);
    if (propStatus != cudaSuccess)
    {
        fprintf(stderr, "initDeviceParameters(): cudaGetDeviceProperties failed: %s\n",
                cudaGetErrorString(propStatus));
        return;
    }

    // Encode the compute capability as a two-digit number, e.g. 2.1 -> 21.
    int smVersion = devprop.major * 10 + devprop.minor;

    // sm_12 and later devices don't need help with coalesce in reorderData kernel
    bManualCoalesce = (smVersion < 12);
    bUsePersistentCTAs = (smVersion < 20);

    if (bUsePersistentCTAs)
    {
        //Irrelevant. My setup is 2.1
    }
}
}
这是相关的类代码:
#include <stdio.h>
#include <cuda_runtime_api.h>
#include "cudpp/cudpp.h"
namespace nvRadixSort
{
class RadixSort
{
public:
// Builds a sorter and immediately sets it up through
// initialize(maxElements, keysOnly).
//
// maxElements: forwarded to initialize() as the element count.
// keysOnly:    forwarded to initialize(); when true, initialize() does not
//              allocate mTempValues.
//
// Every handle and pointer member starts at 0. mCudppContext is included so
// that it holds a defined value even if cudppCreate() inside initialize()
// does not succeed (it was previously left uninitialized).
RadixSort(unsigned int maxElements, bool keysOnly = false)
    : mCudppContext(0),
      mScanPlan(0),
      mNumElements(0),
      mTempKeys(0),
      mTempValues(0),
      mCounters(0),
      mCountersSum(0),
      mBlockOffsets(0)
{
    // Allocate temporary storage
    initialize(maxElements, keysOnly);
}
protected: // data
// State owned by the sorter; the CUDPP handles and device buffers below are
// created in initialize().
CUDPPHandle mCudppContext; // CUDPP context handle, filled in by cudppCreate() in initialize()
CUDPPHandle mScanPlan; // CUDPP plan handle for prefix sum
unsigned int mNumElements; // Number of elements of temp storage allocated
unsigned int *mTempKeys; // Intermediate storage for keys
unsigned int *mTempValues; // Intermediate storage for values
unsigned int *mCounters; // Counter for each radix
unsigned int *mCountersSum; // Prefix sum of radix counters
unsigned int *mBlockOffsets; // Global offsets of each radix in each block
protected: // methods
// Prepares the sorter for up to numElements items: picks the device tuning
// flags, creates the CUDPP context and the exclusive forward add-scan plan,
// and allocates the temporary device buffers.
//
// numElements: element count; stored in mNumElements and used to size the
//              temporary buffers.
// keysOnly:    when true, mTempValues is not allocated.
//
// The results of cudppCreate() and cudppPlan() are checked. A failure is
// reported on stderr, and the plan is not created from a context that failed
// to initialize. The cudaMalloc calls are followed by
// checkCudaError("RadixSort::initialize()") as before.
void initialize(unsigned int numElements, bool keysOnly)
{
    // initialize parameters based on present CUDA device
    initDeviceParameters(keysOnly);

    // Allocate temporary storage
    mNumElements = numElements;

    // Ceiling divisions: a whole quotient, plus one block for any remainder.
    unsigned int numBlocks = ((numElements % (CTA_SIZE * 4)) == 0) ?
        (numElements / (CTA_SIZE * 4)) : (numElements / (CTA_SIZE * 4) + 1);
    unsigned int numBlocks2 = ((numElements % (CTA_SIZE * 2)) == 0) ?
        (numElements / (CTA_SIZE * 2)) : (numElements / (CTA_SIZE * 2) + 1);

    // Initialize scan
    if (cudppCreate(&mCudppContext) != CUDPP_SUCCESS)
    {
        // Without a valid context the plan cannot be built; skip it instead of
        // handing an invalid handle to cudppPlan().
        fprintf(stderr, "RadixSort::initialize(): cudppCreate failed\n");
    }
    else
    {
        CUDPPConfiguration scanConfig;
        scanConfig.algorithm = CUDPP_SCAN;
        scanConfig.datatype = CUDPP_UINT;
        scanConfig.op = CUDPP_ADD;
        scanConfig.options = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_FORWARD;

        if (cudppPlan(mCudppContext, &mScanPlan, scanConfig, 16 * numBlocks2, 1, 0) != CUDPP_SUCCESS)
        {
            fprintf(stderr, "RadixSort::initialize(): cudppPlan failed\n");
        }
    }

    cudaMalloc((void **)&mTempKeys, numElements * sizeof(unsigned int));
    if (!keysOnly)
    {
        cudaMalloc((void **)&mTempValues, numElements * sizeof(unsigned int));
    }

    // The three per-radix buffers all have the same size.
    cudaMalloc((void **)&mCounters, WARP_SIZE_ * numBlocks * sizeof(unsigned int));
    cudaMalloc((void **)&mCountersSum, WARP_SIZE_ * numBlocks * sizeof(unsigned int));
    cudaMalloc((void **)&mBlockOffsets, WARP_SIZE_ * numBlocks * sizeof(unsigned int));

    checkCudaError("RadixSort::initialize()");
}
}
答案 0 :(得分:1)
问题解决了。虽然已经安装了CUDA 4.2,但我的Visual Studio项目仍在使用CUDA 4.0的构建规则和工具。把项目文件改为使用新版本(4.2)的构建规则和工具之后,问题就消失了。