看看我自己编写的cuda内核。我有一个很大的内核,但它返回给我错误信息。然后我简化了它,发现它在一个循环中失败了。我简化了这个循环,发现如果我使用int值或常量值来填充循环中的数据[threadIdx.x],它可以正常工作。但是,如果我使用double类型值,则返回错误。
建议:如果您没有正确地将数据从主机复制到设备,则可以在使用Nsight时收到“警告:检测到Cuda API错误:cudaLaunch返回(0x7)”消息,或者您可以从终端运行应用程序时出现分段错误错误
__global__ void sumSeries(double* dSum,int* totalThreadNumber){
volatile __shared__ double data[768];
double var=0;
data[threadIdx.x]=0;
for ( int i = 10 ; i < 20 ;++i){
var=i;
data[threadIdx.x] += (var)/(var*var+1);
__syncthreads();
}
}
为什么不起作用?
int main() {
int threadsPerBlock=768;
int blockCount=8;
int *hostThreadNumber=new int ;
*hostThreadNumber=threadsPerBlock*blockCount;
int* deviceThreadNumber=NULL;
double* deviceSum=NULL;
double* hostSum=(double*)malloc(blockCount);
cudaError_t cuerr=cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
if (cuerr != cudaSuccess){
std::cout<<"Cant SetCacheConfig: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr=cudaMalloc(&deviceSum,blockCount*sizeof(double));//размер дабла*число блоков
if (cuerr != cudaSuccess){
std::cout<<"Cant allocate memory for deviceSum: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr=cudaMalloc(&deviceThreadNumber,sizeof(int));
if (cuerr != cudaSuccess){
std::cout<<"Cant allocate memory for deviceThreadNumber: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr = cudaMemcpy(deviceSum,hostSum,blockCount*sizeof(double),cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy hostSum to device: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr = cudaMemcpy(deviceThreadNumber,hostThreadNumber,sizeof(int),cudaMemcpyHostToDevice);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy hostThreadNumber to device: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
sumSeries<<<dim3(blockCount),dim3(threadsPerBlock)>>>(deviceSum,deviceThreadNumber);
cuerr=cudaGetLastError();
if (cuerr != cudaSuccess){
std::cout<<"Cuda kernel error: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr= cudaDeviceSynchronize();
if (cuerr != cudaSuccess){
std::cout<<"Can not synchronize cuda kernel : "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cuerr= cudaMemcpy(hostSum,deviceSum,blockCount*sizeof(double),cudaMemcpyDeviceToHost);
if (cuerr != cudaSuccess){
std::cout<<"Can not copy data to host: "<<cudaGetErrorString(cuerr)<<std::endl;
return -1;
}
cudaFree(deviceSum);
cudaFree(deviceThreadNumber);
return 0;
}
答案 0 :(得分:1)
您刚刚为hostSum
double* hostSum=(double*)malloc(blockCount)
如果我假设您要为其分配blockCount * sizeof(double)
个字节,那就错了,因为您为deviceSum
分配了这个内存量,并将其用于主机和设备之间的内存复制。
cuerr = cudaMalloc(&deviceSum,blockCount*sizeof(double));
cuerr = cudaMemcpy(deviceSum,hostSum,blockCount*sizeof(double),cudaMemcpyHostToDevice);
cuerr= cudaMemcpy(hostSum,deviceSum,blockCount*sizeof(double),cudaMemcpyDeviceToHost);