我正在使用cuda toolkit 8.0在Ubuntu 16.04中使用CUDA开发一个应用程序。
我的问题是关于gtx960m(功能5.0)可以包含的每个块的线程数。我正在尝试使用每个块中的最大线程数,因此,我使用cudaGetDeviceProperties()函数来获取此信息(1024个线程,您可以在documentation中看到)但是当我在我的内核中每个块使用超过512个线程,API返回错误代码0x7(“警告:检测到Cuda API错误:cudaLaunch返回(0x7)”),这意味着“启动资源”。
#include <random>
#include <curand.h>
#include <curand_kernel.h>
#define min(a,b) (a<b?a:b);
__global__ void bootstrap_V1(int nSamples, int sampleFraction, int seed, unsigned int* sampleIDs, unsigned int* inbagCounts){
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int offset = gridDim.x * blockDim.x;
/*Generating a random number in a specific ranger:
1- Use CURAND to generate a uniform distribution between 0.0 and 1.0
2- Then multiply this by the desired range (largest value - smallest value + 0.999999).
3- Then add the offset (+ smallest value).
4- Then truncate to an integer.
curandState state;
curand_init(seed, tid, 0, &state);
while(tid < nSamples*sampleFraction){
float randf = curand_uniform(&state);
randf *= ((nSamples - 1) - 0) + 0.999999;
randf += 0;
int rand = (int)truncf(randf);
sampleIDs[tid] = rand;
atomicAdd(&(inbagCounts[rand]), 1);
tid += offset;
int main(void) {
int nSamples = 100;
int sampleFraction = 1;
std::random_device rd;
std::mt19937_64 gen(rd());
cudaError_t error;
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int blocks = prop.multiProcessorCount;
int maxThreadsPerBlock = prop.maxThreadsPerBlock;
int seed = dist(gen);
unsigned int *sampleIDs = (unsigned int *)malloc(nSamples * sampleFraction * sizeof(int));
unsigned int *inbagCounts = (unsigned int *)malloc(nSamples * sizeof(int));
unsigned int *dev_sampleIDs, *dev_inbagCounts;
error = cudaMalloc((void **)&dev_sampleIDs, nSamples*sampleFraction*sizeof(int));
error = cudaMalloc((void **)&dev_inbagCounts, nSamples*sizeof(int));
error = cudaMemset(dev_sampleIDs, 0, nSamples*sampleFraction*sizeof(int));
error = cudaMemset(dev_inbagCounts, 0, nSamples*sizeof(int));
if (error != cudaSuccess)
printf("%s\n", cudaGetErrorString(error));
int threadsPerBlock = min(maxThreadsPerBlock, nSamples);
bootstrap_V1<<<blocks,threadsPerBlock>>>(nSamples, sampleFraction, seed, dev_sampleIDs, dev_inbagCounts);
cudaMemcpy(sampleIDs, dev_sampleIDs, nSamples*sampleFraction*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(inbagCounts, dev_inbagCounts, nSamples*sizeof(int), cudaMemcpyDeviceToHost);
/usr/local/cuda-8.0/bin/nvcc -G -g -O0 -std=c++11 -gencode arch=compute_50,code=sm_50 -odir "." -M -o "main.d" "../main.cu
/usr/local/cuda-8.0/bin/nvcc -G -g -O0 -std=c++11 --compile --relocatable-device-code=false -gencode arch=compute_50,code=compute_50 -gencode arch=compute_50,code=sm_50 -x cu -o "main.o" "../main.cu"
/usr/local/cuda-8.0/bin/nvcc --cudart static --relocatable-device-code=false -gencode arch=compute_50,code=compute_50 -gencode arch=compute_50,code=sm_50 -link -o "prueba" ./main.o
根据要求,PTAX Verbose:
ptxas info : 77696 bytes gmem, 72 bytes cmem[3]
ptxas info : Function properties for cudaDeviceGetAttribute
16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Compiling entry function '_Z12bootstrap_V1iiiPjS_' for 'sm_50'
ptxas info : Function properties for _Z12bootstrap_V1iiiPjS_
6560 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 128 registers, 6560 bytes cumulative stack size, 352 bytes cmem[0], 8 bytes cmem[2]
ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
32 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for _ZN4dim3C2Ejjj
16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads
ptxas info : Function properties for cudaMalloc
16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for cudaGetDevice
8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for _ZN4dim3C1Ejjj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
40 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for _ZN61_INTERNAL_39_tmpxft_000013a3_00000000_7_main_cpp1_ii_055b743a9atomicAddEPjj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for cudaFuncGetAttributes
16 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
此数字必须小于或等于SM中的可用寄存器。最大可用寄存器在CUDA编程指南的table 13中给出(以及GPU的deviceQuery
输出中)。对于cc 5.0 GPU,每个SM的最大寄存器为64K。对于您的代码,每个线程128个寄存器* 1024个线程产生128K寄存器,这将无法工作并导致您看到的错误(“启动请求的资源太多”)。如果您在代码中执行了proper CUDA error checking,则会收到这个基于文本的错误消息,而不是简单的“0x7”错误。
解决此问题的典型方法限制了GPU编译器寄存器的使用。最好的方法是使用launch bounds。使用-maxrregcount
编译器也存在粗略方法。 -maxrregcount
的规范是Nsight Eclipse Edition中的可选选项。