由于某种原因,我在某个特定内核中设置的断点被完全忽略了……我已经用 cudaGetLastError()
检查了错误状态,它告诉我一切运行正常,所以我很确定这应该意味着内核已被启动。添加 printf
语句也没有产生任何额外信息,因为什么都没有被打印出来。即使在调试模式下单步进入(step into)该内核,printf
调用也不起作用。这里可能出了什么问题?!
我们在Tesla M2075(驱动程序版本295.41)上运行Cuda 4.2。 调试时输出:
(cuda-gdb) break cudaCalcBeamIntersect
Breakpoint 1 at 0x401cfb: file cudacalcbeamintersect.cu, line 109.
(cuda-gdb) r
Starting program: /home/heit/cuda/vfind/vfind singleevent.txt 1 1 1
[Thread debugging using libthread_db enabled]
[New Thread 0x7ffff5dd5700 (LWP 20241)]
[Context Create of context 0x634220 on Device 0]
[Launch of CUDA Kernel 0 (memset32_post<<<(64,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (memset32_post<<<(8,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 2 (memset32_post<<<(64,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 3 (memset32_post<<<(1,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 4 (memset32_post<<<(1,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 5 (memset32_post<<<(8,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 6 (cudaInitializeGlobals<<<(256,1,1),(128,1,1)>>>) on Device 0]
no error
[Launch of CUDA Kernel 7 (cudaCalcBeamIntersect<<<(256,1,1),(128,1,1)>>>) on Device 0]
no error
Elapsed time: 0.876842 seconds.
[Thread 0x7ffff5dd5700 (LWP 20241) exited]
[Termination of CUDA Kernel 6 (cudaInitializeGlobals<<<(256,1,1),(128,1,1)>>>) on Device 0]
Program exited normally.
上面的 "no error" 是在每次内核调用之后,于主机代码中通过
cout << cudaGetErrorString(cudaGetLastError()) << '\n';
打印出来的,这似乎说明 cudaInitializeGlobals()(可以在 cuda-gdb 中单步执行)
和 cudaCalcBeamIntersect() 都正常执行了。然而,后者却无法调试。
有问题的内核还很简单,只是计算一些值并存储到(静态分配的)全局内存中。这些值之后没有在别处使用——编译器是否因此把整个内核调用完全优化掉了?如果是,为什么?又该如何防止这种行为?(-O0 无效)
干杯!
编辑 - 代码:
**代码调用内核**
uint const nEvents = events.size(); // total number of events
/* Not important ... */
// Allocate device memory to hold all events, laid out back-to-back.
Track *dev_events;
cudaError_t err = cudaMalloc(&dev_events, linearEvents.size() * sizeof(Track));
if (err != cudaSuccess)
{
    cout << "cudaMalloc failed: " << cudaGetErrorString(err) << '\n';
    exit(1);
}
// Copy all events to the GPU (blocking copy, so it also flushes earlier errors).
err = cudaMemcpy(dev_events, &linearEvents[0], linearEvents.size() * sizeof(Track), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
    cout << "cudaMemcpy failed: " << cudaGetErrorString(err) << '\n';
    exit(1);
}
// Initialize the global data, like the histogram and the array of z-values.
// NOTE(review): the first launch argument is the GRID size (blocks per grid)
// and the second is the BLOCK size (threads per block). If "tpb" stands for
// threads-per-block these two are swapped — confirm against their definitions.
cudaInitializeGlobals <<< tpb, bpg >>> ();
cout << cudaGetErrorString(cudaGetLastError()) << '\n'; // catches launch-config errors only
cout << "Processing " << nEvents << " event(s)\n";
uint linearIdx = 0;
for (uint event = 0; event != nEvents; ++event)
{
    uint nTracks = events[event].size();
    if (nTracks > MAX_NUMBER_OF_TRACKS)
    {
        cout << "Number of tracks in event " << event << " exceeds maximum number of tracks.\n";
        exit(1);
    }
    cudaCalcBeamIntersect <<< tpb, bpg >>> (dev_events + linearIdx, nTracks, bipThresh, binWidth);
    cout << cudaGetErrorString(cudaGetLastError()) << '\n';
    // BUG FIX: kernel launches are asynchronous. Without a synchronizing call
    // the host thread can finish and exit before the kernel ever executes on
    // the GPU — which is exactly why the breakpoint was never hit. Waiting
    // here also surfaces any asynchronous execution error.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        cout << "Kernel execution failed: " << cudaGetErrorString(err) << '\n';
        exit(1);
    }
    // Update linear index
    linearIdx += nTracks;
}
cudacalcbeamintersect.cu
#include "vfind.cuh"
// Statically allocated device-global state shared by the kernels in this file.
__device__ float dev_zMin;                          // running minimum vertex z (lowered in cudaCalcBeamIntersect)
__device__ float dev_zMax;                          // running maximum vertex z (raised in cudaCalcBeamIntersect)
__device__ float dev_zValues[MAX_NUMBER_OF_TRACKS]; // per-track vertex z, written when a track passes the distance cut
__device__ uint dev_histogram[MAX_NUMBER_OF_BINS];  // zeroed by cudaInitializeGlobals; filled elsewhere (not shown here)
// The beam, as a Track in constant memory. Presumably the first triple is a
// direction along +z and the second the origin — TODO confirm Track's field order.
__constant__ Track dev_beam =
{
    {0, 0, 1},
    {0, 0, 0}
};
// Resets the device-global state: the z min/max trackers and, via a
// grid-stride loop, the histogram and z-value arrays.
// Launch: any 1-D grid/block configuration works (grid-stride loop).
__global__ void cudaInitializeGlobals()
{
    uint const gid    = blockIdx.x * blockDim.x + threadIdx.x;
    uint const stride = gridDim.x * blockDim.x;

    // A single thread resets the running min/max sentinels.
    if (gid == 0)
    {
        dev_zMin = 1e6;
        dev_zMax = -1e6;
    }

    // Walk far enough to cover the longer of the two arrays; each index is
    // guarded so both arrays are cleared exactly once.
    uint const limit = (MAX_NUMBER_OF_BINS > MAX_NUMBER_OF_TRACKS)
                           ? MAX_NUMBER_OF_BINS
                           : MAX_NUMBER_OF_TRACKS;
    for (uint i = gid; i < limit; i += stride)
    {
        if (i < MAX_NUMBER_OF_BINS)
            dev_histogram[i] = 0;
        if (i < MAX_NUMBER_OF_TRACKS)
            dev_zValues[i] = 0;
    }
}
// Presumably the 3-component dot product of v1 and v2 — body elided in the
// original post, so the implementation cannot be confirmed here.
__device__ float dot(float const v1[3], float const v2[3])
{
    // Stuff
}
// Presumably some distance measure between the two tracks (used below to cut
// on closeness to the beam) — body elided in the original post.
__device__ float distance(Track const &t1, Track const &t2)
{
    // Even more boring unimportant stuff
}
// Presumably computes the (closest-approach) vertex of the two tracks; only
// its .z member is used below — body elided in the original post.
__device__ Vertex vertex(Track const &t1, Track const &t2)
{
    // Yet even more boring unimportant stuff
}
// Atomically lowers *addr to value when value is smaller.
// BUG FIX: the original "if (z < dev_zMin) atomicExch(...)" is a check-then-act
// race — the compare and the exchange are not one atomic operation, so two
// threads can both pass the test and the LARGER value can land last. A CAS
// loop makes the compare-and-store atomic. atomicCAS on 32-bit int is
// available on sm_20 (Tesla M2075) / CUDA 4.2.
__device__ static void atomicMinFloat(float *addr, float value)
{
    int old = __float_as_int(*addr);
    while (value < __int_as_float(old))
    {
        int const assumed = old;
        old = atomicCAS(reinterpret_cast<int *>(addr), assumed, __float_as_int(value));
        if (old == assumed)
            break; // we won the race; *addr now holds value
    }
}

// Atomically raises *addr to value when value is larger (mirror of the above).
__device__ static void atomicMaxFloat(float *addr, float value)
{
    int old = __float_as_int(*addr);
    while (value > __int_as_float(old))
    {
        int const assumed = old;
        old = atomicCAS(reinterpret_cast<int *>(addr), assumed, __float_as_int(value));
        if (old == assumed)
            break;
    }
}

// For every track closer to the beam than bipTresh, computes the z coordinate
// of its vertex with the beam, stores it in dev_zValues[idx], and maintains
// the global min/max of those z values.
// Launch: any 1-D grid/block configuration (grid-stride loop).
// Preconditions: nTracks <= MAX_NUMBER_OF_TRACKS (checked by the caller);
// dev_zMin/dev_zMax/dev_zValues initialized by cudaInitializeGlobals.
// binWidth is currently unused — the kernel is unfinished (see trailing comment).
__global__ void cudaCalcBeamIntersect(Track const *tracks, uint nTracks, float bipTresh, float binWidth)
{
    uint const tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint const nThreads = blockDim.x * gridDim.x;

    for (uint idx = tid; idx < nTracks; idx += nThreads)
    {
        float const dist = distance(tracks[idx], dev_beam);
        if (dist < bipTresh)
        {
            float const z = vertex(tracks[idx], dev_beam).z;
            atomicMinFloat(&dev_zMin, z); // was: racy "if (z < dev_zMin) atomicExch(...)"
            atomicMaxFloat(&dev_zMax, z);
            dev_zValues[idx] = z;
        }
    }

    // Safe: the loop exit is reached by every thread, so no divergence here.
    __syncthreads();
    // To be continued here
}
答案(得分:1):
@JorenHeit 你的内核 cudaCalcBeamIntersect
具有全局内存副作用,不应该被优化掉。根据贴出的 cuda-gdb 输出,看起来发起这些内核的主机线程并没有等待工作完成(既没有调用 cudaDeviceSynchronize(),
也没有通过 cudaMemcpy
从设备向主机拷回数据)。结果是,在 cudaCalcBeamIntersect
内核得以在 GPU 上执行之前,主机线程就已经退出了。请尝试在应用程序中每次内核启动之后添加一次 cudaDeviceSynchronize()
调用。