我是CUDA性能分析的初学者。我想生成一条时间轴,显示每个SM以及在执行期间分配给它的线程块。
类似的东西:
我读到过可以读取%smid寄存器,但我不知道如何把它结合到我想要测试的代码中,也不知道如何将它与线程块或时间关联起来。
答案 0 :(得分:1)
// Returns the value of the PTX special register %smid, i.e. the identifier of
// the streaming multiprocessor (SM) the calling thread is executing on at the
// moment of the read.
__noinline__ __device__ uint get_smid(void)
{
    uint ret;
    // "%%" emits a literal '%' in an asm template that also has operands
    // ("%0"). The statement is marked volatile so the compiler neither merges
    // nor hoists reads: the PTX ISA documents %smid as volatile (its value can
    // change if the thread is rescheduled).
    asm volatile("mov.u32 %0, %%smid;" : "=r"(ret));
    return ret;
}
来源here。
答案 1 :(得分:1)
完整代码超出了本答案的范围,因此本答案为您提供了实现块跟踪的构建块。
可视化数据
分配一个大小为“线程块数 × 16 字节”的设备缓冲区。每条16字节的记录存储开始时间戳和结束时间戳,其中5位的smid(SM编号)被打包进开始时间戳的低位。
// Reads the PTX special register %smid and returns it as a 32-bit unsigned
// value.
static __device__ inline uint32_t __smid()
{
    uint32_t sm_index;
    asm volatile("mov.u32 %0, %%smid;"
                 : "=r"(sm_index));
    return sm_index;
}
// Timestamp source used for block tracing.
//   - compute capability >= 3.0 (Kepler and Maxwell): %globaltimer, as read here
//   - compute capability 2.x (Fermi): use clock64 instead
// Returns the 64-bit value of the PTX special register %globaltimer.
static __device__ inline uint64_t __timestamp()
{
    uint64_t now;
    asm volatile("mov.u64 %0, %%globaltimer;"
                 : "=l"(now) );
    return now;
}
// Skeleton kernel that records, per thread block, when the block started and
// finished and which SM it ran on.
//
// pBlockTime: device buffer holding two uint64_t slots (16 bytes) per block of
//             the launched grid, indexed by the flattened block index b:
//               pBlockTime[2*b + 0] = start timestamp with the SM id packed
//                                     into its low kSmidBits bits
//               pBlockTime[2*b + 1] = end timestamp
//             The buffer must therefore hold at least
//             gridDim.x * gridDim.y * gridDim.z * 2 elements.
__global__ void blocktime(uint64_t* pBlockTime)
{
    // START TIMESTAMP
    const uint64_t startTime = __timestamp();

    // flatBlockIdx should be adjusted to 1D, 2D, and 3D launches to minimize
    // overhead. Reduce to uint32_t if the launch index does not exceed 32 bits.
    // The first operand is widened to 64 bits BEFORE multiplying so the
    // products cannot wrap around in 32-bit arithmetic on large grids.
    const uint64_t flatBlockIdx =
        (static_cast<uint64_t>(blockIdx.z) * gridDim.y + blockIdx.y) * gridDim.x
        + blockIdx.x;

    // Only one thread per block records the start. Reduce this test based upon
    // the dimensions of the block to minimize overhead.
    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
    {
        // The SM id replaces the low kSmidBits bits of the start timestamp;
        // per the original note, the lower 5 bits of globaltimer are junk.
        // If using clock64 and you want to improve precision, use the most
        // significant 4-5 bits instead.
        // NOTE(review): 5 bits distinguish at most 32 SMs; on a device with
        // more SMs the id is truncated here, so widen the field (and accept
        // the lost timer bits) or store the id separately.
        const uint64_t kSmidBits = 5;
        const uint64_t kSmidMask = (1ull << kSmidBits) - 1;

        const uint64_t smid = __smid();
        // Clear the low bits of the timestamp, then insert the SM id there.
        const uint64_t data = (startTime & ~kSmidMask) | (smid & kSmidMask);
        pBlockTime[flatBlockIdx * 2 + 0] = data;
    }

    // do work
    // I would recommend changing your current __global__ function to be
    // a __device__ function and calling it here. This will result in easier
    // handling of kernels that have multiple exit points.

    // END TIMESTAMP
    // All threads in the block write to the same slot, so the stored value is
    // whichever write lands last. This is not very efficient. Depending on the
    // kernel this can be reduced to 1 thread or 1 thread per warp.
    const uint64_t endTime = __timestamp();
    pBlockTime[flatBlockIdx * 2 + 1] = endTime;
}