We have a single kernel (see below) that we fire off with a grid and block of 1,1:
kernel<<<1,1>>>
It then dynamically fires off lots of smaller kernels. In general, the data flows from kernel to kernel, with the input entering at the first kernel and flowing through to the end.
But we have identified an opportunity to overlap two streams of data, each running this identical kernel.
Question: Do we have to abandon dynamic kernel execution and go to a host-based approach to get overlap of execution for the two mega-kernels? Or is the scheduler on the card smart enough to interleave execution between the two mega-kernels and process each one as a separately scheduled item?
We are talking about a Tesla K80 on a Linux host.
(Yes, we will gain some overlap from cudaMemcpyAsync() overlapping with execution, but we would like to see some execution overlap as well.)
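To make the intent concrete, the sketch below is roughly what we have in mind. (Purely illustrative: gPipelineCh is a hypothetical variant of our gPipeline kernel that processes a single channel per launch; the real kernel currently loops over both channels internally.)
/* Hypothetical: one mega-kernel launch per data stream, each in its own
   non-default host stream, hoping the device overlaps their execution. */
cudaStream_t st0, st1;
cudaStreamCreate(&st0);
cudaStreamCreate(&st1);
gPipelineCh<<<1, 1, 0, st0>>>(GPU_CH0);  /* mega-kernel for channel 0 */
gPipelineCh<<<1, 1, 0, st1>>>(GPU_CH1);  /* mega-kernel for channel 1 */
cudaStreamSynchronize(st0);
cudaStreamSynchronize(st1);
Our current (single-launch) source is below: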
#include <cuda.h>
#include <cuda_runtime.h>
#include "coss_types.h"
#include "image.h"
#include "centroid.h"
#include "gpu.h"
#define GPU_TILE_WIDTH 16
#define GPU_TILE_HEIGHT 16
#define GPU_TILE_WBIG 32
#define GPU_TILE_HBIG 32
#define K_IMG_MAX 1024
__constant__ unsigned short* pFrameStack[GPU_CHX];
__constant__ unsigned short* pBackground[GPU_CHX];
__constant__ short* pCleanground[GPU_CHX];
__constant__ unsigned char* pMask[GPU_CHX];
__constant__ float* pForeground[GPU_CHX];
__constant__ float* pLowground[GPU_CHX];
__constant__ float* pLowgroundRow[GPU_CHX];
__constant__ float* pHighground[GPU_CHX];
__constant__ float* pHighgroundRow[GPU_CHX];
__constant__ float* pMins[GPU_CHX];
__constant__ float* pMaxs[GPU_CHX];
__constant__ int gSlot;
__constant__ int* pPercentile[GPU_CHX];
__constant__ int* pLabels1[GPU_CHX];
__constant__ int* pLabels2[GPU_CHX];
__constant__ int* pRawLabels[GPU_CHX];
__constant__ int* pLabels[GPU_CHX];
__constant__ ImgInfoBlock_t* pInfo[GPU_CHX];
__constant__ unsigned short* pSums[GPU_CHX];
__constant__ unsigned short* pBlockSums[GPU_CHX];
__constant__ ImgCentroid_t* pCenters[GPU_CHX];
__constant__ float threshold_sigma = 9.0f;
/* INCLUDED GENERATED CUDA CODE BELOW */
#include "cuda.cu"
/* INCLUDED GENERATED CUDA CODE ABOVE */
extern "C" __device__ void Background(int ch)
{
dim3 block;
dim3 grid;
/* Background Estimation */
block = dim3(128);
grid = dim3((IMG_PIXELS)/256); /* Only half screens at a time */
gMedian<<<grid,block>>>(
pFrameStack[ch],GPU_STACKSIZE,IMG_PIXELS,
pBackground[ch],IMG_HEIGHT,IMG_WIDTH,gSlot);
cudaDeviceSynchronize();
/* Background Removal */
block = dim3(128);
grid = dim3((IMG_PIXELS)/128);
gScrub<<<grid,block>>>(
pFrameStack[ch],GPU_STACKSIZE,IMG_PIXELS,
pBackground[ch],IMG_HEIGHT,IMG_WIDTH,
pCleanground[ch],IMG_HEIGHT,IMG_WIDTH,gSlot);
cudaDeviceSynchronize();
}
extern "C" __device__ void Convolution(int ch)
{
dim3 block;
dim3 grid;
dim3 block_b;
dim3 grid_b;
/* Convolve Rows */
block = dim3(GPU_TILE_WIDTH,GPU_TILE_HEIGHT);
grid = dim3(IMG_WIDTH/GPU_TILE_WIDTH,IMG_HEIGHT/GPU_TILE_HEIGHT);
gConvolveRow<<<grid,block>>>(
pCleanground[ch], IMG_HEIGHT,IMG_WIDTH,
pLowgroundRow[ch], IMG_HEIGHT,IMG_WIDTH);
block_b = dim3(GPU_TILE_WBIG,GPU_TILE_HBIG);
grid_b = dim3(IMG_WIDTH/GPU_TILE_WBIG,IMG_HEIGHT/GPU_TILE_HBIG);
gConvolveBigRow<<<grid_b,block_b>>>(
pCleanground[ch], IMG_HEIGHT,IMG_WIDTH,
pHighgroundRow[ch], IMG_HEIGHT,IMG_WIDTH);
/* Convolve Cols */
cudaDeviceSynchronize();
gConvolveCol<<<grid,block>>>(
pLowgroundRow[ch], IMG_HEIGHT,IMG_WIDTH,
pLowground[ch], IMG_HEIGHT,IMG_WIDTH);
gConvolveBigCol<<<grid_b,block_b>>>(
pHighgroundRow[ch], IMG_HEIGHT,IMG_WIDTH,
pHighground[ch], IMG_HEIGHT,IMG_WIDTH);
/* Band pass */
cudaDeviceSynchronize();
block = dim3(256,4);
grid = dim3(IMG_WIDTH / 256, IMG_HEIGHT / 4);
gBpass<<<grid,block>>>(
pLowground[ch], IMG_HEIGHT,IMG_WIDTH,
pHighground[ch], IMG_HEIGHT,IMG_WIDTH,
pForeground[ch], IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Threshold(int ch)
{
dim3 block;
dim3 grid;
/* Set the calibration sigma in Info Bloc */
pInfo[ch]->sigma = threshold_sigma;
/* Min Max kernels */
block = dim3(512, 2);
grid = dim3(IMG_WIDTH / 512, IMG_HEIGHT / 2);
gMinMax<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pMins[ch], 5 * K_IMG_MAX,
pMaxs[ch], 5 * K_IMG_MAX);
cudaDeviceSynchronize();
block = dim3(K_IMG_MAX);
grid = dim3(1);
gMinMaxMinMax<<<grid,K_IMG_MAX>>>(
pMins[ch], 5 * K_IMG_MAX,
pMaxs[ch], 5 * K_IMG_MAX,
(struct PipeInfoBlock*)pInfo[ch],1);
/* Histogram */
cudaDeviceSynchronize();
block = dim3(GPU_TILE_WBIG,GPU_TILE_HBIG);
grid = dim3(IMG_WIDTH/GPU_TILE_WBIG,IMG_HEIGHT/GPU_TILE_HBIG);
gHistogram<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pPercentile[ch],K_IMG_MAX,
(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
block = dim3(K_IMG_MAX);
grid = dim3(1);
gSumHistogram<<<grid,block>>>(pPercentile[ch],K_IMG_MAX);
cudaDeviceSynchronize();
gIQR<<<grid,block>>>(pPercentile[ch],K_IMG_MAX,(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
block = dim3(256,4);
grid = dim3(IMG_WIDTH / 256, IMG_HEIGHT / 4);
gThreshold<<<grid,block>>>(
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
(struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
}
extern "C" __device__ void Gluing(int ch)
{
dim3 block;
dim3 grid;
block = dim3(24, 24);
grid = dim3(IMG_WIDTH / 16, IMG_HEIGHT / 16);
gGlue<<<grid, block>>>(
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
pMask[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Labeling(int ch)
{
dim3 block;
dim3 grid;
/* CCL */
//block = dim3(1, 128);
//grid = dim3(1, IMG_HEIGHT / 128);
block = dim3(256,1);
grid = dim3(IMG_WIDTH/256,IMG_HEIGHT);
gCCL0<<<grid, block>>>(
pMask[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
block = dim3(24, 24);
grid = dim3(IMG_WIDTH / 16, IMG_HEIGHT / 16);
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
gCCLMerge<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Relabeling(int ch)
{
dim3 block;
dim3 grid;
/* Relabel */
block = dim3(160, 1);
grid = dim3(IMG_WIDTH / 160, IMG_HEIGHT / 1);
gScan<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pSums[ch],IMG_PIXELS);
cudaDeviceSynchronize();
grid = dim3(IMG_PIXELS / K_IMG_MAX);
gSum<<<grid, K_IMG_MAX>>>(pSums[ch],IMG_PIXELS, pBlockSums[ch], 5*K_IMG_MAX);
cudaDeviceSynchronize();
grid = dim3(1);
gSumBlocks<<<grid, K_IMG_MAX>>>(pBlockSums[ch], 5*K_IMG_MAX, (struct PipeInfoBlock*)pInfo[ch],1);
cudaDeviceSynchronize();
grid = dim3(IMG_PIXELS / K_IMG_MAX);
gFixSums<<<grid, K_IMG_MAX>>>(pSums[ch],IMG_PIXELS, pBlockSums[ch], 5*K_IMG_MAX);
cudaDeviceSynchronize();
block = dim3(160, 1);
grid = dim3(IMG_WIDTH / 160, IMG_HEIGHT / 1);
gRelabeler<<<grid, block>>>(
pRawLabels[ch],IMG_HEIGHT,IMG_WIDTH,
pSums[ch],IMG_PIXELS,
pLabels[ch], IMG_HEIGHT,IMG_WIDTH);
cudaDeviceSynchronize();
}
extern "C" __device__ void Centroiding(int ch)
{
dim3 block;
dim3 grid;
int starcount = IMG_STARS_MAX;
if (pInfo[ch]->starCount > 0 && pInfo[ch]->starCount < IMG_STARS_MAX)
{
starcount = pInfo[ch]->starCount;
/* Centroid */
block = dim3(32, 32);
grid = dim3(IMG_WIDTH / 32, IMG_HEIGHT / 32);
gCentroid<<<grid, block>>>(
pLabels[ch], IMG_HEIGHT,IMG_WIDTH,
pForeground[ch],IMG_HEIGHT,IMG_WIDTH,
(PipeCentroid *)pCenters[ch],starcount);
cudaDeviceSynchronize();
block = dim3(starcount);
gCentroidFinal<<<1, block>>>((PipeCentroid *)pCenters[ch],starcount);
cudaDeviceSynchronize();
}
else
{
pInfo[ch]->starCount = 0;
}
}
extern "C" __global__ void gPipeline(int gpuId)
{
int ch;
for(ch=0; ch < GPU_CHX; ch++)
{
Background(ch);
Convolution(ch);
Threshold(ch);
Gluing(ch);
Labeling(ch);
Relabeling(ch);
Centroiding(ch);
}
}
extern "C" {
static void ImgKernel_ClearBuffers(int32_t gpu, int32_t ch)
{
/* Clear Work Buffers */
cudaMemset(gInfo[gpu][ch],0,(int)sizeof(ImgInfoBlock_t));
cudaMemset(gCenters[gpu][ch],0,(int)sizeof(ImgCentroid_t)*IMG_STARS_MAX);
cudaMemset(gPercentile[gpu][ch],0,(int)sizeof(int32_t)*K_IMG_MAX);
cudaMemset(gLabels1[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gLabels2[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gRawLabels[gpu][ch],0,(int)sizeof(int32_t) *IMG_PIXELS);
cudaMemset(gSums[gpu][ch],0,(int)IMG_BYTES);
cudaMemset(gBlockSums[gpu][ch],0,(int)sizeof(uint16_t)*5*K_IMG_MAX);
}
void ImgKernel_Pipeline(int gpu)
{
cudaSetDevice(gpu);
cudaDeviceSynchronize();
/* Start a new run by clearing the buffers */
ImgKernel_ClearBuffers(gpu,GPU_CH0);
ImgKernel_ClearBuffers(gpu,GPU_CH1);
/* Update Constants */
cudaMemcpyToSymbol(gSlot,(void*)&slot,sizeof(slot));
cudaMemcpyToSymbol(threshold_sigma,(void*)&sigmaThreshold,sizeof(sigmaThreshold));
/* Start the next pipeline kernel */
gPipeline<<<1,1>>>(gpu);
}
#define LFILTER_LEN 15
static float lFilter[LFILTER_LEN] = { .0009f, .01f,
.02f, .05f, .08f, .10f, .1325f, .1411f, .1325f, .10f, .08f, .05f, .02f, .01f, .0009f };
#define HFILTER_LEN 31
static float hFilter[HFILTER_LEN] = {0.0002f, 0.0006f,
0.0025f, 0.0037f, 0.0053f, 0.0074f, 0.0099f, 0.0130f, 0.0164f,
0.0201f, 0.0239f, 0.0275f, 0.0306f, 0.0331f, 0.0347f, 0.0353f,
0.0347f, 0.0331f, 0.0306f, 0.0275f, 0.0239f, 0.0201f, 0.0164f,
0.0130f, 0.0099f, 0.0074f, 0.0053f, 0.0037f, 0.0025f, 0.0006f, 0.0002f};
static float32_t kernel[LFILTER_LEN];
static float32_t kernelBig[HFILTER_LEN];
static inline float32_t ImgKernel_FilterSum(float* arr, int32_t len)
{
int32_t i;
float32_t sum = 0.0f;
for (i=0;i<len;i++) sum += arr[i];
return sum;
}
void ImgKernel_Setup(int gpu)
{
int32_t i,ch;
float32_t sum = 0;
sum = ImgKernel_FilterSum(lFilter,LFILTER_LEN);
for (i = 0; i < LFILTER_LEN; i++) kernel[i] = lFilter[i] / sum;
sum = ImgKernel_FilterSum(hFilter,HFILTER_LEN);
for (i = 0; i < HFILTER_LEN; i++) kernelBig[i] = hFilter[i] / sum;
/* One time copy of locations into GPU constant memory */
cudaMemcpyToSymbol(gkernel, (void*)&kernel, sizeof(float32_t)*LFILTER_LEN);
cudaMemcpyToSymbol(gkernelBig, (void*)&kernelBig, sizeof(float32_t)*HFILTER_LEN);
cudaMemcpyToSymbol(pFrameStack,(void*)&gFrameStack[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pBackground,(void*)&gBackground[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pCleanground,(void*)&gCleanground[gpu][0], sizeof(int16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLowground, (void*)&gLowground[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLowgroundRow,(void*)&gLowgroundRow[gpu][0],sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pHighground,(void*)&gHighground[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pHighgroundRow,(void*)&gHighgroundRow[gpu][0],sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pForeground,(void*)&gForeground[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pMask, (void*)&gMask[gpu][0], sizeof(uint8_t*)*GPU_CHX);
cudaMemcpyToSymbol(pPercentile,(void*)&gPercentile[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pMins, (void*)&gMins[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pMaxs, (void*)&gMaxs[gpu][0], sizeof(float32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLabels1, (void*)&gLabels1[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLabels2, (void*)&gLabels2[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pRawLabels, (void*)&gRawLabels[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pLabels, (void*)&gLabels[gpu][0], sizeof(int32_t*)*GPU_CHX);
cudaMemcpyToSymbol(pInfo, (void*)&gInfo[gpu][0], sizeof(ImgInfoBlock_t*)*GPU_CHX);
cudaMemcpyToSymbol(pSums, (void*)&gSums[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pBlockSums, (void*)&gBlockSums[gpu][0], sizeof(uint16_t*)*GPU_CHX);
cudaMemcpyToSymbol(pCenters, (void*)&gCenters[gpu][0], sizeof(ImgCentroid_t*)*GPU_CHX);
for (ch = 0; ch < GPU_CHX; ch++)
{
/* Clear the working buffers */
ImgKernel_ClearBuffers(gpu,ch);
}
}
}
1 Answer

Answer (score: 1):
For two dynamic-parallelism kernels launched in separate host streams, both the parent and the child kernels should be able to coexist (i.e., execute concurrently).
How to get things to run concurrently is a common question. Once all of the requirements are met, whether you actually witness concurrent kernel execution comes down to the resources each kernel consumes: threads per block, total number of thread blocks, registers, and shared memory are examples of resource types that, if consumed by one kernel, may prevent concurrent execution of another kernel, even when every requirement has been satisfied.
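If it helps, the relevant capacity figures can be queried at runtime. Here is a minimal sketch (standard cudaDeviceProp fields; device index 0 is just an example, and a K80 actually exposes two devices):
#include <cstdio>
#include <cuda_runtime.h>
int main(){
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);            /* query device 0 */
  printf("concurrentKernels   : %d\n",  prop.concurrentKernels);
  printf("multiProcessorCount : %d\n",  prop.multiProcessorCount);
  printf("maxThreadsPerSM     : %d\n",  prop.maxThreadsPerMultiProcessor);
  printf("regsPerBlock        : %d\n",  prop.regsPerBlock);
  printf("sharedMemPerBlock   : %zu\n", prop.sharedMemPerBlock);
  return 0;
}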
The machine does not have infinite capacity. Once the machine's capacity has been consumed, exposing additional parallelism (for example, by trying to launch independent kernels concurrently) may not yield any improvement.
GPU scheduling behavior can affect this as well, as Greg pointed out. Depending on the specific GPU and CUDA version, and possibly other factors, two kernels that each have a large number of thread blocks may not appear to execute "concurrently", because all of one kernel's thread blocks may be scheduled before any thread block of the other kernel is scheduled. In my opinion, this behavior is just another manifestation of the resource question. (Note also that the scheduling of thread blocks from separate kernels can be influenced by stream priorities.)
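As an aside, here is a minimal sketch of creating prioritized streams, in case you want to experiment with them; the example below does not depend on this:
int leastPri, greatestPri;
cudaDeviceGetStreamPriorityRange(&leastPri, &greatestPri);   /* numerically lower value = higher priority */
cudaStream_t stHi, stLo;
cudaStreamCreateWithPriority(&stHi, cudaStreamNonBlocking, greatestPri);
cudaStreamCreateWithPriority(&stLo, cudaStreamNonBlocking, leastPri);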
However, if we are careful to limit resource usage, the parent and child kernels of two dynamic-parallelism launches can coexist, i.e., execute concurrently. Here is a worked example (CUDA 7, Fedora 20, GeForce GT640 cc3.5 GPU):
$ cat t815.cu
#include <stdio.h>
#define DELAY_VAL 5000000000ULL
__global__ void child(){
unsigned long long start = clock64();
while (clock64()< start+DELAY_VAL);
}
__global__ void parent(){
child<<<1,1>>>();
}
int main(int argc, char* argv[]){
cudaStream_t st1, st2;
cudaStreamCreate(&st1);
cudaStreamCreate(&st2);
parent<<<1,1,0,st1>>>();
if (argc > 1){
printf("running double kernel\n");
parent<<<1,1,0,st2>>>();
}
cudaDeviceSynchronize();
}
$ nvcc -arch=sm_35 -rdc=true -lcudadevrt t815.cu -o t815
$ time ./t815
3.65user 1.88system 0:05.65elapsed 97%CPU (0avgtext+0avgdata 82192maxresident)k
0inputs+0outputs (0major+2812minor)pagefaults 0swaps
$ time ./t815 double
running double kernel
3.68user 1.83system 0:05.64elapsed 97%CPU (0avgtext+0avgdata 82200maxresident)k
0inputs+0outputs (0major+2814minor)pagefaults 0swaps
$ time cuda-memcheck ./t815
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
3.16user 2.25system 0:05.68elapsed 95%CPU (0avgtext+0avgdata 87040maxresident)k
0inputs+0outputs (0major+4573minor)pagefaults 0swaps
$ time cuda-memcheck ./t815 double
========= CUDA-MEMCHECK
running double kernel
========= ERROR SUMMARY: 0 errors
7.27user 3.04system 0:10.46elapsed 98%CPU (0avgtext+0avgdata 87116maxresident)k
0inputs+0outputs (0major+4594minor)pagefaults 0swaps
$
In this case, we see that if I do not use cuda-memcheck, the execution time is roughly the same (~5.6 s) whether I run one or two copies of the (parent) kernel in separate host streams. Since the execution time is the same, the inescapable conclusion is that these kernels are executing concurrently (both the parent and the child kernels). This is not terribly surprising, because these kernels have very low resource usage (a single thread block each, a single thread each, very low register usage, no shared memory usage).
On the other hand, if I run the same test under cuda-memcheck, there is obvious serialization: the time for a single kernel launch is relatively unaffected, but the two "concurrent" kernel launches take approximately twice as long.