问题1: 如果动态共享内存仅由子内核使用,那么在启动父内核时是否也必须指定要分配的动态共享内存大小?
问题2: 以下是我的子内核和父内核
父内核
/**
 * Parent kernel (CUDA dynamic parallelism): every thread of the launching
 * block starts its own ConvolutionColumn child grid.
 *
 * Each thread derives a shift from threadIdx.x * ShiftAmount, rounds it to the
 * nearest integer and passes only the fractional remainder (Delay) on to the
 * child grid. The child works on the sub-image that starts InputImageOffset
 * elements into DeviceInput / LightFieldDevice; that offset is computed from
 * threadIdx.x, threadIdx.y and DataInput.X / .Y / .U.
 *
 * The child grid is launched as 16x14 blocks of 32x24 threads with enough
 * dynamic shared memory for three block-sized float tiles. The parent kernel
 * itself needs no dynamic shared memory.
 *
 * Errors are reported with device-side printf for both the launch itself
 * (cudaGetLastError) and the completion of the child grid
 * (cudaDeviceSynchronize). Previously the synchronize status was discarded,
 * so a failure inside the child grid was never reported from this kernel.
 *
 * NOTE(review): calling cudaDeviceSynchronize() from device code is deprecated
 * since CUDA 11.6 and is not available when compiling for newer toolkits /
 * architectures - confirm the target toolkit before relying on it.
 *
 * @param DataInput        Problem dimensions (project type DataIn), passed through to the child.
 * @param DeviceInput      Base pointer of the input data in device memory.
 * @param ShiftAmount      Shift per unit of threadIdx.x.
 * @param LightFieldDevice Base pointer of the output data in device memory.
 */
__global__ void ColumnFractionalShift(DataIn DataInput, float* __restrict__ DeviceInput, float ShiftAmount, float* __restrict__ LightFieldDevice)
{
    // Split the per-thread shift into its nearest-integer part and the
    // fractional remainder; only the remainder is handed to the child grid.
    const float ImageShift = threadIdx.x * ShiftAmount;
    const float ImageIntegerShift = nearbyintf(ImageShift);
    const float Delay = ImageShift - ImageIntegerShift;

    // Offset of the sub-image this thread is responsible for.
    const int InputImageOffset = DataInput.X*DataInput.Y*DataInput.U*(threadIdx.y) + DataInput.X*DataInput.Y*(threadIdx.x);

    dim3 dimBlock(32, 24);
    dim3 dimGrid(16, 14);

    // The child kernel stores three block-sized float tiles in dynamic shared memory.
    const size_t SharedBytes = sizeof(float) * 3 * dimBlock.x * dimBlock.y;

    ConvolutionColumn<<<dimGrid, dimBlock, SharedBytes>>>(DataInput, DeviceInput + InputImageOffset, Delay, LightFieldDevice + InputImageOffset);

    // Launch-time problems (bad configuration, pending-launch limit, ...).
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        printf("failed %s\n", cudaGetErrorString(status));
    }

    // Wait for the child grid and report failures that occur while it runs.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        printf("child grid failed %s\n", cudaGetErrorString(status));
    }

    // Debug output: first result element of the sub-images handled by threadIdx.x == 5.
    if (threadIdx.x == 5)
    {
        printf("The values at beginig of %d %d are %f\n", threadIdx.x, threadIdx.y, *(LightFieldDevice + InputImageOffset));
    }
}
子内核
// Child kernel: filters the input along the threadIdx.y direction and adds the
// filter responses, weighted by increasing powers of Delay, to the input value.
//
// Per output element the result is
//   centre + sum over k = 1 .. KernalNoOfFilters-1 of Delay^k * (filter k applied around centre)
//
// Dynamic shared memory: three consecutive tiles of blockDim.x*blockDim.y floats
// (previous rows, centre rows, next rows), so the launch must provide at least
// 3 * blockDim.x * blockDim.y * sizeof(float) bytes.
//
// DataInput.V is used as the element stride between consecutive rows and
// DataInput.U as the number of valid rows.
// DeviceFilterSize and DeviceFilterKernal are not declared in this block;
// presumably device-visible arrays defined elsewhere in the file - TODO confirm.
//
// NOTE(review): there is no guard on the column index, so the code assumes
// gridDim.x*blockDim.x never exceeds the row width - confirm against the caller.
//
// DataInput    - problem dimensions and filter description (project type DataIn)
// DeviceInput  - input data (read)
// Delay        - fractional shift; its k-th power weights filter k
// DeviceResult - output data (written only for rows below DataInput.U)
__global__ void ConvolutionColumn(DataIn DataInput,float* __restrict__ DeviceInput, float Delay, float* __restrict__ DeviceResult)
{
// Dynamically sized shared buffer holding the three tiles back to back.
extern __shared__ float ConvolutionBlockLeft[];
// Offset of this block's first element (row blockDim.y*blockIdx.y, column blockIdx.x*blockDim.x).
int BlockStart = blockDim.y*blockIdx.y*DataInput.V + blockIdx.x*blockDim.x;
//int BlockEnd = BlockStart+(blockDim.x*blockDim.y)-1;
// Flat index of this thread inside one shared-memory tile (original note: 32 by 24 block).
int PixelId = blockDim.x*threadIdx.y + threadIdx.x; //32 by 24 kernal
// Offset of this thread's element relative to BlockStart in global memory.
int LoadPixelId = DataInput.V*threadIdx.y + threadIdx.x;
int LoadLeft,LoadRght,LoadCentre;
float KernalSum;
// Running power of Delay; equals Delay^k at the start of iteration k below.
float DelayPower = Delay;
// Tile 0 (previous rows). For the first block row there is nothing before it,
// so the block's own rows are read in reverse order; otherwise the rows one
// block height (blockDim.y rows) earlier are read.
if (blockIdx.y == 0)
{
LoadLeft = DataInput.V*(blockDim.y - threadIdx.y-1) + threadIdx.x;
}
else
{
LoadLeft = LoadPixelId - (DataInput.V*blockDim.y);
}
*(ConvolutionBlockLeft + (threadIdx.y*blockDim.x) + threadIdx.x) = *(DeviceInput + BlockStart + LoadLeft);
// Tile 1 (centre rows). Rows at or beyond DataInput.U are reflected back from
// the last valid row (U - 1); this index is absolute, not relative to BlockStart.
if (blockIdx.y*blockDim.y + threadIdx.y >= DataInput.U)
{
LoadCentre = ((DataInput.U - 1)*DataInput.V) + (blockDim.x*blockIdx.x) + threadIdx.x - ((blockIdx.y*blockDim.y + threadIdx.y) - DataInput.U)*DataInput.V;
}
else
{
LoadCentre = BlockStart+LoadPixelId;
}
*(ConvolutionBlockLeft + (blockDim.x*blockDim.y) + (threadIdx.y*blockDim.x) + threadIdx.x) = *(DeviceInput + LoadCentre);
// Tile 2 (next rows, blockDim.y rows further on), with the same reflection
// at the end of the valid row range.
if (blockIdx.y*blockDim.y + threadIdx.y + blockDim.y >= DataInput.U)
{
LoadRght = ((DataInput.U - 1)*DataInput.V) + (blockDim.x*blockIdx.x) + threadIdx.x - ((((blockIdx.y*blockDim.y) + threadIdx.y + blockDim.y) - DataInput.U)*DataInput.V);
}
else
{
LoadRght = BlockStart+LoadPixelId + (DataInput.V*blockDim.y);
}
//float tempfil, tempdata;
//int t;
*(ConvolutionBlockLeft + (2 * blockDim.x*blockDim.y) + (threadIdx.y*blockDim.x) + threadIdx.x) = *(DeviceInput + LoadRght);
// All three tiles must be complete before any thread reads its neighbours' values.
__syncthreads();
// Start from the unfiltered centre value of this thread's element.
float FilterSum = *(ConvolutionBlockLeft + ((blockDim.x*blockDim.y) + PixelId));
// Filter 0 is skipped; filters 1 .. KernalNoOfFilters-1 are accumulated.
for (int k = 1; k < DataInput.KernalNoOfFilters; k++)
{
KernalSum = 0;
//printf("The value of filter size is %d\n", (DeviceFilterSize[k]));
// l runs over the taps of filter k, centred on zero. Each step of l moves one
// row (blockDim.x elements) inside the shared tiles.
// NOTE(review): assumes the filter half-length does not exceed blockDim.y, so
// the accesses stay inside the three tiles - confirm.
for (int l = -((*(DeviceFilterSize + k) - 1) / 2); l < ((*(DeviceFilterSize + k) + 1) / 2); l++)
{
//tempfil = *(DeviceFilterKernal + k*DataInput.KernalFilterLength + ((*(DeviceFilterSize + k) - 1) / 2) + l);
//t = (blockDim.x*blockDim.y) + PixelId + (l*blockDim.x);
//tempdata = *(ConvolutionBlockLeft + ((blockDim.x*blockDim.y) + PixelId - (l*blockDim.x)));
// Coefficient (half-length + l) of filter k times the value l rows before the centre element.
KernalSum += *(DeviceFilterKernal + k*DataInput.KernalFilterLength + ((*(DeviceFilterSize + k) - 1) / 2) + l)**(ConvolutionBlockLeft + ((blockDim.x*blockDim.y) + PixelId - (l*blockDim.x)));
}
// Weight the response of filter k by Delay^k, then advance the power for the next filter.
KernalSum *= DelayPower;
DelayPower *= Delay;
FilterSum += KernalSum;
}
// Only rows inside the valid range produce output.
if (blockIdx.y*blockDim.y + threadIdx.y < DataInput.U)
{
*(DeviceResult + LoadPixelId + BlockStart) = FilterSum;
}
}
单独从主机启动时,这个子内核工作正常。但是当它从另一个内核(父内核)启动时,主机端在启动父内核之后调用的 cudaDeviceSynchronize()
会返回"unspecified launch failure"(未指定的启动失败)错误(该错误并不是由内核中的 printf 打印出来的)。
父内核的启动配置为 <<<1, (17, 17)>>>
,即 1 个线程块、17×17 个线程。如果只允许父内核中的一个线程启动子网格,则代码可以正常工作。一个线程块中可以启动的网格数量是否有限制?
答案 0 :(得分:1)
如果共享内存仅由子内核使用,是否必须指定在启动父内核时分配的动态共享内存量。
不,您只在需要/使用它的实际内核调用上指定动态共享内存分配。您不必考虑父内核启动时的子内核需求。
一个线程块中可以启动的网格数量是否有限制?
没有此类硬性限制。存在一个 cudaLimitDevRuntimePendingLaunchCount 限制
,但其默认值为 2048,超过了您 17 * 17 = 289 次启动的请求,而且您看到的似乎也不是该限制对应的错误。(这是一个动态限制:即使父内核发起的启动次数超过该值,也不一定会触发此限制/错误,这取决于实际的动态启动行为。)