我为3x3 Sobel滤波器编写了一个OpenCL内核,目前它在2k x 2k图像上运行大约17毫秒。这并不像我希望的那么快;有没有人对如何提高速度有任何建议?我已经按照checklist for optimizing kernels的大部分建议。我的处理器是Intel i5-3450。工作组大小为8x8,工作项数为height x width / 16,在我运行的图像上为2048 x 128.
__kernel void localCacheSobelFilter(
const __global char16* src,
__write_only __global float16* angle,
__write_only __global float16* mag,
const int width,
const int height)
{
// Cache the data we're looking at in __local space
const int row = get_global_id(0);
const int col = get_global_id(1);
const int cacheRow = get_local_id(0) + 1;
const int cacheCol = get_local_id(1) + 1;
__local char16 cache[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
cache[cacheRow][cacheCol] = src[ indexOf(row, col) ];
// --- Deal with the boundary conditions
// This adds in the rows above and below the local block,
// ignoring the corners.
const bool atTopRow = (cacheRow == 1);
const bool atBottomRow = (cacheRow == BLOCK_SIZE);
if(atTopRow) {
cache[0][cacheCol] = src[ indexOf(row - 1, col) ];
} else if (atBottomRow) {
cache[BLOCK_SIZE + 1][cacheCol] = src[ indexOf(row + 1, col) ];
}
// This adds in the columns to the left and right of the local block,
// ignoring the corners.
const bool atLeftCol = (cacheCol == 1);
const bool atRightCol = (cacheCol == BLOCK_SIZE);
if(atLeftCol) {
cache[cacheRow][0].sf = src[ indexOf(row, col - 1) ].sf;
} else if (atRightCol) {
cache[cacheRow][BLOCK_SIZE + 1].s0 = src[ indexOf(row, col + 1) ].s0;
}
// Now finally check the corners
const bool atTLCorner = atTopRow && atLeftCol;
const bool atTRCorner = atTopRow && atRightCol;
const bool atBLCorner = atBottomRow && atLeftCol;
const bool atBRCorner = atBottomRow && atRightCol;
if(atTLCorner) {
cache[0][0].sf = src[ indexOf(row - 1, col - 1) ].sf;
} else if (atTRCorner) {
cache[0][BLOCK_SIZE + 1].s0 = src[ indexOf(row - 1, col + 1) ].s0;
} else if (atBLCorner) {
cache[BLOCK_SIZE + 1][0].sf = src[ indexOf(row + 1, col - 1) ].sf;
} else if (atBRCorner) {
cache[BLOCK_SIZE + 1][BLOCK_SIZE + 1].s0 = src[ indexOf(row + 1, col + 1) ].s0;
}
barrier(CLK_LOCAL_MEM_FENCE);
//===========================================================================
// Do the calculation
// [..., pix00] upperRow [pix02, ...]
// [..., pix10] centerRow [pix12, ...]
// [..., pix20] lowerRow [pix22, ...]
const char pix00 = cache[cacheRow - 1][cacheCol - 1].sf;
const char pix10 = cache[cacheRow ][cacheCol - 1].sf;
const char pix20 = cache[cacheRow + 1][cacheCol - 1].sf;
const char16 upperRow = cache[cacheRow - 1][cacheCol];
const char16 centerRow = cache[cacheRow ][cacheCol];
const char16 lowerRow = cache[cacheRow + 1][cacheCol];
const char pix02 = cache[cacheRow - 1][cacheCol + 1].s0;
const char pix12 = cache[cacheRow ][cacheCol + 1].s0;
const char pix22 = cache[cacheRow + 1][cacheCol + 1].s0;
// Do the calculations for Gy
const char16 upperRowShiftLeft = (char16)(upperRow.s123456789abcdef, pix02);
const char16 upperRowShiftRight = (char16)(pix00, upperRow.s0123456789abcde);
const char16 lowerRowShiftLeft = (char16)(lowerRow.s123456789abcdef, pix22);
const char16 lowerRowShiftRight = (char16)(pix20, lowerRow.s0123456789abcde);
const float16 Gy = convert_float16(
(upperRowShiftLeft + 2 * upperRow + upperRowShiftRight)
- (lowerRowShiftLeft + 2 * lowerRow + lowerRowShiftRight));
// Do the calculations for Gx
const char16 centerRowShiftLeft = (char16)(centerRow.s123456789abcdef, pix12);
const char16 centerRowShiftRight = (char16)(pix10, centerRow.s0123456789abcde);
const float16 Gx = convert_float16(
(upperRowShiftRight + 2 * centerRowShiftRight + lowerRowShiftRight)
- (upperRowShiftLeft + 2 * centerRowShiftLeft + lowerRowShiftLeft));
// Find the angle and magnitude
angle[ indexOf(row, col) ] = 0.0; //atan2(Gy, Gx);
mag[ indexOf(row, col) ] = ALPHA * max(Gx, Gy) + BETA * min(Gx, Gy);
}
非常感谢任何帮助。谢谢!
答案 0 :(得分:2)
在你的内核中你有很多“if”来避免边缘效应。但99%的时间是耗时且无用的。 我认为您可以在NDRangeKernel中更改全局工作大小,并使用偏移量来避免边缘效应。
例: 偏移量= {1,1,0} GlobalWorkSize = {width-2,height-2,0}