我需要对大尺寸图像源应用高斯滤波。我已经实现了下面的算法,并使用 NEON 做了优化,获得了显著的性能提升,但仍需进一步改进才能达到实时处理的要求。如果还有改进空间,请提出一些建议,尤其是针对 NEON 代码的建议。我觉得我的 NEON 代码还没有完全优化,也许可以改为一次处理 16 个像素。我是 NEON 的初学者,还写不出很好的代码;如果有人能提供改进后的代码,将对我非常有帮助。
void BlurRow( src, dest, gausian )
{
process each pixel from src and calculate destination pixel value r g b a
by calling ComputeFinalPixelvalue
}
void BlurImage( src, dest )
{
for each row call BlurRow with Gaussian kernel gx
transpose matrix
for each row call BlurRow with Gaussian kernel gy
transpose matrix
}
/// Adds the four 32-bit lanes of `v` together and returns the scalar total.
static inline uint32_t HorizontalSumU32(uint32x4_t v)
{
    // Pairwise adds are available on both ARMv7 and AArch64
    // (vaddvq_u32 would be shorter but is AArch64-only).
    uint32x2_t pairSum = vadd_u32(vget_low_u32(v), vget_high_u32(v));
    pairSum = vpadd_u32(pairSum, pairSum);
    return vget_lane_u32(pairSum, 0);
}

/// Accumulates the weighted per-channel sums of `pixelcount` packed 4x8-bit pixels.
///
/// For every pixel i, each of its four bytes is multiplied by pGaussElements[i]
/// and added to the matching output accumulator.
///
/// @param sourcePixels    Pointer to `pixelcount` 32-bit pixels. The bytes of each
///                        pixel are read in memory order as channel 0..3 and mapped
///                        to r, g, b, a respectively.
///                        NOTE(review): assumes R,G,B,A byte order in memory - confirm
///                        against the image format used by the caller.
/// @param pixelcount      Number of pixels (and weights) to process. Any value is
///                        accepted; a remainder that does not fill a block of 8 is
///                        handled by a scalar loop.
/// @param pGaussElements  Pointer to `pixelcount` 16-bit weights (not modified).
/// @param rvalue,gvalue,bvalue,avalue
///                        In/out accumulators; the computed sums are ADDED to the
///                        values already stored in them.
///
/// Sums are kept in 32 bits, so the caller must make sure the total of
/// (channel * weight) over the window fits in uint32_t.
void ComputeFinalPixelvalue(const uint32_t* sourcePixels,
uint32_t pixelcount, uint16_t* pGaussElements,
uint32_t& rvalue, uint32_t& gvalue, uint32_t& bvalue, uint32_t& avalue )
{
    const uint8_t* channelBytes = reinterpret_cast<const uint8_t*>(sourcePixels);
    const uint16_t* weights = pGaussElements;

    // One independent accumulator per channel; each one must only ever be
    // chained onto itself.
    uint32x4_t sumR = vdupq_n_u32(0);
    uint32x4_t sumG = vdupq_n_u32(0);
    uint32x4_t sumB = vdupq_n_u32(0);
    uint32x4_t sumA = vdupq_n_u32(0);

    const uint32_t vectorBlocks = pixelcount / 8;
    for (uint32_t block = 0; block < vectorBlocks; ++block)
    {
        // De-interleaving load: val[c] holds channel c of 8 consecutive pixels.
        const uint8x8x4_t px = vld4_u8(channelBytes);

        // 8 weights, kept at 16 bits; vmlal_u16 widens during the multiply,
        // so no explicit 32-bit widening of pixels or weights is needed.
        const uint16x8_t w = vld1q_u16(weights);
        const uint16x4_t wLow = vget_low_u16(w);
        const uint16x4_t wHigh = vget_high_u16(w);

        // channel 0
        const uint16x8_t r16 = vmovl_u8(px.val[0]);
        sumR = vmlal_u16(sumR, vget_low_u16(r16), wLow);
        sumR = vmlal_u16(sumR, vget_high_u16(r16), wHigh);

        // channel 1
        const uint16x8_t g16 = vmovl_u8(px.val[1]);
        sumG = vmlal_u16(sumG, vget_low_u16(g16), wLow);
        sumG = vmlal_u16(sumG, vget_high_u16(g16), wHigh);

        // channel 2
        const uint16x8_t b16 = vmovl_u8(px.val[2]);
        sumB = vmlal_u16(sumB, vget_low_u16(b16), wLow);
        sumB = vmlal_u16(sumB, vget_high_u16(b16), wHigh);

        // channel 3
        const uint16x8_t a16 = vmovl_u8(px.val[3]);
        sumA = vmlal_u16(sumA, vget_low_u16(a16), wLow);
        sumA = vmlal_u16(sumA, vget_high_u16(a16), wHigh);

        channelBytes += 8 * 4; // 8 pixels, 4 bytes each
        weights += 8;
    }

    uint32_t r = HorizontalSumU32(sumR);
    uint32_t g = HorizontalSumU32(sumG);
    uint32_t b = HorizontalSumU32(sumB);
    uint32_t a = HorizontalSumU32(sumA);

    // Scalar tail: the pixels left over when pixelcount is not a multiple of 8.
    const uint32_t remaining = pixelcount - vectorBlocks * 8;
    for (uint32_t i = 0; i < remaining; ++i)
    {
        const uint32_t weight = weights[i];
        r += static_cast<uint32_t>(channelBytes[4 * i + 0]) * weight;
        g += static_cast<uint32_t>(channelBytes[4 * i + 1]) * weight;
        b += static_cast<uint32_t>(channelBytes[4 * i + 2]) * weight;
        a += static_cast<uint32_t>(channelBytes[4 * i + 3]) * weight;
    }

    rvalue += r;
    gvalue += g;
    bvalue += b;
    avalue += a;
}