优化高斯(Gaussian)模糊滤波器

时间:2016-02-11 21:27:09

标签: c image-processing optimization arm neon

我需要对大尺寸源图像应用高斯滤波。我已经实现了下面的算法,并使用 ARM NEON 进行了优化,获得了显著的性能提升,但仍需进一步改进才能达到实时处理的要求。如果还有改进空间,请特别针对 NEON 代码提出建议。我觉得我的 NEON 代码还没有完全优化,应该可以一次处理 16 个像素。我是 NEON 初学者,写不出很好的代码;如果有人能提供改进后的代码,对我会非常有帮助。

void BlurRow( src, dest, gausian )
{
     process each pixel from src and calculate destination pixel value r g b a  
     by calling ComputeFinalPixelvalue
}

void BlurImage( src, dest )
{
   for each row call BlurRow with Gaussian kernel gx
   transpose matrix
   for each row call BlurRow with Gaussian kernel gy
   transpose matrix
}

// Adds the four 32-bit lanes of an accumulator into one scalar.
// Uses only ARMv7-compatible intrinsics, so it also builds for AArch64.
static inline uint32_t SumLanesU32(uint32x4_t lanes)
{
    const uint32x2_t pairSum = vadd_u32(vget_low_u32(lanes), vget_high_u32(lanes));
    return vget_lane_u32(vpadd_u32(pairSum, pairSum), 0);
}

/**
 * Accumulates the weighted sum of each byte channel over a run of 4-byte pixels.
 *
 * For every pixel i in [0, pixelcount) the four bytes of sourcePixels[i]
 * (in memory order) are multiplied by pGaussElements[i] and the products are
 * ADDED to rvalue, gvalue, bvalue and avalue respectively; the outputs are
 * not reset, so callers must initialise them.
 *
 * @param sourcePixels   pixelcount packed 4-byte pixels; byte 0 feeds rvalue,
 *                       byte 1 gvalue, byte 2 bvalue, byte 3 avalue.
 *                       NOTE(review): assumes R,G,B,A byte order in memory,
 *                       as the original channel naming implies - confirm.
 * @param pixelcount     number of pixels and of weights to process.
 * @param pGaussElements pixelcount 16-bit kernel weights (read only).
 * @param rvalue,gvalue,bvalue,avalue  running sums, updated in place.
 *
 * Sums are kept in 32 bits and wrap on overflow; each product is below 2^24.
 */
void ComputeFinalPixelvalue(const uint32_t* sourcePixels,
                            uint32_t pixelcount, uint16_t* pGaussElements,
                            uint32_t& rvalue, uint32_t& gvalue, uint32_t& bvalue, uint32_t& avalue )
{
    const uint32_t kPixelsPerBlock = 8;
    const uint32_t kBytesPerPixel = 4;

    // One independent accumulator per channel. The original code chained
    // R/B/A onto the wrong accumulators and never wrote the alpha sum.
    uint32x4_t sumR = vdupq_n_u32(0);
    uint32x4_t sumG = vdupq_n_u32(0);
    uint32x4_t sumB = vdupq_n_u32(0);
    uint32x4_t sumA = vdupq_n_u32(0);

    const uint8_t* srcBytes = reinterpret_cast<const uint8_t*>(sourcePixels);
    const uint16_t* weights = pGaussElements;

    const uint32_t blockCount = pixelcount / kPixelsPerBlock;
    for (uint32_t block = 0; block < blockCount; ++block)
    {
        // De-interleave 8 pixels: val[c] holds byte c of each of the 8 pixels.
        const uint8x8x4_t px = vld4_u8(srcBytes);

        // Load the 8 matching kernel weights.
        const uint16x8_t w = vld1q_u16(weights);
        const uint16x4_t wLo = vget_low_u16(w);
        const uint16x4_t wHi = vget_high_u16(w);

        // Widen each channel to 16 bits once, then use the widening
        // multiply-accumulate (u16 * u16 -> u32). This yields the same sums as
        // widening both operands to 32 bits first, with fewer instructions.
        const uint16x8_t c0 = vmovl_u8(px.val[0]);
        sumR = vmlal_u16(sumR, vget_low_u16(c0), wLo);
        sumR = vmlal_u16(sumR, vget_high_u16(c0), wHi);

        const uint16x8_t c1 = vmovl_u8(px.val[1]);
        sumG = vmlal_u16(sumG, vget_low_u16(c1), wLo);
        sumG = vmlal_u16(sumG, vget_high_u16(c1), wHi);

        const uint16x8_t c2 = vmovl_u8(px.val[2]);
        sumB = vmlal_u16(sumB, vget_low_u16(c2), wLo);
        sumB = vmlal_u16(sumB, vget_high_u16(c2), wHi);

        const uint16x8_t c3 = vmovl_u8(px.val[3]);
        sumA = vmlal_u16(sumA, vget_low_u16(c3), wLo);
        sumA = vmlal_u16(sumA, vget_high_u16(c3), wHi);

        srcBytes += kPixelsPerBlock * kBytesPerPixel;
        weights += kPixelsPerBlock;
    }

    uint32_t r = SumLanesU32(sumR);
    uint32_t g = SumLanesU32(sumG);
    uint32_t b = SumLanesU32(sumB);
    uint32_t a = SumLanesU32(sumA);

    // Scalar tail: the vector loop only covers whole blocks of 8, so handle
    // the remaining (pixelcount % 8) taps here instead of dropping them.
    const uint32_t tailCount = pixelcount % kPixelsPerBlock;
    for (uint32_t i = 0; i < tailCount; ++i)
    {
        const uint32_t weight = weights[i];
        const uint8_t* p = srcBytes + i * kBytesPerPixel;
        r += p[0] * weight;
        g += p[1] * weight;
        b += p[2] * weight;
        a += p[3] * weight;
    }

    rvalue += r;
    gvalue += g;
    bvalue += b;
    avalue += a;
}

0 个答案:

没有答案