转置SSE2向量

时间:2018-12-26 18:17:22

标签: c sse

我正在尝试用 SSE2 和 C 对图像做卷积,以实现小波分解。该图像有 4 个通道(Lab + alpha),在内存中按像素交错连续存储:[LabA] [LabA] [LabA] ... Alpha 通道与我在这里要做的事情无关。

访问像素非常简单,只需从一个每次递增 4 的指针处加载内容即可:

/**
 * Row pass of the wavelet decomposition on interleaved 4-float pixels.
 *
 * For every pixel, a 5-tap weighted average of the pixel itself and its
 * neighbours at offsets -2, -1, +1, +2 (as resolved by ASAN_ROW) is computed.
 * The average is streamed to `tmp` (low-frequency layer) and the difference
 * between the pixel and the average is streamed to `detail` (high-frequency
 * layer).
 *
 * @param out     not used by the visible body
 * @param in      input pixels, 4 floats per pixel
 * @param detail  receives the high-frequency layer, 4 floats per pixel
 * @param scale   not used by the visible body
 * @param sharpen forwarded to weight_sse2()
 * @param width   number of pixels per row
 * @param height  number of rows
 *
 * _mm_load_ps / _mm_stream_ps require 16-byte aligned addresses.
 *
 * NOTE(review): `tmp`, `filter`, `mult`, `max_height_i`, ASAN_ROW and
 * weight_sse2 are not defined in this excerpt -- confirm they are in scope.
 */
static void eaw_decompose_sse2(float *const out,
                               const float *const in,
                               float *const detail,
                               const int scale,
                               const float sharpen,
                               const size_t width,
                               const size_t height)
{
/* Convolve rows */
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
  for(size_t j = 0; j < height; j++)
  {
    for(size_t i = 0; i < width; i++)
    {
      // offset (in floats) of the current pixel: 4 floats per pixel
      const size_t inc = (j * width + i) * 4;
      float *pdetail = detail + inc;
      float *pcoarse = tmp + inc;

      // pixel to be convolved; the centre tap gets the plain filter coefficient
      const __m128 pin0 = _mm_load_ps(in + inc);
      const __m128 w_0 = _mm_set1_ps(filter[2]);

      // neighbours
      const __m128 pin1 = _mm_load_ps(in + ASAN_ROW(i, j, -2, mult, max_height_i, width));
      const __m128 pin2 = _mm_load_ps(in + ASAN_ROW(i, j, -1, mult, max_height_i, width));
      const __m128 pin3 = _mm_load_ps(in + ASAN_ROW(i, j, 1, mult, max_height_i, width));
      const __m128 pin4 = _mm_load_ps(in + ASAN_ROW(i, j, 2, mult, max_height_i, width));

      // neighbours contribution: each weight is derived from the very
      // neighbour it is later multiplied with (pinN <-> w_N)
      const __m128 w_1 = _mm_set1_ps(filter[0]) * weight_sse2(pin0, pin1, sharpen);
      const __m128 w_2 = _mm_set1_ps(filter[1]) * weight_sse2(pin0, pin2, sharpen);
      const __m128 w_3 = _mm_set1_ps(filter[3]) * weight_sse2(pin0, pin3, sharpen);
      const __m128 w_4 = _mm_set1_ps(filter[4]) * weight_sse2(pin0, pin4, sharpen);

      // Filter computation: weighted sum normalised by the total weight.
      // _mm_rcp_ps is an approximate reciprocal.
      const __m128 wgt = w_1 + w_2 + w_3 + w_4 + w_0;
      const __m128 sum = (w_1 * pin1 + w_2 * pin2 + w_3 * pin3 + w_4 * pin4 + w_0 * pin0) * _mm_rcp_ps(wgt);

      // High frequency layer
      _mm_stream_ps(pdetail, pin0 - sum);

      // Low frequency layer
      _mm_stream_ps(pcoarse, sum);
    }
  }
}

函数 ASAN_ROW 使指针沿行滑动,并确保我们停留在边界之内;如果越界,则取最近的邻居。weight_sse2 计算高斯权重,由于 L 与 a/b 的权重不同,其中包含一些比较复杂的移位/混洗操作。

因此我在想,与其对 4 个最后一个元素被浪费的 Lab SSE 向量进行操作,不如改为对 3 个 SSE 向量(vector)进行操作会更快:每个向量对应一个 Lab 通道,向量中的每个元素对应一个相邻像素。这样代码就变成了:

/**
 * Row pass of the wavelet decomposition, neighbour-based variant.
 *
 * Same output as the channel-based variant (coarse layer to `tmp`, detail
 * layer to `detail`), but the weights are evaluated on per-channel vectors
 * whose four lanes are the four neighbours. The layout switch is done with
 * _MM_TRANSPOSE4_PS in both directions:
 *   pixels   [L a b A] x 4 neighbours  ->  L_f, a_f, b_f (one lane per neighbour)
 *   weights  w_L, w_c, w_c, 0          ->  w_1..w_4      (one lane per channel)
 *
 * @param out     not used by the visible body
 * @param in      input pixels, 4 floats per pixel
 * @param detail  receives the high-frequency layer, 4 floats per pixel
 * @param scale   not used by the visible body
 * @param sharpen forwarded to weight_sse()
 * @param width   number of pixels per row
 * @param height  number of rows
 *
 * _mm_load_ps / _mm_stream_ps require 16-byte aligned addresses.
 *
 * NOTE(review): `tmp`, `filter`, `filter_coeff`, `mult`, `max_height_i`,
 * ASAN_ROW and weight_sse are not defined in this excerpt -- confirm they are
 * in scope.
 */
static void eaw_decompose_sse2(float *const out,
                               const float *const in,
                               float *const detail,
                               const int scale,
                               const float sharpen,
                               const size_t width,
                               const size_t height)
{
/* Convolve rows */
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
  for(size_t j = 0; j < height; j++)
  {
    for(size_t i = 0; i < width; i++)
    {
      // offset (in floats) of the current pixel: 4 floats per pixel
      const size_t inc = (j * width + i) * 4;
      float *pdetail = detail + inc;
      float *pcoarse = tmp + inc;

      // pixel to be convolved; the centre tap gets the plain filter coefficient
      const __m128 pin0 = _mm_load_ps(in + inc);
      const __m128 w_0 = _mm_set1_ps(filter[2]);

      // neighbours
      const __m128 pin1 = _mm_load_ps(in + ASAN_ROW(i, j, -2, mult, max_height_i, width));
      const __m128 pin2 = _mm_load_ps(in + ASAN_ROW(i, j, -1, mult, max_height_i, width));
      const __m128 pin3 = _mm_load_ps(in + ASAN_ROW(i, j, 1, mult, max_height_i, width));
      const __m128 pin4 = _mm_load_ps(in + ASAN_ROW(i, j, 2, mult, max_height_i, width));

      // Lab extraction - pixel to be convolved: broadcast one channel to all lanes
      const __m128 L_0 = _mm_shuffle_ps(pin0, pin0, _MM_SHUFFLE(0, 0, 0, 0));
      const __m128 a_0 = _mm_shuffle_ps(pin0, pin0, _MM_SHUFFLE(1, 1, 1, 1));
      const __m128 b_0 = _mm_shuffle_ps(pin0, pin0, _MM_SHUFFLE(2, 2, 2, 2));

      // Lab extraction - neighbours: the macro transposes its arguments in
      // place, so work on copies to keep pin1..pin4 intact for the final sum.
      // Afterwards lane k of L_f/a_f/b_f holds the channel of neighbour k+1.
      __m128 L_f = pin1;
      __m128 a_f = pin2;
      __m128 b_f = pin3;
      __m128 alpha_f = pin4;
      _MM_TRANSPOSE4_PS(L_f, a_f, b_f, alpha_f);
      (void)alpha_f; // alpha is not used by the filter

      // neighbours contribution
      // assumes filter_coeff holds the four neighbour coefficients in the
      // order of offsets -2, -1, +1, +2 and is 16-byte aligned -- TODO confirm
      const __m128 coeffs = _mm_load_ps(filter_coeff);
      const __m128 w_L = coeffs * weight_sse(L_0, L_f, sharpen);
      const __m128 w_c = coeffs * weight_sse(a_0 + b_0, a_f + b_f, sharpen);

      // Back to per-pixel layout: after the transpose
      // w_k = { w_L[k-1], w_c[k-1], w_c[k-1], 0 }, matching [L a b A].
      __m128 w_1 = w_L;
      __m128 w_2 = w_c;
      __m128 w_3 = w_c;
      __m128 w_4 = _mm_setzero_ps();
      _MM_TRANSPOSE4_PS(w_1, w_2, w_3, w_4);

      // Filter computation: the normaliser must contain every weight used in
      // the numerator, including the centre tap w_0 (this also keeps the
      // alpha lane away from a division by zero).
      // _mm_rcp_ps is an approximate reciprocal.
      const __m128 wgt = w_1 + w_2 + w_3 + w_4 + w_0;
      const __m128 sum = (w_1 * pin1 + w_2 * pin2 + w_3 * pin3 + w_4 * pin4 + w_0 * pin0) * _mm_rcp_ps(wgt);

      // High frequency layer
      _mm_stream_ps(pdetail, pin0 - sum);

      // Low frequency layer
      _mm_stream_ps(pcoarse, sum);
    }
  }
}

如何从基于通道的向量(像素向量 pin0 到 pin4)切换到基于邻居的向量(L_0、L_f 等),以及反过来如何从 w_L、w_c 切换回 w_1 到 w_4?第二个版本会更快吗?

0 个答案:

没有答案