我正在尝试用C和SSE2对图像做卷积,以实现小波分解。该图像有4个通道(Lab + alpha),在内存中连续交错存储:[LabA] [LabA] [LabA] ... Alpha通道与我在这里要做的事情无关。
访问像素非常简单:只需从一个每次递增4(个float)的指针处加载内容即可:
/**
 * Row pass of a 5-tap weighted convolution over an interleaved image that
 * stores 4 floats per pixel, splitting each pixel into a low-frequency value
 * and a high-frequency residual.
 *
 * For every pixel the centre value and its neighbours at offsets -2, -1, +1
 * and +2 (as resolved by ASAN_ROW) are blended. Each neighbour's filter
 * coefficient is modulated by weight_sse2(centre, neighbour, sharpen), and the
 * weighted sum is normalised by the total weight.
 *
 * @param out     not used in this pass
 * @param in      input image, 4 floats per pixel; read with aligned loads, so
 *                it must be 16-byte aligned
 * @param detail  receives (pixel - blurred pixel); written with non-temporal
 *                aligned stores, so it must be 16-byte aligned
 * @param scale   not used in this pass
 * @param sharpen forwarded unchanged to weight_sse2()
 * @param width   image width in pixels
 * @param height  image height in pixels
 *
 * The blurred result is written to `tmp`, which, like `filter`, `mult`,
 * `max_height_i`, ASAN_ROW and weight_sse2, is defined outside this block.
 * The arithmetic operators on __m128 rely on the GCC/Clang vector extensions.
 */
static void eaw_decompose_sse2(float *const out,
                               const float *const in,
                               float *const detail,
                               const int scale,
                               const float sharpen,
                               const size_t width,
                               const size_t height)
{
  /* Convolve rows */
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
  for(size_t j = 0; j < height; j++)
  {
    for(size_t i = 0; i < width; i++)
    {
      // offset (in floats) of the current pixel; 4 floats per pixel
      const size_t inc = (j * width + i) * 4;
      float *pdetail = detail + inc;
      float *pcoarse = tmp + inc;

      // pixel to be convolved; its weight is the bare centre coefficient
      const __m128 pin0 = _mm_load_ps(in + inc);
      const __m128 w_0 = _mm_set1_ps(filter[2]);

      // neighbours at offsets -2, -1, +1, +2
      const __m128 pin1 = _mm_load_ps(in + ASAN_ROW(i, j, -2, mult, max_height_i, width));
      const __m128 pin2 = _mm_load_ps(in + ASAN_ROW(i, j, -1, mult, max_height_i, width));
      const __m128 pin3 = _mm_load_ps(in + ASAN_ROW(i, j, 1, mult, max_height_i, width));
      const __m128 pin4 = _mm_load_ps(in + ASAN_ROW(i, j, 2, mult, max_height_i, width));

      // neighbours contribution: each weight must be derived from the same
      // neighbour it later multiplies (w_k <-> pin_k), otherwise the +1/+2
      // taps are weighted by the similarity of a different pixel
      const __m128 w_1 = _mm_set1_ps(filter[0]) * weight_sse2(pin0, pin1, sharpen);
      const __m128 w_2 = _mm_set1_ps(filter[1]) * weight_sse2(pin0, pin2, sharpen);
      const __m128 w_3 = _mm_set1_ps(filter[3]) * weight_sse2(pin0, pin3, sharpen);
      const __m128 w_4 = _mm_set1_ps(filter[4]) * weight_sse2(pin0, pin4, sharpen);

      // Filter computation: normalise by the total weight.
      // _mm_rcp_ps is an approximate reciprocal, so the result is not exact.
      const __m128 wgt = w_1 + w_2 + w_3 + w_4 + w_0;
      const __m128 sum = (w_1 * pin1 + w_2 * pin2 + w_3 * pin3 + w_4 * pin4 + w_0 * pin0) * _mm_rcp_ps(wgt);

      // High frequency layer
      _mm_stream_ps(pdetail, pin0 - sum);

      // Low frequency layer
      _mm_stream_ps(pcoarse, sum);
    }
  }
}
函数 ASAN_ROW 让指针沿行滑动,并确保我们不会越界;如果越界,则改用最近的邻居像素。weight_sse2 计算高斯权重;由于 L 通道与 a/b 通道的权重不同,它内部做了一些比较复杂的移位(shuffle)操作。
因此我想,与其对4个Lab SSE向量进行操作(每个向量的最后一个元素都被浪费),不如改为对3个SSE向量进行操作,这样应该会更快:每个向量对应一个Lab通道,向量中的每个元素对应一个相邻像素。这样代码就变成了:
/*
 * Sketch of a neighbour-major variant of the row convolution: rather than one
 * __m128 per pixel (4 interleaved floats), it aims for one __m128 per channel
 * (L, a, b) whose four lanes hold the four neighbours.
 *
 * NOTE(review): this is pseudo-code. The lines marked `// ?` are the open
 * parts and do not compile as written; see the notes next to them.
 * `out` and `scale` are not used in the visible body. `tmp`, `filter`,
 * `filter_coeff`, `mult`, `max_height_i`, ASAN_ROW, weight_sse and
 * sum_of_elts_sse are not defined in this block.
 */
static void eaw_decompose_sse2(float *const out,
const float *const in,
float *const detail,
const int scale,
const float sharpen,
const size_t width,
const size_t height)
{
/* Convolve rows */
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for(size_t j = 0; j < height; j++)
{
for(size_t i = 0; i < width; i++)
{
// offset (in floats) of the current pixel; 4 floats per pixel
const size_t inc = (j * width + i) * 4;
float *pdetail = detail + inc;
float *pcoarse = tmp + inc;
// pixel to be convolved
const __m128 pin0 = _mm_load_ps(in + inc);
const __m128 w_0 = _mm_set1_ps(filter[2]);
// neighbours
const __m128 pin1 = _mm_load_ps(in + ASAN_ROW(i, j, -2, mult, max_height_i, width));
const __m128 pin2 = _mm_load_ps(in + ASAN_ROW(i, j, -1, mult, max_height_i, width));
const __m128 pin3 = _mm_load_ps(in + ASAN_ROW(i, j, 1, mult, max_height_i, width));
const __m128 pin4 = _mm_load_ps(in + ASAN_ROW(i, j, 2, mult, max_height_i, width));
// Lab extraction - pixel to be convolved
// (broadcast one channel of the centre pixel to all four lanes; indexing a
// __m128 with [] relies on the GCC/Clang vector extensions)
__m128 L_0 = _mm_set1_ps( pin0[0] ); // ?
__m128 a_0 = _mm_set1_ps( pin0[1] ); // ?
__m128 b_0 = _mm_set1_ps( pin0[2] ); // ?
// Lab extraction - neighbours
// NOTE(review): _mm_set_ps takes four separate float arguments, not a brace
// list, and it fills lanes from the last argument to the first - confirm the
// intended lane order. Gathering one channel from four pixel vectors is a
// 4x4 transpose; presumably _MM_TRANSPOSE4_PS on copies of pin1..pin4 would
// produce these rows - TODO verify.
__m128 L_f = _mm_set_ps ({ pin1[0], pin2[0], pin3[0], pin4[0] }); // ?
__m128 a_f = _mm_set_ps ({ pin1[1], pin2[1], pin3[1], pin4[1] }); // ?
__m128 b_f = _mm_set_ps ({ pin1[2], pin2[2], pin3[2], pin4[2] }); // ?
// neighbours contribution
// NOTE(review): this local `filter` shadows the `filter` indexed above for
// w_0. filter_coeff is assumed to hold the four neighbour coefficients in the
// same lane order as L_f/a_f/b_f - confirm.
const __m128 filter = _mm_load_ps(filter_coeff);
// one weight vector for L and one shared by a and b (w_c is used for both
// chroma lanes below)
const __m128 w_L = filter * weight_sse(L_0, L_f, sharpen);
const __m128 w_c = filter * weight_sse(a_0 + b_0, a_f + b_f, sharpen);
// Filter computation
// NOTE(review): wgt sums only the neighbour weights and leaves out the centre
// weight w_0, although `sum` below includes w_0 * pin0 - confirm this is
// intended. One lane of wgt is 0.0f, so _mm_rcp_ps(wgt) is not finite in
// that lane (presumably the alpha lane).
const __m128 wgt = _mm_set_ps( { sum_of_elts_sse(w_L),
sum_of_elts_sse(w_c),
sum_of_elts_sse(w_c),
0.0f } );
// Back from neighbour-major weights to one weight vector per pixel (the
// reverse transpose).
// NOTE(review): these are declared as w1..w4 but `sum` below uses w_1..w_4,
// which are not declared in this block.
const __m128 w1 = _mm_set_ps ({ w_L[0], w_c[0], w_c[0], 0.0f }); // ?
const __m128 w2 = _mm_set_ps ({ w_L[1], w_c[1], w_c[1], 0.0f }); // ?
const __m128 w3 = _mm_set_ps ({ w_L[2], w_c[2], w_c[2], 0.0f }); // ?
const __m128 w4 = _mm_set_ps ({ w_L[3], w_c[3], w_c[3], 0.0f }); // ?
const __m128 sum = (w_1 * pin1 + w_2 * pin2 + w_3 * pin3 + w_4 * pin4 + w_0 * pin0) * _mm_rcp_ps(wgt);
// High frequency layer
_mm_stream_ps(pdetail, pin0 - sum);
// Low frequency layer
_mm_stream_ps(pcoarse, sum);
}
}
}
如何从基于通道的向量(像素向量 pin0 到 pin4)切换到基于邻居的向量(L_0、L_f 等),以及如何反过来从 w_L、w_c 切换回 w_1 到 w_4?第二个版本会更快吗?