我正在尝试用 ARM NEON SIMD 指令优化我的算法。
标量代码:
for (int y = 1; y < (height - 1); y++) {
int height_offs = y * width;
for (int x = 1; x < (width - 1); x++) {
dst[height_offs + x] = (
src[(height_offs - width) + (x + 1)] -
src[(height_offs - width) + (x - 1)] +
// current row
src[(height_offs) + (x + 1)] -
src[(height_offs) + (x - 1)] +
// next row
src[(height_offs + width) + (x + 1)] -
src[(height_offs + width) + (x - 1)]
);
}
}
但我发现宽度和高度参数不是常数,这些值会随着外层迭代而变化。例如,宽度 = 7/8/9/10/11/12/13("/"表示"或")。所以我想取宽度除以 4 的余数(width % 4),并为余下的元素单独计算 dst[height_offs + x]。
例如,如果宽度为 7,则宽度方向的循环下标为 0,1,2,3,4,5,6。因此,我可以先用一个向量处理 0,1,2,3,剩下的 4,5,6 再用一个向量处理,其中下标 7 用下标 6 的副本填充。但是对于余数为 3、2、1 的每一种情况,我都必须编写单独的实现,这会影响性能,也就失去了 SIMD 的优势。因此我想请教下面的问题。
注意:我正在优化 Lucas-Kanade 算法,必须在图像金字塔的每一层计算导数(梯度)。因此,图像的高度和宽度会逐层变化,我无法避免宽度和高度的取值发生变化。
向量化代码:(目前无法正常工作,内部还需要做一些修改)
uint32x4_t src_vector32x4__prev0,src_vector32x4__prev1,src_vector32x4__prev2;
uint32x4_t src_vector32x4__curr0,src_vector32x4__curr1,src_vector32x4__curr2;
uint32x4_t src_vector32x4__next0,src_vector32x4__next1,src_vector32x4__next2;
uint32x4_t s1,s2,s3,add_32x4u;
int32x4_t dst_vector32x4;
uint32_t Idx ;
/* Load destination data into the destination vector register. */
dst_vector32x4 = vld1q_s32(dst);
for (int32_t y = 1; y < (src_height - 1); ++y) {
int32_t height_offs = y * src_width;
for (int32_t x = 1; x < (src_width - 1); x+=4) {
Idx = height_offs + x ;
src_vector32x4__prev0 = vld1q_u32(&src[(height_offs - src_width) + (x - 1)]); // 00 01 02 03
src_vector32x4__prev2 = vld1q_u32(&src[(height_offs - src_width) + (x + 1)]); // 02 03 04 05
src_vector32x4__curr0 = vld1q_u32(&src[(height_offs) + (x - 1)]); // 10 11 12 13
src_vector32x4__curr2 = vld1q_u32(&src[(height_offs) + (x + 1)]); // 12 13 14 15
src_vector32x4__next0 = vld1q_u32(&src[(height_offs + src_width) + (x - 1)]); // 20 21 22 23
src_vector32x4__next2 = vld1q_u32(&src[(height_offs + src_width) + (x + 1)]); // 22 23 24 25
//src_vector32x4__prev1 = vld1q_u32(&src[(height_offs - src_width) + (x)]); // 01 02 03 04
//src_vector32x4__curr1 = vld1q_u32(&src[(height_offs) + (x)]); // 11 12 13 14
//src_vector32x4__next1 = vld1q_u32(&src[(height_offs + src_width) + (x)]); // 21 22 23 24
s1 = vsubq_u32(src_vector32x4__prev2,src_vector32x4__prev0);
s2 = vsubq_u32(src_vector32x4__curr2,src_vector32x4__curr0);
s3 = vsubq_u32(src_vector32x4__next2,src_vector32x4__next0);
add_32x4u = vaddq_u32(s1,vaddq_u32(s2,s3));
dst_vector32x4 = vreinterpretq_s32_u32(add_32x4u);
/*Store a destination vector into destination buffer*/
vst1q_s32(&dst[Idx],dst_vector32x4);
}
}