我想使用ARM Neon将8位灰度图像从1280x960调整到320x240。
作为示例,我已经实现了从640x480到320x240的2倍缩小:
/* Forward declaration: resizeline2 is defined after its first use. */
void resizeline2(uint8_t * __restrict src1, uint8_t * __restrict src2, uint8_t * __restrict dest);

/*
 * Halve an 8-bit image in both dimensions (640 x 480 -> 320 x 240).
 *
 * src  : source image, 640 bytes per row, 480 rows.
 * dest : destination image, 320 bytes per row, 240 rows.
 *
 * Each output row h is produced from source rows 2*h and 2*h + 1.
 */
void divideimageby2(uint8_t * src, uint8_t * dest) {
    int h;
    for (h = 0; h < 240; h++) {
        /* Write through the dest parameter ("dt" was never declared). */
        resizeline2(src + 640 * (h * 2 + 0), src + 640 * (h * 2 + 1), dest + 320 * h);
    }
}
/*
 * Reduce two adjacent 640-pixel rows to one 320-pixel row.
 *
 * src1, src2 : the two source rows, 640 bytes each.
 * dest       : output row, 320 bytes.
 *
 * Every output byte is the sum of a 2 x 2 block of source pixels shifted
 * right by 2 (truncating). The sum is at most 4 * 255, so it fits in the
 * 16-bit lanes used for the intermediate result.
 */
void resizeline2(uint8_t * __restrict src1, uint8_t * __restrict src2, uint8_t * __restrict dest) {
    int col;
    for (col = 0; col < 640; col += 16) {
        /* Widening pairwise add: 16 bytes -> 8 horizontal pair sums. */
        const uint16x8_t upper = vpaddlq_u8(vld1q_u8(src1 + col));
        const uint16x8_t lower = vpaddlq_u8(vld1q_u8(src2 + col));
        /* Add the two rows, divide by 4 while narrowing, store 8 bytes. */
        vst1_u8(dest + col / 2, vshrn_n_u16(vaddq_u16(upper, lower), 2));
    }
}
如果我想做类似的事情,在resizeline4中可以使用哪些NEON指令来聚合4行?
/* Forward declaration: resizeline4 is defined after its first use. */
void resizeline4(uint8_t * __restrict src1, uint8_t * __restrict src2, uint8_t * __restrict src3, uint8_t * __restrict src4, uint8_t * __restrict dest);

/*
 * Shrink an 8-bit image by 4 in both dimensions (1280 x 960 -> 320 x 240).
 *
 * src  : source image, 1280 bytes per row, 960 rows.
 * dest : destination image, 320 bytes per row, 240 rows.
 *
 * Each output row h is produced from source rows 4*h .. 4*h + 3.
 */
void divideimageby4(uint8_t * src, uint8_t * dest) {
    int h;
    for (h = 0; h < 240; h++) {
        /* The source row stride is 1280 bytes (not 640 as in the 2x case). */
        resizeline4(src + 1280 * (h * 4 + 0), src + 1280 * (h * 4 + 1),
                    src + 1280 * (h * 4 + 2), src + 1280 * (h * 4 + 3),
                    dest + 320 * h);
    }
}
/*
 * Reduce four adjacent 1280-pixel rows to one 320-pixel row.
 *
 * src1..src4 : the four source rows, 1280 bytes each.
 * dest       : output row, 320 bytes.
 *
 * Every output byte is the rounded average of a 4 x 4 block of source
 * pixels. The block sum is at most 16 * 255 = 4080, so 16-bit lanes are
 * wide enough for all intermediate values.
 *
 * Each iteration consumes 32 pixels per row so that a full 8-byte vector
 * of results can be stored at once.
 */
void resizeline4(uint8_t * __restrict src1, uint8_t * __restrict src2, uint8_t * __restrict src3, uint8_t * __restrict src4, uint8_t * __restrict dest) {
    int w;
    for (w = 0; w < 1280; w += 32) {
        /* Widening pairwise add of row 1: 2 x 16 bytes -> 2 x 8 pair sums. */
        uint16x8_t lo = vpaddlq_u8(vld1q_u8(src1));
        uint16x8_t hi = vpaddlq_u8(vld1q_u8(src1 + 16));

        /* Accumulate the pair sums of rows 2..4 into the same lanes. */
        lo = vpadalq_u8(lo, vld1q_u8(src2));
        hi = vpadalq_u8(hi, vld1q_u8(src2 + 16));
        lo = vpadalq_u8(lo, vld1q_u8(src3));
        hi = vpadalq_u8(hi, vld1q_u8(src3 + 16));
        lo = vpadalq_u8(lo, vld1q_u8(src4));
        hi = vpadalq_u8(hi, vld1q_u8(src4 + 16));

        /* Each lane now holds a 2-wide x 4-high sum; add adjacent lanes
         * to obtain the 4 x 4 block sums (4 per half). */
        uint16x4_t lo4 = vpadd_u16(vget_low_u16(lo), vget_high_u16(lo));
        uint16x4_t hi4 = vpadd_u16(vget_low_u16(hi), vget_high_u16(hi));

        /* Divide by 16 with rounding while narrowing to 8 bits. */
        vst1_u8(dest, vrshrn_n_u16(vcombine_u16(lo4, hi4), 4));

        src1 += 32;
        src2 += 32;
        src3 += 32;
        src4 += 32;
        dest += 8;
    }
}
答案 0 :(得分:2)
您应该将vpaddl与vpadal结合使用。
将32 * 4的像素矩阵加载到q寄存器line1a、line1b……line4b中,然后执行:
vpaddl.u8 line1a,line1a
vpaddl.u8 line1b,line1b
@ Accumulate the pairwise sums of rows 2, 3 and 4 into the row-1 sums.
@ All four rows must be added: the final shift by 4 divides by 16.
vpadal.u8 line1a,line2a
vpadal.u8 line1b,line2b
vpadal.u8 line1a,line3a
vpadal.u8 line1b,line3b
vpadal.u8 line1a,line4a
vpadal.u8 line1b,line4b
@ Add adjacent 16-bit lanes to get the 4 x 4 block sums.
vpadd.u16 d0,line1alow,line1ahigh
vpadd.u16 d1,line1blow,line1bhigh
@ Rounding shift right by 4 (divide by 16), narrowing q0 to 8 bytes in d0.
vrshrn.u16 d0,q0,#4
@ Store 8 output pixels and post-increment the destination pointer.
vst1.8 {d0},[pDst]!