void div_tl_128(unsigned char* data_mat, int b, int Matrix_Size)
{
int k = 0;
int count = Matrix_Size >> Bytes_Shift;
if(count == 0)
return;
__asm__ __volatile__(
"lsl r1, #4 \n"
"vld1.8 {d0-d1}, [r5] \n"
"vld1.8 {d2-d3}, [r3,+r1] \n"
"vld1.8 {d4-d5}, [r4,+r1] \n"
"l2: \n"
"vld1.8 {d6-d7}, [r0] \n"
"vld1.8 {d8-d9}, [r0] \n"
"vshr.u8 q3, #4 \n"
"vand.u8 q4, q0 \n"
"vtbl.u8 d6, {q1}, d6 \n"
"vtbl.u8 d7, {q1}, d7 \n"
"vtbl.u8 d8, {q2}, d8 \n"
"vtbl.u8 d9, {q2}, d9 \n"
"veor.u8 q3, q3, q4 \n"
"vst1.8 {d6-d7}, [r0] \n"
"adds r0, r0, #16 \n"
"subs r2, r2, #1 \n"
"bne l2 \n"
:
: "r"(data_mat), "r"(b), "r"(count), "r"(div_result_high2), "r"(div_result_low2), "r"(mask)
:
);
}
div_result_high2和low2表在其他头文件中声明。 mask是一个128位的数组,每个8位元素是0x0f
此代码应该执行查表分区,但它不起作用。我哪里错了?