我想高效地把 RGB 图像转换为灰度图,于是从这里(here)找到了一个函数,它说明了如何把 RGBA 转换为灰度。现在我想对只有 RGB(没有 alpha 通道)的数据做同样的事情。我改动了一些地方,但结果似乎不对。我不知道原因,有人能看出我的错误吗?
/*
 * Attempt at converting packed 3-byte pixels to 8-bit grayscale with ARM NEON
 * inline assembly. Each output byte is meant to be
 * (28 * c0 + 151 * c1 + 77 * c2) >> 8; the three weights sum to 256, so the
 * shift by 8 normalises the result.
 *
 * dest      - output buffer, one byte written per pixel
 * src       - input buffer of interleaved pixel bytes
 * numPixels - pixel count; shifted right by 3 to form the loop counter
 *
 * The loop body is unrolled: three 32-byte vld4.8 loads (96 source bytes)
 * feed four 8-byte stores (32 grayscale bytes), and the counter is
 * decremented once per store.
 *
 * NOTE(review): vld4.8 de-interleaves with a stride of 4 bytes, so with
 * 3-byte pixels none of d0-d3 / d8-d11 holds a single colour channel; the
 * weighted sums therefore mix channels of different pixels.
 * NOTE(review): only the last subs sets the flags seen by bne, so a counter
 * that is not a multiple of 4 steps past zero without terminating the loop.
 * NOTE(review): %0, %1 and %2 are modified (post-increment / subs) although
 * they are declared as input operands, and the NEON registers, condition
 * flags and memory are not listed as clobbers.
 */
void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
__asm__ volatile(
// Loop counter = numPixels / 8 (each store emits 8 grayscale bytes).
"lsr %2, %2, #3 \n"
"# build the three constants: \n"
"mov r4, #28 \n" // Blue channel multiplier
"mov r5, #151 \n" // Green channel multiplier
"mov r6, #77 \n" // Red channel multiplier
// Broadcast each weight into all 8 byte lanes of a D register.
"vdup.8 d4, r4 \n"
"vdup.8 d5, r5 \n"
"vdup.8 d6, r6 \n"
"0: \n"
"# load 8 pixels: \n" // Author's note: byte pattern R G B R (first 32 bytes)
"vld4.8 {d0-d3}, [%1]! \n"
"# do the weight average: \n"
// Widening multiply-accumulate: q7 = d0*d4 + d1*d5 + d2*d6 (16-bit lanes).
"vmull.u8 q7, d0, d4 \n"
"vmlal.u8 q7, d1, d5 \n"
"vmlal.u8 q7, d2, d6 \n"
"# shift and store: \n"
"vshrn.u16 d7, q7, #8 \n" // Divide q7 by 256 and narrow the result into d7
"vst1.8 {d7}, [%0]! \n"
"subs %2, %2, #1 \n" // Decrement iteration count
"# load 8 pixels: \n"
"vld4.8 {d8-d11}, [%1]! \n" // Author's note: byte pattern G B R G (next 32 bytes)
"# do the weight average: \n"
// Second group: d3 left over from the first load plus d8, d9 of this load.
"vmull.u8 q7, d3, d4 \n"
"vmlal.u8 q7, d8, d5 \n"
"vmlal.u8 q7, d9, d6 \n"
"# shift and store: \n"
"vshrn.u16 d7, q7, #8 \n" // Divide q7 by 256 and narrow the result into d7
"vst1.8 {d7}, [%0]! \n"
"subs %2, %2, #1 \n" // Decrement iteration count
"# load 8 pixels: \n"
"vld4.8 {d0-d3}, [%1]! \n"
"# do the weight average: \n"
// Third group: d10, d11 left over from the second load plus d0 of this load.
"vmull.u8 q7, d10, d4 \n"
"vmlal.u8 q7, d11, d5 \n"
"vmlal.u8 q7, d0, d6 \n"
"# shift and store: \n"
"vshrn.u16 d7, q7, #8 \n" // Divide q7 by 256 and narrow the result into d7
"vst1.8 {d7}, [%0]! \n"
"subs %2, %2, #1 \n" // Decrement iteration count
"# do the weight average: \n"
// Fourth group: the remaining d1-d3 of the third load (no new load needed).
"vmull.u8 q7, d1, d4 \n"
"vmlal.u8 q7, d2, d5 \n"
"vmlal.u8 q7, d3, d6 \n"
"# shift and store: \n"
"vshrn.u16 d7, q7, #8 \n" // Divide q7 by 256 and narrow the result into d7
"vst1.8 {d7}, [%0]! \n"
"subs %2, %2, #1 \n" // Decrement iteration count
"bne 0b \n" // Repeat until the iteration count reaches zero
:
: "r"(dest), "r"(src), "r"(numPixels)
: "r4", "r5", "r6"
);
}
答案 0 :(得分:2)
您应该使用"vld3.8 {d0-d2}, [%1]! \n"
答案 1 :(得分:1)
您每次加载的是四个值(RGBA),而不是三个(RGB)。
您的图片中的数据排列是 RGB RGB RGB……,但您在连续的几步中加载到的却是 RGBR、GBRG、BRGB……,依此类推。
"vld4.8 {d0-d3}, [%1]! \n"
相反,你应该
"vld3.8 {d0-d2}, [%1]! \n"
请注意,我不确定我写的这条汇编指令在语法上是否完全正确,但错误就出在这里。
将像素移回内存时也要检查同样的错误
答案 2 :(得分:1)
瓦西里是对的。使用VLD3加载24位像素。
另外,你的代码里有 4 条 VSTx,却只有 3 条 VLDx,这样的代码实在有些奇怪……
您不必重复代码。具体原因解释起来比较复杂,但在 NEON 上把同一段代码重复 4 次并不会带来任何好处。
/**
 * Convert packed 24-bit pixels (3 bytes per pixel) to 8-bit grayscale.
 *
 * Each output byte is (28 * p[0] + 151 * p[1] + 77 * p[2]) >> 8, where p
 * points at the three bytes of one pixel. The weights sum to 256, so the
 * shift by 8 normalises the weighted sum back into the range 0..255.
 *
 * On 32-bit ARM with NEON, groups of 8 pixels are processed with inline
 * assembly; any remaining pixels (and every pixel on other targets) are
 * handled by an equivalent scalar loop.
 *
 * NOTE(review): the first byte of every pixel receives the "blue" weight,
 * i.e. the data is treated as B,G,R in memory. If the buffer is really laid
 * out R,G,B, swap the 28 and 77 weights -- confirm against the caller.
 *
 * @param dest      output buffer, at least numPixels bytes
 * @param src       input buffer, at least 3 * numPixels bytes
 * @param numPixels number of pixels to convert; values <= 0 do nothing
 */
void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
    if (numPixels <= 0) {
        return;
    }

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    /* One loop iteration consumes 8 pixels (24 source bytes), so the loop
     * counter must be numPixels / 8, not numPixels. */
    int numBlocks = numPixels / 8;

    if (numBlocks > 0) {
        /* Pixels left over for the scalar loop below. */
        numPixels -= numBlocks * 8;

        __asm__ volatile(
            /* Build the three per-channel weights in every byte lane. */
            "mov       r4, #28            \n" /* Blue channel multiplier  */
            "mov       r5, #151           \n" /* Green channel multiplier */
            "mov       r6, #77            \n" /* Red channel multiplier   */
            "vdup.8    d4, r4             \n"
            "vdup.8    d5, r5             \n"
            "vdup.8    d6, r6             \n"
            "0:                           \n"
            /* Load 8 pixels, de-interleaving the 3 channels into d0-d2. */
            "vld3.8    {d0-d2}, [%1]!     \n"
            /* Weighted sum in 16-bit lanes: q7 = d0*d4 + d1*d5 + d2*d6. */
            "vmull.u8  q7, d0, d4         \n"
            "vmlal.u8  q7, d1, d5         \n"
            "vmlal.u8  q7, d2, d6         \n"
            /* Divide q7 by 256, narrow to 8 bits and store 8 gray bytes. */
            "vshrn.u16 d7, q7, #8         \n"
            "vst1.8    {d7}, [%0]!        \n"
            "subs      %2, %2, #1         \n" /* Decrement block count */
            "bne       0b                 \n" /* Loop until it hits zero */
            /* The pointers and the counter are modified by the asm, so they
             * must be read-write operands rather than plain inputs. */
            : "+r"(dest), "+r"(src), "+r"(numBlocks)
            :
            /* q7 aliases d14/d15, which are callee-saved; listing every
             * register written lets the compiler save/restore them. */
            : "r4", "r5", "r6",
              "d0", "d1", "d2", "d4", "d5", "d6", "d7", "q7",
              "cc", "memory"
        );
    }
#endif

    /* Scalar path: tail pixels after the NEON loop, or all pixels when the
     * NEON path is not compiled in. Same arithmetic as the vector code. */
    for (; numPixels > 0; numPixels--) {
        *dest++ = (uint8_t)((28u * src[0] + 151u * src[1] + 77u * src[2]) >> 8);
        src += 3;
    }
}
应该有效。