我试图对某些代码进行矢量化。 想法:我们有一个像素(__ m128 in),如果它的任何元素大于upper,用不同的像素替换 entier 像素(__ m128 upper_color)
无效的无效代码:
if(inp[0] >= upper || inp[1] >= upper || inp[2] >= upper)
{
outp[0] = upper_color[0];
outp[1] = upper_color[1];
outp[2] = upper_color[2];
}
到目前为止,我提出了以下内容,但(我相信如此)它不会取代整个像素,而只会取代那些比上面更大的组件:
const __m128 pixel = _mm_load_ps(in);
const __m128 isoe = _mm_cmpge_ps(pixel, upper);
__m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel),
_mm_and_ps(isoe, upper_color));
_mm_stream_ps(out, result);
假设upper = 1,1,1和upper_color = 1,0,0
第四个频道是alpha,所以我不关心它。
结果:
IN: 0.5 0.3 0.7
OUT: 0.5 0.3 0.7 (Expected)
OUT: 0.5 0.3 0.7 (Recieved)
IN: 1.5 1.1 0.7
OUT: 1 0 0 (Expected)
OUT: 1 0 0.7 (Recieved)
也许有人可以帮助我?这有可能吗?
答案 0 :(得分:3)
您需要计算水平OR。 SSE
中没有水平OR指令,但可以使用2x UNPACK
+垂直OR
模拟此类操作。
const __m128 pixel = _mm_load_ps(in);
/* (p3, p2, p1, p0 ) */
__m128 isoe = _mm_cmpge_ps(pixel, upper);
/* (p3|p1, p2|p0, p3|p1, p2|p0) */
isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
/* (p3|p2|p1|p0, p3|p2|p1|p0, p3|p2|p1|p0, p3|p2|p1|p0) */
isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
__m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel), _mm_and_ps(isoe, upper_color));
_mm_stream_ps(out, result);
答案 1 :(得分:1)
您可以使用_mm_movemask_epi8
进行横向OR。
#include <stdio.h>
#include <emmintrin.h>
void foo(float ina[]) {
//float ina[] = {0.5, 0.3, 0.7, 0};
float uppera[] = {1,1,1,1};
float upper_colora[] = {1,0,0,0};
float out[4];
__m128 in = _mm_load_ps(ina);
__m128 upper = _mm_load_ps(uppera);
__m128 upper_color = _mm_load_ps(upper_colora);
const __m128 pixel = _mm_load_ps(ina);
const __m128 isoe = _mm_cmpge_ps(pixel, upper);
if(_mm_movemask_epi8(_mm_castps_si128(isoe))) {
_mm_stream_ps(out, upper_color);
}
else {
_mm_stream_ps(out, in);
}
printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
}
int main() {
float ina1[] = {0.5, 0.3, 0.7, 0}; //output 0.5 0.3 0.7 0.0
float ina2[] = {0.5, 1.1, 0.7, 0}; //output 1.0 0.0 0.0 0.0
foo(ina1);
foo(ina2);
}