我用 __m128i 表示一个位域，需要一种快速的方法来检查某个特定位是否已被设置，以及设置某个特定位。我是否必须另外构造一个 __m128i 作为掩码，再对两者做按位或（OR）？还是存在我没有注意到的更快的指令？我使用的是英特尔编译器。
答案 0 :(得分:3)
你可以试试下面的做法。我认为没有更快的方法了。你可能需要把其中的一些常量值和查找表从代码的性能关键部分中提取出去（预先计算好）。
__m128i v; // todo: set v to something here (v is the 128-bit bit field)
// To check bit n (assumes 0 <= n <= 127 -- TODO confirm at the call site):
//   chkmask  - has bit (n & 0xF) set in every 16-bit lane, so byte (n >> 3)
//              of (chkmask & v) is non-zero exactly when bit n of v is set.
//   movemask - selects the bit for byte (n >> 3) in the per-byte result of
//              _mm_movemask_epi8.
//   isSet    - _mm_cmpeq_epi8 against zero flags the bytes that are zero; the
//              final XOR inverts the selected flag, so isSet equals movemask
//              (non-zero) when bit n is set and 0 otherwise.
int n; // todo: set n to the zero-indexed bit to check
__m128i chkmask = _mm_slli_epi16(_mm_set1_epi16(1), n & 0xF);
int movemask = (1 << (n >> 3));
int isSet = (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(chkmask, v), _mm_setzero_si128())) & movemask) ^ movemask;
// To set bit m (assumes 0 <= m <= 127 -- TODO confirm at the call site):
//   shuf starts as the identity byte permutation 0..15. Adding 16 - (m >> 3)
//   and masking with 0x0F turns entry i into (i - (m >> 3)) mod 16, so only
//   entry (m >> 3) selects source byte 0.
//   The shuffled source holds 1 << (m & 0x7) in its low byte, so the shuffle
//   moves that bit into byte (m >> 3); the resulting single-bit mask is then
//   ORed into v. Note: _mm_shuffle_epi8 is an SSSE3 intrinsic.
int m; // todo: set m to the zero-indexed bit to set
__m128i shuf = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
shuf = _mm_add_epi8(shuf, _mm_set1_epi8(16 - (m >> 3)));
shuf = _mm_and_si128(shuf, _mm_set1_epi8(0x0F));
__m128i setmask = _mm_shuffle_epi8(_mm_cvtsi32_si128(1 << (m & 0x7)), shuf);
v = _mm_or_si128(v, setmask);
// Alternative: look-up table approach for both checking and setting.
// lut holds 128 single-bit masks: entry i has only bit i of the 128-bit value
// set. _mm_set_epi32 takes its arguments from the most significant 32-bit
// element down to the least significant one, so the first 32 entries populate
// the last argument, the next 32 the third argument, and so on.
__declspec(align(16)) __m128i lut[] = {
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000001),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000002),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000004),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000008),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000010),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000020),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000040),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000080),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000100),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000200),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000800),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00001000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00002000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00004000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00008000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00010000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00020000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00040000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00080000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00100000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00200000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00400000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00800000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x01000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x02000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x04000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x08000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x10000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x20000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x40000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x80000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000001, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000002, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000004, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000008, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000010, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000020, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000040, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000080, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000100, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000200, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000400, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00000800, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00001000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00002000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00004000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00008000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00010000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00020000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00040000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00080000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00100000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00200000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00400000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x00800000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x01000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x02000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x04000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x08000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x10000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x20000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x40000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000000, 0x80000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000001, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000002, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000004, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000008, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000010, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000020, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000040, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000080, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000100, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000200, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000400, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00000800, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00001000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00002000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00004000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00008000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00010000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00020000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00040000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00080000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00100000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00200000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00400000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x00800000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x01000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x02000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x04000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x08000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x10000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x20000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x40000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000000, 0x80000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000001, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000002, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000004, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000010, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000020, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000040, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000080, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000100, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000200, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000400, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00000800, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00001000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00002000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00004000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00008000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00010000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00020000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00040000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00080000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00100000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00200000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00400000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x00800000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x01000000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x02000000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x04000000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x08000000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x10000000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x20000000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x40000000, 0x00000000, 0x00000000, 0x00000000),
_mm_set_epi32(0x80000000, 0x00000000, 0x00000000, 0x00000000)
};
// To check bit n with the look-up table: AND v with the single-bit mask for
// bit n, flag the bytes that are zero, and inspect the flag for byte (n >> 3).
// The table must be indexed by n here (the bit being checked) so that it
// agrees with movemask; m is the index of the bit to set, used further below.
// isSet equals movemask (non-zero) when bit n is set and 0 otherwise.
movemask = (1 << (n >> 3));
isSet = (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(v, _mm_load_si128(lut + n)), _mm_setzero_si128())) & movemask) ^ movemask;
// To set bit m with the look-up table: OR the single-bit mask for bit m into v.
v = _mm_or_si128(v, _mm_load_si128(lut + m));
答案 1 :(得分:1)
仅供参考，我想出了一种用于检查某一位的变体做法。如果掩码和一个常量寄存器可以预先计算好，那么这只需要三个内在函数（intrinsic）。
至于设置单个位，我认为没有高效的方法。关于如何从 movemask 的结果还原回 SSE 寄存器，可参见这个讨论：How to perform the inverse of _mm256_movemask_epi8 (VPMOVMSKB)?
#include <emmintrin.h>
#include <stdio.h>
/*
 * Demo: report whether any bit selected by `mask` is set in `x`.
 *
 * The bytes of (x & mask) are compared against zero; _mm_movemask_epi8 then
 * collects one flag per byte, so a result of 0xffff means every byte was zero
 * (no selected bit is set). Comparing against zero, rather than subtracting 1
 * and inspecting the sign bits, also gives the right answer when a byte of
 * (x & mask) is 0x81 or larger, i.e. for masks with several bits per byte.
 * With `mask` and `zero` precomputed, the check is still three intrinsics.
 *
 * Prints 1 if a selected bit is set, 0 otherwise.
 */
int main(void) {
    __m128i x = _mm_setr_epi32(0, 0, 0, 1);
    __m128i mask = _mm_setr_epi32(0, 0, 0, 1);
    __m128i zero = _mm_setzero_si128();
    int isSet = 0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(x, mask), zero));
    /* %X expects an unsigned int, so convert explicitly. */
    printf("%X\n", (unsigned)isSet);
    return 0;
}
编辑：实际上，在支持 SSE4.1 的情况下，可以使用 _mm_testz_si128 更快地完成检查。
#include <smmintrin.h>
#include <stdio.h>
/*
 * Demo (SSE4.1): report whether any bit selected by `mask` is set in `x`.
 *
 * _mm_testz_si128(a, b) returns 1 when (a & b) is all zeros, so it performs
 * the AND itself; no separate _mm_and_si128 or temporary is needed.
 *
 * Prints 1 if a selected bit is set, 0 otherwise.
 */
int main(void) {
    __m128i x = _mm_setr_epi32(0, 0, 0, 1);
    __m128i mask = _mm_setr_epi32(0, 0, 0, 1);
    int isSet = !_mm_testz_si128(x, mask);
    printf("%d\n", isSet);
    return 0;
}
答案 2 :(得分:0)
没有可以在 __m128i 中设置单个位的指令。
你可以尝试使用通用的 BTS 指令，但它很可能比构造一个掩码更慢，因为它只能写入内存（或 32 位寄存器，而后者在这里没有帮助）。