我有一个 uint32x4_t 类型的 ARM NEON 向量寄存器。我想用 vtbx2 和 vext 对其中的 4 个 uint32_t 进行“洗牌”(shuffle)。
但表查找内在函数(intrinsics)的接口期望的参数类型是 uint8x8_t。
这似乎可以通过类型转换(cast)来实现,特别是因为文档中说明了:
“[...]转换不会更改向量表示的位模式。”
我尝试使用以下代码:
#include <iostream>
#include <arm_neon.h>
#include <bitset>
int main() {
uint32_t* data = new uint32_t[4];
uint32_t* result = new uint32_t[4];
//00 00 0A 0A
data[0] = 2570;
//00 0A 00 0A
data[1] = 655370;
//0A 0A 0A 0A
data[2] = 168430090;
//00 00 00 0A
data[3] = 10;
//load data
uint32x4_t dataVec = vld1q_u32(data);
//cast to uint8
uint8x16_t dataVecByteTmp = vreinterpretq_u8_u32(dataVec);
uint32_t* tmpData = new uint32_t[4];
//store original data
vst1q_u32(tmpData, dataVec);
std::cout << "Orig Data:" << std::endl;
for(int i = 0; i < 4; ++i) {
std::bitset<32> f(tmpData[i]);
std::cout << f << std::endl;
}
uint8_t* high = new uint8_t[16];
//store uint8 data
vst1q_u8(high, dataVecByteTmp);
std::cout << "unsigned output" << std::endl;
for(int i = 0; i < 16; ++i) {
std::cout << (unsigned)high[i] << std::endl;
}
std::cout << "bitwise output" << std::endl;
for(int i = 0; i < 16; ++i) {
std::bitset<8> b(high[i]);
std::cout << b << std::endl;
}
delete[] tmpData;
delete[] high;
delete[] data;
delete[] result;
return 0;
}
可以用以下命令编译:
g++ -march=native -mfpu=neon -std=c++14 main.cpp
输出如下所示:
Orig Data:
00000000000000000000101000001010
00000000000010100000000000001010
00001010000010100000101000001010
00000000000000000000000000001010
unsigned output
10
10
0
0
10
0
10
0
10
10
10
10
10
0
0
0
bitwise output
00001010
00001010
00000000
00000000
00001010
00000000
00001010
00000000
00001010
00001010
00001010
00001010
00001010
00000000
00000000
00000000
为了便于查看,我稍微调整了一下输出的格式:
Orig (uint32_t):
00000000 00000000 00001010 00001010
00000000 00001010 00000000 00001010
00001010 00001010 00001010 00001010
00000000 00000000 00000000 00001010
New (uint8_t):
10 10 0 0
10 0 10 0
10 10 10 10
10 0 0 0
New (uint8_t bitwise):
00001010 00001010 00000000 00000000
00001010 00000000 00001010 00000000
00001010 00001010 00001010 00001010
00001010 00000000 00000000 00000000
可以看到,结果与预期不符。有谁知道是我哪里做错了,还是说这只是一个 bug?
此致