我想将双精度值的矢量转换为char。 我必须制作两种不同的方法,一种用于SSE2,另一种用于AVX2。
我从AVX2开始。
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<div class="Content">
Version=2
Format=jpg
Size=1280x960
Date=2013/05/08
Time=23:49:40
Value=250000
</div>
<br>
<div id="result">test</div>
在此算法结束时,输出包含:
__m128i sub_proc(__m256d& in)
{
__m256d _zero_pd = _mm256_setzero_pd();
__m256d ih_pd = _mm256_unpackhi_pd(in,_zero_pd);
__m256d il_pd = _mm256_unpacklo_pd(in,_zero_pd);
__m128i ih_si = _mm256_cvtpd_epi32(ih_pd);
__m128i il_si = _mm256_cvtpd_epi32(il_pd);
ih_si = _mm_shuffle_epi32(ih_si,_MM_SHUFFLE(3,1,2,0));
il_si = _mm_shuffle_epi32(il_si,_MM_SHUFFLE(3,1,2,0));
ih_si = _mm_packs_epi32(_mm_unpacklo_epi32(il_si,ih_si),_mm_unpackhi_epi32(il_si,ih_si));
return ih_si;
}
// Narrows two vectors of four doubles each into one vector of signed 8-bit
// values.
//
// Each input is first reduced to 16-bit lanes by sub_proc(); the two results
// are then packed to 8-bit with signed saturation, in1 occupying the low
// eight bytes of the result and in2 the high eight bytes.
__m128i proc(__m256d& in1,__m256d& in2)
{
    // The unused zero vector from the earlier version was dropped: it called
    // a non-existent intrinsic and prevented the function from compiling.
    __m128i in1_si = sub_proc(in1);
    __m128i in2_si = sub_proc(in2);
    return _mm_packs_epi16(in1_si,in2_si);
}
// Demo driver: converts 32 doubles to chars eight at a time with proc() and
// prints each group of eight converted bytes on its own line.
int main()
{
    double input[32] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32};
    char output[32] = {0};
    for (int i = 0; i < 32; i += 8)
    {
        // Advance the loads with i so every pass reads the next eight doubles.
        __m256d in1 = _mm256_loadu_pd(input + i);
        __m256d in2 = _mm256_loadu_pd(input + i + 4);
        __m128i tmp = proc(in1, in2);

        // Each pass owns eight output bytes, so write only the low 64 bits of
        // the result. A full 128-bit store would spill into the next group
        // and, on the final pass, run past the end of output.
        char* dst = output + i;
        _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), tmp);

        // Print the eight bytes just written as integers.
        std::copy(dst, dst + 8, std::ostream_iterator<int>(std::cout, " "));
        std::cout << std::endl;
    }
}
运行上述代码后,output 中的内容为:
1,2,3,4,0,0,0,0,9,10,11,12,0,0,0,0,17,18,19,20,0,0,0,0,25,26,27,28,0,0,0,0
我的第一次调查显示,如果在函数 proc 中把:
return _mm_packs_epi16(in1_si,in2_si);
改为:
return _mm_packs_epi16(in2_si,in1_si);
那么输出包含:
5,6,7,8,0,0,0,0,13,14,15,16,0,0,0,0,21,22,23,24,0,0,0,0,29,30,31,31,0,0,0,0
我还没有弄清楚如何调整结果中低半部分和高半部分的位置。
使用SIMD将双精度数转换为char是否有更好(更快,更有效)的方法?
答案 0 :(得分:5)
如果您想使用 AVX/SSE 在每次迭代中将 16 个 double 转换为 16 个 char,下面是一段可用的代码:
#include <iostream>
#include <immintrin.h>
__m128i proc(const __m256d in0, const __m256d in1, const __m256d in2, const __m256d in3)
{
__m128i v0 = _mm256_cvtpd_epi32(in0);
__m128i v1 = _mm256_cvtpd_epi32(in1);
__m128i v2 = _mm256_cvtpd_epi32(in2);
__m128i v3 = _mm256_cvtpd_epi32(in3);
__m128i v01 = _mm_packs_epi32(v0, v1);
__m128i v23 = _mm_packs_epi32(v2, v3);
return _mm_packs_epi16(v01, v23);
}
// Demo driver: converts 32 doubles to chars in two passes of 16 elements and
// prints the converted values as space-separated integers on one line.
int main()
{
    double input[32] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32};
    char output[32] = {0};

    for (int base = 0; base < 32; base += 16)
    {
        // Four unaligned loads of four doubles each feed one call to proc().
        const double* src = input + base;
        const __m128i packed = proc(_mm256_loadu_pd(src),
                                    _mm256_loadu_pd(src + 4),
                                    _mm256_loadu_pd(src + 8),
                                    _mm256_loadu_pd(src + 12));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + base), packed);
    }

    // Print as integers so the bytes are not interpreted as characters.
    for (const char* p = output; p != output + 32; ++p)
    {
        std::cout << static_cast<int>(*p) << " ";
    }
    std::cout << std::endl;
    return 0;
}
编译并运行:
$ g++ -Wall -mavx double_to_char.cpp && ./a.out
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
请注意,上面的代码只需要AVX(不需要AVX2)。