在llvm ir中将bool的向量转换为整数

时间:2016-07-13 07:52:42

标签: llvm vectorization llvm-ir llvm-c++-api

我正在编写一段涉及向量操作的 LLVM IR 代码。我用 'icmp' 指令对两个整数向量做了比较,得到一个布尔向量,比如 <8 x i1>。我的问题是:我想把这 8 位转换成对应的整数值,而不必遍历向量(即逐个从向量中提取元素)。我试过 'bitcast <8 x i1> to i8',但它似乎只把向量的第一位转换成了 i8;如果我理解错了,请纠正我。有人能建议一种实现方法吗?

; Reproducer from the question: fills two stack arrays, compares them lane by
; lane as <8 x i32> vectors, and returns the <8 x i1> result bitcast to an i8.
define i8 @main() #0 {
   entry:
     %A = alloca [8 x i32], align 16
     %B = alloca [8 x i32], align 16
     ; A[0..3] = 90, 91, 92, 93. Elements 4..7 of %A are never stored in this
     ; function.
     %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 0
     store i32 90, i32* %arrayidx, align 4
     %arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 1
     store i32 91, i32* %arrayidx1, align 4
     %arrayidx2 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 2
     store i32 92, i32* %arrayidx2, align 8
     %arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 3
     store i32 93, i32* %arrayidx3, align 4

     ; B[0..3] = 90, 1, 92, 93 (differs from A only at index 1). Elements 4..7
     ; of %B are likewise never stored.
     %arrayidx4 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 0
     store i32 90, i32* %arrayidx4, align 4
     %arrayidx5 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 1
     store i32 1, i32* %arrayidx5, align 4
     %arrayidx6 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 2
     store i32 92, i32* %arrayidx6, align 8
     %arrayidx7 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 3
     store i32 93, i32* %arrayidx7, align 4
     br label %vector.body
  vector.body:

     ; Reinterpret each array pointer as a pointer to a single 8-lane vector.
     %0 = bitcast [8 x i32]* %A to <8 x i32>*
     %1 = bitcast [8 x i32]* %B to <8 x i32>*

     ; These loads read all eight lanes, including lanes 4..7 that were never
     ; stored above.
     ; NOTE(review): the LLVM LangRef treats a load of uninitialized memory as
     ; undef, so the upper four mask bits are presumably unspecified - confirm.
     %2 = load <8 x i32>, <8 x i32>* %0
     %3 = load <8 x i32>, <8 x i32>* %1

     ; Lane-wise equality; of the stored lanes, 0, 2 and 3 compare equal.
     %4 = icmp eq <8 x i32> %2, %3

     ; Packs the eight i1 lanes into one byte.
     ; NOTE(review): which lane lands in which bit follows the target's
     ; endianness per the LangRef bitcast rules - verify before relying on a
     ; specific numeric value such as 11.
     %5 = bitcast <8 x i1> %4 to i8

     ret i8 %5;

}

不带任何参数地使用 'lli' 运行这段代码。预期输出为 11,但实际得到的是 1 或 0。非常感谢。

2 个答案:

答案 0 :(得分:0)

据我所知,如果不调用平台特定的内建函数(intrinsic),就无法做到这一点。我之所以注意到这一点,是因为我没能用 C++ 写出与目标平台无关的代码。

例如,下面的代码:

// GCC/Clang vector extension: a 32-byte vector of int
// (8 lanes, assuming a 4-byte int).
typedef int v8i __attribute__((vector_size(32)));

// Demonstrates the naive, target-independent attempt: compare two vectors and
// read the result back through a char pointer.
int main() {
   v8i a = { 1, 2, 3, 4, 5, 6, 7, 8};
   v8i b = { 0, 2, 3, 4, 5, 6, 7, 0};
   // Lane-wise comparison; the result is itself a v8i, one int per lane
   // (per the GCC vector extension docs: -1 where equal, 0 where not).
   v8i cmp = (a == b);
   // Reads only the first byte of cmp's storage, not one bit per lane.
   char res = *(char*)&cmp;
   // NOTE(review): printf is used without a visible #include <stdio.h> in this
   // snippet.
   printf("%d\n", res);
   return 0;
}

生成llvm-IR,它与你所写的(与适当的bitcast)非常接近。

不幸的是,它并没有按预期工作。

那是因为 <8 x i1> 在处理器上并不存在。例如,在 x86 AVX2 中,_mm256_cmpeq_epi32 产生的是一个 __m256i。 将其强制转换为 char 只会取到该寄存器的前 8 位。

我编写了 Intel AVX2 专用的代码,并找到了相应的指令(参见 Intel Intrinsics Guide)。

所以这段代码可以满足您的需求:

#include <cstdio>
#include <cstdlib>
#include <immintrin.h>

// Compares two 8-lane int vectors with AVX2, packs the per-lane equality
// result into the low 8 bits of an int, and prints the mask followed by a
// per-lane breakdown.
int main() {
   // _mm256_set_epi32 takes its arguments from the highest lane down to the
   // lowest, so lane 0 of `a` is 1 and lane 7 is 8.
   __m256i a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
   __m256i b = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 0);
   // Each 32-bit lane becomes all-ones where a == b and zero elsewhere.
   __m256i eq = _mm256_cmpeq_epi32(a, b);
   // movemask_ps gathers the sign bit of every 32-bit lane: bit i of `res`
   // corresponds to lane i.
   int res = _mm256_movemask_ps(_mm256_castsi256_ps(eq));

   // Spill the lanes to memory so they can be indexed with a runtime value;
   // _mm256_extract_epi32 requires a compile-time constant index.
   int a_lanes[8];
   int b_lanes[8];
   _mm256_storeu_si256((__m256i *)a_lanes, a);
   _mm256_storeu_si256((__m256i *)b_lanes, b);

   printf("res = %d\n", res);
   for(int i = 0; i < 8; ++i) {
       // Test bit i (not bit 7 - i) so the printed flag belongs to the lane
       // printed on the same line.
       printf("%d %d -> %d\n", a_lanes[i], b_lanes[i], (res >> i) & 1);
   }
   return 0;
}

就 .ll 代码而言,事实证明你需要一些额外的 bitcast(转换为 float 向量),以及对下面这个内建函数的调用:

  

@llvm.x86.avx.movmsk.ps.256

手动重写 LLVM IR 代码后得到:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"


@formatString = private constant [4 x i8] c"%d\0A\00"

; Compares two constant <8 x i32> vectors, collapses the comparison result to
; an i32 through the x86 AVX movmsk intrinsic, and prints it with printf.
define i32 @main() #0 {
   %a = alloca <8 x i32>, align 32
   %b = alloca <8 x i32>, align 32
   ; The two vectors hold equal values in lanes 1, 2, 4 and 6.
   store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %a, align 32
   store <8 x i32> <i32 0, i32 2, i32 3, i32 0, i32 5, i32 0, i32 7, i32 0>, <8 x i32>* %b, align 32
   %1 = load <8 x i32>, <8 x i32>* %a, align 32
   %2 = load <8 x i32>, <8 x i32>* %b, align 32
   ; Lane-wise equality, one i1 per lane.
   %3 = icmp eq <8 x i32> %1, %2
   ; Sign-extend each i1 to a full i32 lane (all-ones for true, zero for false).
   %4 = sext <8 x i1> %3 to <8 x i32>
   ; Reinterpret the bits as <8 x float>, the operand type the intrinsic takes.
   %5 = bitcast <8 x i32> %4 to <8 x float>
   ; NOTE(review): presumably maps to vmovmskps, gathering each lane's sign bit
   ; into bit i of the result (86 for the constants above) - confirm against
   ; the Intel documentation.
   %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %5)
   ; Print the mask using the "%d\n" format stored in @formatString.
   %6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @formatString, i32 0, i32 0), i32 %res)
   ret i32 0
}

; x86 AVX intrinsic called by @main: takes an <8 x float> and returns an i32.
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #1
; Variadic printf, called by @main with an i8* format string.
declare i32 @printf(i8*, ...) #2

attributes #0 = { norecurse uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-hle,-pku,-prfchw,-rdseed,-rtm,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-hle,-pku,-prfchw,-rdseed,-rtm,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }

由 llc 生成的汇编代码看起来相当理想:

vmovaps  .LCPI0_0(%rip), %ymm0   # ymm0 = [1,2,3,4,5,6,7,8]
   vmovaps  %ymm0, 32(%rsp)
   vmovdqa  .LCPI0_1(%rip), %ymm0   # ymm0 = [0,2,3,0,5,0,7,0]
   vmovdqa  %ymm0, (%rsp)
   vpcmpeqd 32(%rsp), %ymm0, %ymm0
   vmovmskps   %ymm0, %esi

答案 1 :(得分:0)

我发现这种方式有效。

; Workaround from the second answer: go through <1 x i8> instead of
; bitcasting the <8 x i1> mask straight to i8.
define i8 @main() #0 {
entry:
  ; Both constant operands are identical, so every lane compares equal.
  %0 = icmp eq <8 x i32> <i32 90,i32 91,i32 92,i32 93, i32 94,i32 95,i32 96,i32 97>, <i32 90,i32 91,i32 92,i32 93, i32 94,i32 95,i32 96,i32 97>
  ; Reinterpret the eight i1 lanes as a one-element vector of i8 ...
  %1 = bitcast <8 x i1> %0 to <1 x i8>
  ; ... then pull that single byte out as a scalar.
  %2 = extractelement <1 x i8> %1, i32 0
  ; The byte becomes the function's return value (the answer reads it back
  ; with "echo $?").
  ret i8 %2
}

这与我在问题中发布的代码类似,我用 "echo $?" 检查了结果,得到了预期的结果。