我正在编写一段涉及向量操作的 LLVM IR 代码。我用 'icmp' 指令对整数向量做了比较,得到一个布尔向量,例如 <8 x i1>。我的问题是:我想把这 8 个位转换为对应的整数值,而不必遍历向量(即不从向量中逐个提取元素)。我试过 'bitcast <8 x i1> to i8',但它似乎只把向量的第一位转换成了 i8;如果我理解有误,请纠正。有人能建议一种做法吗?
; Entry point: stores values into the first four i32 slots of two stack arrays
; A and B, loads each array as one <8 x i32> vector, compares the vectors lane
; by lane, and returns the <8 x i1> comparison result bitcast to an i8.
; NOTE(review): attribute group #0 is referenced but not defined in this snippet.
define i8 @main() #0 {
entry:
; Two 8-element i32 arrays on the stack.
%A = alloca [8 x i32], align 16
%B = alloca [8 x i32], align 16
; A[0..3] = 90, 91, 92, 93
%arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 0
store i32 90, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 1
store i32 91, i32* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 2
store i32 92, i32* %arrayidx2, align 8
%arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 3
store i32 93, i32* %arrayidx3, align 4
; B[0..3] = 90, 1, 92, 93 (only index 1 differs from A)
%arrayidx4 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 0
store i32 90, i32* %arrayidx4, align 4
%arrayidx5 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 1
store i32 1, i32* %arrayidx5, align 4
%arrayidx6 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 2
store i32 92, i32* %arrayidx6, align 8
%arrayidx7 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 3
store i32 93, i32* %arrayidx7, align 4
; NOTE(review): indices 4..7 of A and B are never stored to in this function,
; yet the vector loads below read all eight lanes.
br label %vector.body
vector.body:
; Reinterpret each array pointer as a pointer to a single <8 x i32> vector.
%0 = bitcast [8 x i32]* %A to <8 x i32>*
%1 = bitcast [8 x i32]* %B to <8 x i32>*
; NOTE(review): these loads carry no explicit alignment while the allocas are
; align 16 - confirm the default vector alignment is satisfied.
%2 = load <8 x i32>, <8 x i32>* %0
%3 = load <8 x i32>, <8 x i32>* %1
; Lane-wise equality: one i1 per lane.
%4 = icmp eq <8 x i32> %2, %3
; Reinterpret the eight i1 lanes as a single 8-bit integer and return it.
%5 = bitcast <8 x i1> %4 to i8
ret i8 %5;
}
我使用 'lli' 运行此代码,没有加任何标志。预期输出为 11,但实际得到的是 1 或 0。非常感谢。
答案 0 :(得分:0)
据我所知,如果不调用平台特定的内在函数(intrinsic),就无法做到这一点。我是在发现无法用 C++ 写出与目标平台无关的代码时注意到这一点的。
例如,下面的代码:
// Vector type declared with vector_size(32), i.e. 32 bytes of int lanes
// (8 lanes assuming a 4-byte int, as the name v8i suggests).
typedef int v8i __attribute__((vector_size(32)));
// Compares two int vectors lane by lane and prints the first byte of the
// comparison result's storage.
int main() {
v8i a = { 1, 2, 3, 4, 5, 6, 7, 8};
v8i b = { 0, 2, 3, 4, 5, 6, 7, 0};
// Lane-wise equality; the result is stored as a full v8i vector, not as
// eight packed bits.
v8i cmp = (a == b);
// Reads only the first byte of cmp's storage - this is not one bit per lane.
char res = *(char*)&cmp;
// NOTE(review): printf is used but no <stdio.h> include appears in this snippet.
printf("%d\n", res);
return 0;
}
生成llvm-IR,它与你所写的(与适当的bitcast)非常接近。
不幸的是,它并没有按预期工作。
那是因为 <8 x i1> 在处理器上并不存在。例如,在 x86 AVX2 中,_mm256_cmpeq_epi32 产生的是 __m256i。将其强制转换为 char 只会取到该寄存器的前 8 位。
我编写了intel AVX2特定代码,并找到了相应的指令:intel intrinsic guide
所以这段代码可以满足您的需求:
#include <cstdio>
#include <cstdlib>
#include <immintrin.h>
// Compares two vectors of eight 32-bit integers for equality, packs the
// per-lane results into the low 8 bits of an int, and prints that mask
// followed by each lane pair together with its result bit.
int main() {
    // _mm256_set_epi32 takes its arguments from the highest lane down to lane 0.
    __m256i a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
    __m256i b = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 0);

    // Equal lanes become all-ones, unequal lanes become all-zeros.
    __m256i eq = _mm256_cmpeq_epi32(a, b);

    // movemask_ps gathers the top (sign) bit of every 32-bit lane:
    // bit i of res corresponds to lane i.
    int res = _mm256_movemask_ps(_mm256_castsi256_ps(eq));
    printf("res = %d\n", res);

    // Store the lanes to plain arrays. _mm256_extract_epi32 requires a
    // compile-time constant index, so it cannot portably be driven by a
    // loop variable.
    int lanes_a[8];
    int lanes_b[8];
    _mm256_storeu_si256((__m256i *)lanes_a, a);
    _mm256_storeu_si256((__m256i *)lanes_b, b);

    for (int i = 0; i < 8; ++i) {
        // Test bit i for lane i. The previous `(res << i) & 0x80` examined
        // bit 7 - i, which only matched because the sample mask is symmetric.
        printf("%d %d -> %d\n", lanes_a[i], lanes_b[i], (res >> i) & 1);
    }
    return 0;
}
就 .ll 代码而言,事实证明需要一些额外的转换(包括 bitcast 为 float 向量),以及对下面这个内在函数的调用:
@llvm.x86.avx.movmsk.ps.256
手动重写 LLVM IR 代码后得到:
; Module header: data layout and target triple (x86_64 Linux per the triple).
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
; printf format string "%d\n" followed by a NUL terminator (4 bytes total).
@formatString = private constant [4 x i8] c"%d\0A\00"
; Entry point: compares two constant <8 x i32> vectors lane by lane, converts
; the comparison result into an i32 bit mask through the x86 AVX movmsk
; intrinsic, prints the mask with printf, and returns 0.
define i32 @main() #0 {
; Stack slots for the two input vectors.
%a = alloca <8 x i32>, align 32
%b = alloca <8 x i32>, align 32
; b matches a in lanes 1, 2, 4 and 6 only.
store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %a, align 32
store <8 x i32> <i32 0, i32 2, i32 3, i32 0, i32 5, i32 0, i32 7, i32 0>, <8 x i32>* %b, align 32
%1 = load <8 x i32>, <8 x i32>* %a, align 32
%2 = load <8 x i32>, <8 x i32>* %b, align 32
; Lane-wise equality: one i1 per lane.
%3 = icmp eq <8 x i32> %1, %2
; Sign-extend each i1 to a full i32 lane so a true lane has its top bit set.
%4 = sext <8 x i1> %3 to <8 x i32>
; The intrinsic is declared as taking <8 x float>, so reinterpret the lanes.
%5 = bitcast <8 x i32> %4 to <8 x float>
; Pack the per-lane result into a scalar i32 mask.
%res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %5)
; Print the mask using the "%d\n" format string.
%6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @formatString, i32 0, i32 0), i32 %res)
ret i32 0
}
; External declarations: the AVX movmsk intrinsic (attribute group #1) and the
; variadic C printf (attribute group #2).
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #1
declare i32 @printf(i8*, ...) #2
; Attribute group #0 (used by @main): targets the "haswell" CPU with AVX/AVX2
; enabled in its target-features list.
attributes #0 = { norecurse uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-hle,-pku,-prfchw,-rdseed,-rtm,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
; Attribute group #1 (used by the intrinsic declaration).
attributes #1 = { nounwind readnone }
; Attribute group #2 (used by the printf declaration): same target settings as
; #0, without norecurse/uwtable.
attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-hle,-pku,-prfchw,-rdseed,-rtm,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
生成的汇编代码(由 llc 生成)看起来非常理想:
# Load the first constant vector and write it to the stack slot at 32(%rsp).
vmovaps .LCPI0_0(%rip), %ymm0 # ymm0 = [1,2,3,4,5,6,7,8]
vmovaps %ymm0, 32(%rsp)
# Load the second constant vector and write it to the stack slot at (%rsp).
vmovdqa .LCPI0_1(%rip), %ymm0 # ymm0 = [0,2,3,0,5,0,7,0]
vmovdqa %ymm0, (%rsp)
# Compare the 32-bit lanes of ymm0 (second vector) with the first vector
# reloaded from 32(%rsp); the result overwrites ymm0.
vpcmpeqd 32(%rsp), %ymm0, %ymm0
# Move the per-lane mask bits into %esi (presumably the printf value argument
# under the x86-64 SysV calling convention - confirm).
vmovmskps %ymm0, %esi
答案 1 :(得分:0)
我发现这种方式有效。
; Entry point: compares two constant <8 x i32> vectors lane by lane and returns
; the eight comparison bits as a single i8.
; NOTE(review): attribute group #0 is referenced but not defined in this snippet.
define i8 @main() #0 {
entry:
; Both operands are the same constants, so every lane compares equal.
%0 = icmp eq <8 x i32> <i32 90,i32 91,i32 92,i32 93, i32 94,i32 95,i32 96,i32 97>, <i32 90,i32 91,i32 92,i32 93, i32 94,i32 95,i32 96,i32 97>
; Reinterpret the eight i1 lanes as a one-element vector of i8 ...
%1 = bitcast <8 x i1> %0 to <1 x i8>
; ... then extract that single element as a scalar i8 and return it.
%2 = extractelement <1 x i8> %1, i32 0
ret i8 %2
}
这与我在问题中发布的代码类似,我用 "echo $?" 检查了结果,得到了预期的结果。