Background
The SLP vectorizer currently vectorizes scalar instructions into vector instructions. For instance, the following code:
define void @test1(ptr %in0, ptr %out0) {
entry:
%in1 = getelementptr inbounds i32, ptr %in0, i64 1
%in2 = getelementptr inbounds i32, ptr %in0, i64 2
%in3 = getelementptr inbounds i32, ptr %in0, i64 3
%v0 = load i32, ptr %in0, align 4
%v1 = load i32, ptr %in1, align 4
%v2 = load i32, ptr %in2, align 4
%v3 = load i32, ptr %in3, align 4
%out1 = getelementptr inbounds i32, ptr %out0, i64 1
%out2 = getelementptr inbounds i32, ptr %out0, i64 2
%out3 = getelementptr inbounds i32, ptr %out0, i64 3
store i32 %v0, ptr %out0, align 4
store i32 %v1, ptr %out1, align 4
store i32 %v2, ptr %out2, align 4
store i32 %v3, ptr %out3, align 4
ret void
}
is vectorized into:
define void @test1(ptr %in0, ptr %out0) {
entry:
%0 = load <4 x i32>, ptr %in0, align 4
store <4 x i32> %0, ptr %out0, align 4
ret void
}
However, in some cases, the input may already be vectorized using a small vector type, as in the following example:
define void @test2(ptr %in0, ptr %out0) {
entry:
%in1 = getelementptr inbounds <2 x i32>, ptr %in0, i64 1
%in2 = getelementptr inbounds <2 x i32>, ptr %in0, i64 2
%in3 = getelementptr inbounds <2 x i32>, ptr %in0, i64 3
%v0 = load <2 x i32>, ptr %in0, align 4
%v1 = load <2 x i32>, ptr %in1, align 4
%v2 = load <2 x i32>, ptr %in2, align 4
%v3 = load <2 x i32>, ptr %in3, align 4
%out1 = getelementptr inbounds <2 x i32>, ptr %out0, i64 1
%out2 = getelementptr inbounds <2 x i32>, ptr %out0, i64 2
%out3 = getelementptr inbounds <2 x i32>, ptr %out0, i64 3
store <2 x i32> %v0, ptr %out0, align 4
store <2 x i32> %v1, ptr %out1, align 4
store <2 x i32> %v2, ptr %out2, align 4
store <2 x i32> %v3, ptr %out3, align 4
ret void
}
The SLP vectorizer currently cannot optimize test2 into:
define void @test2(ptr %in0, ptr %out0) {
entry:
%0 = load <8 x i32>, ptr %in0, align 4
store <8 x i32> %0, ptr %out0, align 4
ret void
}
even though test2 is relatively simple. The only difference is that test1 uses i32 while test2 uses <2 x i32>. This proposal aims to extend the SLP vectorizer to revectorize vector instructions.
Improve SIMD programs
SLP currently ignores instructions that already have fixed vector types. This proposal aims to enhance programs written with SIMD intrinsics for a short vector register length but intended to run on hardware with a longer vector register length: for instance, a program using SSE intrinsics executed on AVX2-capable hardware, or a program written in VLS (vector-length-specific) style executed on VLA (vector-length-agnostic) hardware. A sketch of the SSE-to-AVX2 case follows.
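As a minimal sketch (hypothetical code written for illustration; @test_simd is not an existing test), two SSE-width <4 x float> additions over adjacent memory:
; hypothetical VLS-style input using SSE-width vectors
define void @test_simd(ptr %in0, ptr %out0) {
entry:
%in1 = getelementptr inbounds <4 x float>, ptr %in0, i64 1
%v0 = load <4 x float>, ptr %in0, align 16
%v1 = load <4 x float>, ptr %in1, align 16
%add0 = fadd <4 x float> %v0, %v0
%add1 = fadd <4 x float> %v1, %v1
%out1 = getelementptr inbounds <4 x float>, ptr %out0, i64 1
store <4 x float> %add0, ptr %out0, align 16
store <4 x float> %add1, ptr %out1, align 16
ret void
}
could be revectorized on an AVX2 target into a single <8 x float> addition:
define void @test_simd(ptr %in0, ptr %out0) {
entry:
%0 = load <8 x float>, ptr %in0, align 16
%1 = fadd <8 x float> %0, %0
store <8 x float> %1, ptr %out0, align 16
ret void
}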
Improve shufflevector
The current SLP vectorizer does not support shufflevector operations, as shown in the following example:
define void @zext_u8x16_to_u16x16(ptr %in, ptr %out) {
entry:
%0 = load <16 x i8>, ptr %in, align 1
%shuffle.i = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%shuffle.i8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vmovl.i = zext <8 x i8> %shuffle.i to <8 x i16>
%vmovl.i9 = zext <8 x i8> %shuffle.i8 to <8 x i16>
%add.ptr5 = getelementptr inbounds i16, ptr %out, i64 8
store <8 x i16> %vmovl.i, ptr %out, align 2
store <8 x i16> %vmovl.i9, ptr %add.ptr5, align 2
ret void
}
This proposal addresses this issue and enables the vectorization of shufflevector operations. Therefore, zext_u8x16_to_u16x16 will be simplified as follows:
define void @zext_u8x16_to_u16x16(ptr %in, ptr %out) {
entry:
%0 = load <16 x i8>, ptr %in, align 1
%vmovl.i = zext <16 x i8> %0 to <16 x i16>
store <16 x i16> %vmovl.i, ptr %out, align 2
ret void
}
Improve SLP
This proposal can also improve cases that SLP cannot handle today. For example, in llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll, simple_select_no_users is currently optimized into:
define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
%1 = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
%2 = icmp ne <2 x i32> %1, zeroinitializer
%3 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 1>
%4 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 1>
%5 = select <2 x i1> %2, <2 x float> %3, <2 x float> %4
%6 = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
%7 = icmp ne <2 x i32> %6, zeroinitializer
%8 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 2, i32 3>
%9 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 2, i32 3>
%10 = select <2 x i1> %7, <2 x float> %8, <2 x float> %9
%11 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%12 = shufflevector <2 x float> %10, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
%rd1 = shufflevector <4 x float> %12, <4 x float> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x float> %rd1
}
The SLP vectorizer currently only optimizes scalars into vectors. However, %2 and %7 in the above example could undergo further optimization if the SLP vectorizer supported fixed vector types. Thus, simple_select_no_users could be optimized into:
define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
%1 = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
%2 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 1>
%3 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 1>
%4 = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
%5 = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> %4, i64 0)
%6 = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %5, <2 x i32> %1, i64 2)
%7 = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> zeroinitializer, i64 0)
%8 = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %7, <2 x i32> zeroinitializer, i64 2)
%9 = icmp ne <4 x i32> %6, %8
%10 = call <2 x i1> @llvm.vector.extract.v2i1.v4i1(<4 x i1> %9, i64 2)
%11 = select <2 x i1> %10, <2 x float> %2, <2 x float> %3
%12 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 2, i32 3>
%13 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 2, i32 3>
%14 = call <2 x i1> @llvm.vector.extract.v2i1.v4i1(<4 x i1> %9, i64 0)
%15 = select <2 x i1> %14, <2 x float> %12, <2 x float> %13
%16 = shufflevector <2 x float> %11, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%17 = shufflevector <2 x float> %15, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
%rd1 = shufflevector <4 x float> %17, <4 x float> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x float> %rd1
}
Another example is:
define i1 @test3(i32 %or0, i32 %or1, i32 %or2, i32 %or3, i32 %c, i32 %d, i32 %e) {
entry:
%a = or i32 %or0, %or1
%b = or i32 %or2, %or3
%81 = icmp slt i32 %c, 0
%83 = add i32 %a, %d
%84 = sub i32 %a, %d
%85 = icmp slt i32 %83, %a
%86 = icmp sgt i32 %84, %a
%89 = add i32 %b, %e
%90 = icmp slt i32 %89, %b
ret i1 %90
}
Currently, SLP transforms this code into:
define i1 @test3(i32 %or0, i32 %or1, i32 %or2, i32 %or3, i32 %c, i32 %d, i32 %e) {
entry:
%0 = insertelement <2 x i32> poison, i32 %or2, i32 0
%1 = insertelement <2 x i32> %0, i32 %or0, i32 1
%2 = insertelement <2 x i32> poison, i32 %or3, i32 0
%3 = insertelement <2 x i32> %2, i32 %or1, i32 1
%4 = or <2 x i32> %1, %3
%5 = extractelement <2 x i32> %4, i32 1
%6 = sub i32 %5, %d
%7 = insertelement <2 x i32> %4, i32 %c, i32 0
%8 = insertelement <2 x i32> <i32 0, i32 poison>, i32 %6, i32 1
%9 = icmp slt <2 x i32> %7, %8
%10 = insertelement <2 x i32> poison, i32 %e, i32 0
%11 = insertelement <2 x i32> %10, i32 %d, i32 1
%12 = add <2 x i32> %4, %11
%13 = icmp slt <2 x i32> %12, %4
%14 = extractelement <2 x i1> %13, i32 0
ret i1 %14
}
But if SLP supported fixed vector types, test3 could be optimized to:
define i1 @test3(i32 %or0, i32 %or1, i32 %or2, i32 %or3, i32 %c, i32 %d, i32 %e) {
entry:
%0 = insertelement <2 x i32> poison, i32 %or2, i32 0
%1 = insertelement <2 x i32> %0, i32 %or0, i32 1
%2 = insertelement <2 x i32> poison, i32 %or3, i32 0
%3 = insertelement <2 x i32> %2, i32 %or1, i32 1
%4 = or <2 x i32> %1, %3
%5 = extractelement <2 x i32> %4, i32 1
%6 = sub i32 %5, %d
%7 = insertelement <2 x i32> %4, i32 %c, i32 0
%8 = insertelement <2 x i32> <i32 0, i32 poison>, i32 %6, i32 1
%9 = insertelement <2 x i32> poison, i32 %e, i32 0
%10 = insertelement <2 x i32> %9, i32 %d, i32 1
%11 = add <2 x i32> %4, %10
%12 = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> %11, i64 0)
%13 = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %12, <2 x i32> %7, i64 2)
%14 = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> %4, i64 0)
%15 = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %14, <2 x i32> %8, i64 2)
%16 = icmp slt <4 x i32> %13, %15
%17 = call <2 x i1> @llvm.vector.extract.v2i1.v4i1(<4 x i1> %16, i64 0)
%18 = extractelement <2 x i1> %17, i32 0
ret i1 %18
}
Improve LTO
In LTO, SLP may perform its optimizations too early: SLP first runs during per-module compilation, and link-time inlining can later place the resulting small-vector code (like the <2 x i32> pattern in test2 above) next to other vector code. If SLP can revectorize vector instructions, those earlier optimization results can be optimized further at link time.
Compilation time increase
We analyzed the compilation time for SPEC2006 and SPEC2017. For SPEC2006, total compilation time increases by 0.02%, and the time spent in the SLP pass alone increases by 0.3%. For SPEC2017, total compilation time increases by 0.23%, and the time spent in the SLP pass alone increases by 2.25%.