Hi. I’m new to SLPVectorizerPass and am trying to get an overview of how the SLP vectorizer works.
I’m wondering: does SLPVectorizerPass somehow run DCE after runImpl?
(I’m also new to the pass-manager APIs, so this might not be specific to SLP.)
On my local build of the llvm release/16.x branch, running
./build/bin/opt -S -mtriple=x86_64-unknown -mcpu=corei7-avx -mattr=-prefer-128-bit -passes=slp-vectorizer < ./test.ll
on the following IR (the first test in SLP’s arith-add.ll):
@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@c64 = common global [8 x i64] zeroinitializer, align 64
define void @add_v8i64() {
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
%a2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
%a3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
%a4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
%a5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
%a6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
%a7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
%b0 = load i64, ptr @b64, align 8
%b1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
%b2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
%b3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
%b4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
%b5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
%b6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
%b7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
%r0 = add i64 %a0, %b0
%r1 = add i64 %a1, %b1
%r2 = add i64 %a2, %b2
%r3 = add i64 %a3, %b3
%r4 = add i64 %a4, %b4
%r5 = add i64 %a5, %b5
%r6 = add i64 %a6, %b6
%r7 = add i64 %a7, %b7
store i64 %r0, ptr @c64, align 8
store i64 %r1, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
store i64 %r2, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
store i64 %r3, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
store i64 %r4, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
store i64 %r5, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
store i64 %r6, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
store i64 %r7, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
ret void
}
produces this output:
; ModuleID = '<stdin>'
source_filename = "<stdin>"
target triple = "x86_64-unknown"
@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@c64 = common global [8 x i64] zeroinitializer, align 64
define void @add_v8i64() #0 {
%1 = load <4 x i64>, ptr @a64, align 8
%2 = load <4 x i64>, ptr @b64, align 8
%3 = add <4 x i64> %1, %2
store <4 x i64> %3, ptr @c64, align 8
%4 = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
%5 = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
%6 = add <4 x i64> %4, %5
store <4 x i64> %6, ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
ret void
}
attributes #0 = { "target-cpu"="corei7-avx" "target-features"="-prefer-128-bit" }
However, a debug print I added with Function::print at the last phase of SLP (seemingly at the end of runImpl?) shows a different result that still contains the original scalar instructions, e.g.:
...
%a3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
%1 = load <4 x i64>, ptr @a64, align 8
%a2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
%a0 = load i64, ptr @a64, align 8
%b3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
%2 = load <4 x i64>, ptr @b64, align 8
%b2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
%b1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
%b0 = load i64, ptr @b64, align 8
%r3 = add i64 %a3, %b3
...
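(For reference, the debug print is just a local hack of roughly the following form, placed where I believe runImpl ends in llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp; the exact placement and the surrounding names are my own local edit, so treat it as a sketch, not upstream code:)

  // Local instrumentation only: dump the whole function right before
  // runImpl returns, so I can see what IR the pass itself leaves behind.
  // dbgs() comes from llvm/Support/Debug.h; F is runImpl's Function argument.
  dbgs() << "SLP: function at the end of runImpl:\n";
  F.print(dbgs());
  return Changed;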
Where are these redundant scalar instructions eliminated? Or is this handled by some general post-processing that happens somewhere in the middle end? Thanks in advance!