[Vectorization] Mis match in code generated

Hi,

I am trying to understand LLVM vectorization implementation and was looking into both loop and SLP vectorization.

test case 1:

int foo(int *a) {
int sum = 0,i;
for(i=0; i<16; i++)
sum += a[i];
return sum;
}

This code is vectorized by loop vectorizer where we calculate scalar loop cost as 4 and vector loop cost as 2.

Since vector loop cost is less and above reduction is legal to vectorize, loop vectorizer produces vectorized code :

IR without vectorization:

target datalayout = “e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128”
target triple = “x86_64-pc-linux-gnu”

; Function Attrs: nounwind readonly
define i32 @foo(i32* nocapture readonly %a, i32 %n) #0 {
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%i.05 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%sum.04 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i32* %a, i32 %i.05
%0 = load i32* %arrayidx, align 4, !tbaa !1
%add = add nsw i32 %0, %sum.04
%inc = add nsw i32 %i.05, 1
%exitcond = icmp eq i32 %i.05, 15
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret i32 %add
}

IR after vectorization:

target datalayout = “e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128”
target triple = “x86_64-pc-linux-gnu”

; Function Attrs: nounwind readonly
define i32 @foo(i32* nocapture readonly %a, i32 %n) #0 {
entry:
%backedge.overflow = icmp eq i32 15, -1
%overflow.check.anchor = add i32 0, 0
br i1 %backedge.overflow, label %scalar.ph, label %overflow.checked

overflow.checked: ; preds = %entry
br i1 false, label %middle.block, label %vector.ph

vector.ph: ; preds = %overflow.checked
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = extractelement <4 x i32> %induction, i32 0
%1 = getelementptr inbounds i32* %a, i32 %0
%2 = insertelement <4 x i32*> undef, i32* %1, i32 0
%3 = extractelement <4 x i32> %induction, i32 1
%4 = getelementptr inbounds i32* %a, i32 %3
%5 = insertelement <4 x i32*> %2, i32* %4, i32 1
%6 = extractelement <4 x i32> %induction, i32 2
%7 = getelementptr inbounds i32* %a, i32 %6
%8 = insertelement <4 x i32*> %5, i32* %7, i32 2
%9 = extractelement <4 x i32> %induction, i32 3
%10 = getelementptr inbounds i32* %a, i32 %9
%11 = insertelement <4 x i32*> %8, i32* %10, i32 3
%12 = getelementptr i32* %1, i32 0
%13 = bitcast i32* %12 to <4 x i32>*
%wide.load = load <4 x i32>* %13, align 4, !tbaa !1
%14 = add nsw <4 x i32> %wide.load, %vec.phi
%15 = add nsw <4 x i32> %induction, <i32 1, i32 1, i32 1, i32 1>
%16 = icmp eq <4 x i32> %induction, <i32 15, i32 15, i32 15, i32 15>
%index.next = add i32 %index, 4
%17 = icmp eq i32 %index.next, 16
br i1 %17, label %middle.block, label %vector.body, !llvm.loop !5

middle.block: ; preds = %vector.body, %overflow.checked
%resume.val = phi i32 [ 0, %overflow.checked ], [ 16, %vector.body ]
%trunc.resume.val = phi i32 [ 0, %overflow.checked ], [ 16, %vector.body ]
%rdx.vec.exit.phi = phi <4 x i32> [ zeroinitializer, %overflow.checked ], [ %14, %vector.body ]
%rdx.shuf = shufflevector <4 x i32> %rdx.vec.exit.phi, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx = add <4 x i32> %rdx.vec.exit.phi, %rdx.shuf
%rdx.shuf1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx2 = add <4 x i32> %bin.rdx, %rdx.shuf1
%18 = extractelement <4 x i32> %bin.rdx2, i32 0
%cmp.n = icmp eq i32 16, %resume.val
br i1 %cmp.n, label %for.end, label %scalar.ph

scalar.ph: ; preds = %middle.block, %entry
%bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %entry ]
%bc.trunc.resume.val = phi i32 [ %trunc.resume.val, %middle.block ], [ 0, %entry ]
%bc.merge.rdx = phi i32 [ 0, %entry ], [ %18, %middle.block ]
br label %for.body

for.body: ; preds = %for.body, %scalar.ph
%i.05 = phi i32 [ %bc.trunc.resume.val, %scalar.ph ], [ %inc, %for.body ]
%sum.04 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i32* %a, i32 %i.05
%19 = load i32* %arrayidx, align 4, !tbaa !1
%add = add nsw i32 %19, %sum.04
%inc = add nsw i32 %i.05, 1
%exitcond = icmp eq i32 %i.05, 15
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !8

for.end: ; preds = %middle.block, %for.body
%add.lcssa = phi i32 [ %add, %for.body ], [ %18, %middle.block ]
ret i32 %add.lcssa
}

Now for test case 2, where we unroll the loop totally :

int foo(int *a) {
int sum = 0;
sum = a[0]+a[1]+a[2]+a[3]+a[4]+a[5]+a[6]+a[7]+a[8]+a[9]+a[10]+a[11]+a[12]+a[13]+a[14]+a[15];
return sum;
}

This code is not vectorized by SLP vectorization.

IR before vectorization:

test.ll

target datalayout = “e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128”
target triple = “x86_64-pc-linux-gnu”

; Function Attrs: nounwind readonly
define i32 @foo(i32* nocapture readonly %a, i32 %n) #0 {
entry:
%0 = load i32* %a, align 4, !tbaa !1
%arrayidx1 = getelementptr inbounds i32* %a, i32 1
%1 = load i32* %arrayidx1, align 4, !tbaa !1
%add = add nsw i32 %1, %0
%arrayidx2 = getelementptr inbounds i32* %a, i32 2
%2 = load i32* %arrayidx2, align 4, !tbaa !1
%add3 = add nsw i32 %add, %2
%arrayidx4 = getelementptr inbounds i32* %a, i32 3
%3 = load i32* %arrayidx4, align 4, !tbaa !1
%add5 = add nsw i32 %add3, %3
%arrayidx6 = getelementptr inbounds i32* %a, i32 4
%4 = load i32* %arrayidx6, align 4, !tbaa !1
%add7 = add nsw i32 %add5, %4
%arrayidx8 = getelementptr inbounds i32* %a, i32 5
%5 = load i32* %arrayidx8, align 4, !tbaa !1
%add9 = add nsw i32 %add7, %5
%arrayidx10 = getelementptr inbounds i32* %a, i32 6
%6 = load i32* %arrayidx10, align 4, !tbaa !1
%add11 = add nsw i32 %add9, %6
%arrayidx12 = getelementptr inbounds i32* %a, i32 7
%7 = load i32* %arrayidx12, align 4, !tbaa !1
%add13 = add nsw i32 %add11, %7
%arrayidx14 = getelementptr inbounds i32* %a, i32 8
%8 = load i32* %arrayidx14, align 4, !tbaa !1
%add15 = add nsw i32 %add13, %8
%arrayidx16 = getelementptr inbounds i32* %a, i32 9
%9 = load i32* %arrayidx16, align 4, !tbaa !1
%add17 = add nsw i32 %add15, %9
%arrayidx18 = getelementptr inbounds i32* %a, i32 10
%10 = load i32* %arrayidx18, align 4, !tbaa !1
%add19 = add nsw i32 %add17, %10
%arrayidx20 = getelementptr inbounds i32* %a, i32 11
%11 = load i32* %arrayidx20, align 4, !tbaa !1
%add21 = add nsw i32 %add19, %11
%arrayidx22 = getelementptr inbounds i32* %a, i32 12
%12 = load i32* %arrayidx22, align 4, !tbaa !1
%add23 = add nsw i32 %add21, %12
%arrayidx24 = getelementptr inbounds i32* %a, i32 13
%13 = load i32* %arrayidx24, align 4, !tbaa !1
%add25 = add nsw i32 %add23, %13
%arrayidx26 = getelementptr inbounds i32* %a, i32 14
%14 = load i32* %arrayidx26, align 4, !tbaa !1
%add27 = add nsw i32 %add25, %14
%arrayidx28 = getelementptr inbounds i32* %a, i32 15
%15 = load i32* %arrayidx28, align 4, !tbaa !1
%add29 = add nsw i32 %add27, %15
ret i32 %add29
}

$ opt -S -slp-vectorizer -slp-vectorize-hor test.ll -debug -o test2.ll

Features:+64bit,+sse2
CPU:generic

Subtarget features: SSELevel 3, 3DNowLevel 0, 64bit 1
SLP: Analyzing blocks in foo.

test2.ll (IR after SLP vectorization) :

target datalayout = “e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128”
target triple = “x86_64-pc-linux-gnu”

; Function Attrs: nounwind readonly
define i32 @foo(i32* nocapture readonly %a, i32 %n) #0 {
entry:
%0 = load i32* %a, align 4, !tbaa !1
%arrayidx1 = getelementptr inbounds i32* %a, i32 1
%1 = load i32* %arrayidx1, align 4, !tbaa !1
%add = add nsw i32 %1, %0
%arrayidx2 = getelementptr inbounds i32* %a, i32 2
%2 = load i32* %arrayidx2, align 4, !tbaa !1
%add3 = add nsw i32 %add, %2
%arrayidx4 = getelementptr inbounds i32* %a, i32 3
%3 = load i32* %arrayidx4, align 4, !tbaa !1
%add5 = add nsw i32 %add3, %3
%arrayidx6 = getelementptr inbounds i32* %a, i32 4
%4 = load i32* %arrayidx6, align 4, !tbaa !1
%add7 = add nsw i32 %add5, %4
%arrayidx8 = getelementptr inbounds i32* %a, i32 5
%5 = load i32* %arrayidx8, align 4, !tbaa !1
%add9 = add nsw i32 %add7, %5
%arrayidx10 = getelementptr inbounds i32* %a, i32 6
%6 = load i32* %arrayidx10, align 4, !tbaa !1
%add11 = add nsw i32 %add9, %6
%arrayidx12 = getelementptr inbounds i32* %a, i32 7
%7 = load i32* %arrayidx12, align 4, !tbaa !1
%add13 = add nsw i32 %add11, %7
%arrayidx14 = getelementptr inbounds i32* %a, i32 8
%8 = load i32* %arrayidx14, align 4, !tbaa !1
%add15 = add nsw i32 %add13, %8
%arrayidx16 = getelementptr inbounds i32* %a, i32 9
%9 = load i32* %arrayidx16, align 4, !tbaa !1
%add17 = add nsw i32 %add15, %9
%arrayidx18 = getelementptr inbounds i32* %a, i32 10
%10 = load i32* %arrayidx18, align 4, !tbaa !1
%add19 = add nsw i32 %add17, %10
%arrayidx20 = getelementptr inbounds i32* %a, i32 11
%11 = load i32* %arrayidx20, align 4, !tbaa !1
%add21 = add nsw i32 %add19, %11
%arrayidx22 = getelementptr inbounds i32* %a, i32 12
%12 = load i32* %arrayidx22, align 4, !tbaa !1
%add23 = add nsw i32 %add21, %12
%arrayidx24 = getelementptr inbounds i32* %a, i32 13
%13 = load i32* %arrayidx24, align 4, !tbaa !1
%add25 = add nsw i32 %add23, %13
%arrayidx26 = getelementptr inbounds i32* %a, i32 14
%14 = load i32* %arrayidx26, align 4, !tbaa !1
%add27 = add nsw i32 %add25, %14
%arrayidx28 = getelementptr inbounds i32* %a, i32 15
%15 = load i32* %arrayidx28, align 4, !tbaa !1
%add29 = add nsw i32 %add27, %15
ret i32 %add29
}

Shouldn’t the SLP vectorizer vectorize above code? Am I missing out something?

How does loop unrolling affect the final vectorization of code?

I tried to debug above IR. Since there is no store in above IR, SLP vectorization doesn’t call
vectorizeStoreChains(). In vectorizeChainsInBlock(), as there is no phi node in above IR, it
also fails to vectorize the IR.

If it is perfectly legal to vectorize above IR, are we lacking code for it right now? Or its not

possible to vectorize above code (as far as i know, it is perfectly possible to vectorize above code)?

If its possible, can someone help me/point out specifics how to start for implementing vectorization for such code?

(We do have function calls isConsecutiveAccess() to identify it. Is it a case for horizontal reduction, though the term
‘reduction’ is associated with loops/phi nodes.)

This brings to me another question - if we perform aggressive loop unrolling, we might loose opportunity for

vectorization as visible above !!

(I was looking into bug 20035, when i encountered above issue. Though both are not related.)

Your comments/suggestions are most awaited.

Hi,

I am trying to understand LLVM vectorization implementation and was looking into both loop and SLP vectorization.

test case 1:

int foo(int *a) {
int sum = 0,i;
for(i=0; i<16; i++)
sum += a[i];
return sum;
}

This code is vectorized by loop vectorizer where we calculate scalar loop cost as 4 and vector loop cost as 2.

Since vector loop cost is less and above reduction is legal to vectorize, loop vectorizer produces vectorized code :

IR without vectorization:

target datalayout = “e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128”
target triple = “x86_64-pc-linux-gnu”

; Function Attrs: nounwind readonly
define i32 @foo(i32* nocapture readonly %a, i32 %n) #0 {
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%i.05 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%sum.04 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i32* %a, i32 %i.05
%0 = load i32* %arrayidx, align 4, !tbaa !1
%add = add nsw i32 %0, %sum.04
%inc = add nsw i32 %i.05, 1
%exitcond = icmp eq i32 %i.05, 15
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret i32 %add
}

IR after vectorization:

target datalayout = “e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128”
target triple = “x86_64-pc-linux-gnu”

; Function Attrs: nounwind readonly
define i32 @foo(i32* nocapture readonly %a, i32 %n) #0 {
entry:
%backedge.overflow = icmp eq i32 15, -1
%overflow.check.anchor = add i32 0, 0
br i1 %backedge.overflow, label %scalar.ph, label %overflow.checked

overflow.checked: ; preds = %entry
br i1 false, label %middle.block, label %vector.ph

vector.ph: ; preds = %overflow.checked
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = extractelement <4 x i32> %induction, i32 0
%1 = getelementptr inbounds i32* %a, i32 %0
%2 = insertelement <4 x i32*> undef, i32* %1, i32 0
%3 = extractelement <4 x i32> %induction, i32 1
%4 = getelementptr inbounds i32* %a, i32 %3
%5 = insertelement <4 x i32*> %2, i32* %4, i32 1
%6 = extractelement <4 x i32> %induction, i32 2
%7 = getelementptr inbounds i32* %a, i32 %6
%8 = insertelement <4 x i32*> %5, i32* %7, i32 2
%9 = extractelement <4 x i32> %induction, i32 3
%10 = getelementptr inbounds i32* %a, i32 %9
%11 = insertelement <4 x i32*> %8, i32* %10, i32 3
%12 = getelementptr i32* %1, i32 0
%13 = bitcast i32* %12 to <4 x i32>*
%wide.load = load <4 x i32>* %13, align 4, !tbaa !1
%14 = add nsw <4 x i32> %wide.load, %vec.phi
%15 = add nsw <4 x i32> %induction, <i32 1, i32 1, i32 1, i32 1>
%16 = icmp eq <4 x i32> %induction, <i32 15, i32 15, i32 15, i32 15>
%index.next = add i32 %index, 4
%17 = icmp eq i32 %index.next, 16
br i1 %17, label %middle.block, label %vector.body, !llvm.loop !5

middle.block: ; preds = %vector.body, %overflow.checked
%resume.val = phi i32 [ 0, %overflow.checked ], [ 16, %vector.body ]
%trunc.resume.val = phi i32 [ 0, %overflow.checked ], [ 16, %vector.body ]
%rdx.vec.exit.phi = phi <4 x i32> [ zeroinitializer, %overflow.checked ], [ %14, %vector.body ]
%rdx.shuf = shufflevector <4 x i32> %rdx.vec.exit.phi, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx = add <4 x i32> %rdx.vec.exit.phi, %rdx.shuf
%rdx.shuf1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx2 = add <4 x i32> %bin.rdx, %rdx.shuf1
%18 = extractelement <4 x i32> %bin.rdx2, i32 0
%cmp.n = icmp eq i32 16, %resume.val
br i1 %cmp.n, label %for.end, label %scalar.ph

scalar.ph: ; preds = %middle.block, %entry
%bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %entry ]
%bc.trunc.resume.val = phi i32 [ %trunc.resume.val, %middle.block ], [ 0, %entry ]
%bc.merge.rdx = phi i32 [ 0, %entry ], [ %18, %middle.block ]
br label %for.body

for.body: ; preds = %for.body, %scalar.ph
%i.05 = phi i32 [ %bc.trunc.resume.val, %scalar.ph ], [ %inc, %for.body ]
%sum.04 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i32* %a, i32 %i.05
%19 = load i32* %arrayidx, align 4, !tbaa !1
%add = add nsw i32 %19, %sum.04
%inc = add nsw i32 %i.05, 1
%exitcond = icmp eq i32 %i.05, 15
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !8

for.end: ; preds = %middle.block, %for.body
%add.lcssa = phi i32 [ %add, %for.body ], [ %18, %middle.block ]
ret i32 %add.lcssa
}

Now for test case 2, where we unroll the loop totally :

int foo(int *a) {
int sum = 0;
sum = a[0]+a[1]+a[2]+a[3]+a[4]+a[5]+a[6]+a[7]+a[8]+a[9]+a[10]+a[11]+a[12]+a[13]+a[14]+a[15];
return sum;
}

This code is not vectorized by SLP vectorization.

IR before vectorization:

test.ll

target datalayout = “e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128”
target triple = “x86_64-pc-linux-gnu”

; Function Attrs: nounwind readonly
define i32 @foo(i32* nocapture readonly %a, i32 %n) #0 {
entry:
%0 = load i32* %a, align 4, !tbaa !1
%arrayidx1 = getelementptr inbounds i32* %a, i32 1
%1 = load i32* %arrayidx1, align 4, !tbaa !1
%add = add nsw i32 %1, %0
%arrayidx2 = getelementptr inbounds i32* %a, i32 2
%2 = load i32* %arrayidx2, align 4, !tbaa !1
%add3 = add nsw i32 %add, %2
%arrayidx4 = getelementptr inbounds i32* %a, i32 3
%3 = load i32* %arrayidx4, align 4, !tbaa !1
%add5 = add nsw i32 %add3, %3
%arrayidx6 = getelementptr inbounds i32* %a, i32 4
%4 = load i32* %arrayidx6, align 4, !tbaa !1
%add7 = add nsw i32 %add5, %4
%arrayidx8 = getelementptr inbounds i32* %a, i32 5
%5 = load i32* %arrayidx8, align 4, !tbaa !1
%add9 = add nsw i32 %add7, %5
%arrayidx10 = getelementptr inbounds i32* %a, i32 6
%6 = load i32* %arrayidx10, align 4, !tbaa !1
%add11 = add nsw i32 %add9, %6
%arrayidx12 = getelementptr inbounds i32* %a, i32 7
%7 = load i32* %arrayidx12, align 4, !tbaa !1
%add13 = add nsw i32 %add11, %7
%arrayidx14 = getelementptr inbounds i32* %a, i32 8
%8 = load i32* %arrayidx14, align 4, !tbaa !1
%add15 = add nsw i32 %add13, %8
%arrayidx16 = getelementptr inbounds i32* %a, i32 9
%9 = load i32* %arrayidx16, align 4, !tbaa !1
%add17 = add nsw i32 %add15, %9
%arrayidx18 = getelementptr inbounds i32* %a, i32 10
%10 = load i32* %arrayidx18, align 4, !tbaa !1
%add19 = add nsw i32 %add17, %10
%arrayidx20 = getelementptr inbounds i32* %a, i32 11
%11 = load i32* %arrayidx20, align 4, !tbaa !1
%add21 = add nsw i32 %add19, %11
%arrayidx22 = getelementptr inbounds i32* %a, i32 12
%12 = load i32* %arrayidx22, align 4, !tbaa !1
%add23 = add nsw i32 %add21, %12
%arrayidx24 = getelementptr inbounds i32* %a, i32 13
%13 = load i32* %arrayidx24, align 4, !tbaa !1
%add25 = add nsw i32 %add23, %13
%arrayidx26 = getelementptr inbounds i32* %a, i32 14
%14 = load i32* %arrayidx26, align 4, !tbaa !1
%add27 = add nsw i32 %add25, %14
%arrayidx28 = getelementptr inbounds i32* %a, i32 15
%15 = load i32* %arrayidx28, align 4, !tbaa !1
%add29 = add nsw i32 %add27, %15
ret i32 %add29
}

$ opt -S -slp-vectorizer -slp-vectorize-hor test.ll -debug -o test2.ll

Features:+64bit,+sse2
CPU:generic

Subtarget features: SSELevel 3, 3DNowLevel 0, 64bit 1
SLP: Analyzing blocks in foo.

test2.ll (IR after SLP vectorization) :

target datalayout = “e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128”
target triple = “x86_64-pc-linux-gnu”

; Function Attrs: nounwind readonly
define i32 @foo(i32* nocapture readonly %a, i32 %n) #0 {
entry:
%0 = load i32* %a, align 4, !tbaa !1
%arrayidx1 = getelementptr inbounds i32* %a, i32 1
%1 = load i32* %arrayidx1, align 4, !tbaa !1
%add = add nsw i32 %1, %0
%arrayidx2 = getelementptr inbounds i32* %a, i32 2
%2 = load i32* %arrayidx2, align 4, !tbaa !1
%add3 = add nsw i32 %add, %2
%arrayidx4 = getelementptr inbounds i32* %a, i32 3
%3 = load i32* %arrayidx4, align 4, !tbaa !1
%add5 = add nsw i32 %add3, %3
%arrayidx6 = getelementptr inbounds i32* %a, i32 4
%4 = load i32* %arrayidx6, align 4, !tbaa !1
%add7 = add nsw i32 %add5, %4
%arrayidx8 = getelementptr inbounds i32* %a, i32 5
%5 = load i32* %arrayidx8, align 4, !tbaa !1
%add9 = add nsw i32 %add7, %5
%arrayidx10 = getelementptr inbounds i32* %a, i32 6
%6 = load i32* %arrayidx10, align 4, !tbaa !1
%add11 = add nsw i32 %add9, %6
%arrayidx12 = getelementptr inbounds i32* %a, i32 7
%7 = load i32* %arrayidx12, align 4, !tbaa !1
%add13 = add nsw i32 %add11, %7
%arrayidx14 = getelementptr inbounds i32* %a, i32 8
%8 = load i32* %arrayidx14, align 4, !tbaa !1
%add15 = add nsw i32 %add13, %8
%arrayidx16 = getelementptr inbounds i32* %a, i32 9
%9 = load i32* %arrayidx16, align 4, !tbaa !1
%add17 = add nsw i32 %add15, %9
%arrayidx18 = getelementptr inbounds i32* %a, i32 10
%10 = load i32* %arrayidx18, align 4, !tbaa !1
%add19 = add nsw i32 %add17, %10
%arrayidx20 = getelementptr inbounds i32* %a, i32 11
%11 = load i32* %arrayidx20, align 4, !tbaa !1
%add21 = add nsw i32 %add19, %11
%arrayidx22 = getelementptr inbounds i32* %a, i32 12
%12 = load i32* %arrayidx22, align 4, !tbaa !1
%add23 = add nsw i32 %add21, %12
%arrayidx24 = getelementptr inbounds i32* %a, i32 13
%13 = load i32* %arrayidx24, align 4, !tbaa !1
%add25 = add nsw i32 %add23, %13
%arrayidx26 = getelementptr inbounds i32* %a, i32 14
%14 = load i32* %arrayidx26, align 4, !tbaa !1
%add27 = add nsw i32 %add25, %14
%arrayidx28 = getelementptr inbounds i32* %a, i32 15
%15 = load i32* %arrayidx28, align 4, !tbaa !1
%add29 = add nsw i32 %add27, %15
ret i32 %add29
}

Shouldn’t the SLP vectorizer vectorize above code? Am I missing out something?

How does loop unrolling affect the final vectorization of code?

Hi Suyog,

Thank you for taking the time to investigate the performance of the vectorizers. You are correct that the SLP-vectorizer should be able to do something about the second case (the unrolled one). Please look into line 2861 in the SLP-vectorizer. This is where we try to analyze reductions and vectorize them. At the moment we don’t have support for a flat yet large reduction that is the result of unrolled loops because the current algorithm tries to construct trees that start at the chains, and it does not support joining multiple trees.

http://llvm.org/docs/doxygen/html/SLPVectorizer_8cpp_source.html#l02861

Thanks,
Nadav

Hi Nadav,

Thanks for the quick reply !!

Ok, so as of now we are lacking capability to handle flat large reductions.

I did go through function vectorizeChainsInBlock() (line number 2862). In this function,

we try to vectorize if we have phi nodes in the IR (several if’s check for phi nodes) i.e we try to

construct tree that starts at chains.

Any pointers on how to join multiple trees? I will also try to dig more into it.

Comments/Suggestions are most welcomed !!

There is experimental code to handle horizontal reductions hidden behind a flag “slp-vectorize-hor” you could try to enable that. It won’t currently work for your example though because we only start to look for reductions at phis and stores “slp-vectorize-hor-stores”.

You could start off by rewriting your example with a store and see whether the tree match works after using both flags.

Hi Arnold,

Thanks for your reply.

I tried test case as suggested by you.

void foo(int *a, int *sum) {
*sum = a[0]+a[1]+a[2]+a[3]+a[4]+a[5]+a[6]+a[7]+a[8]+a[9]+a[10]+a[11]+a[12]+a[13]+a[14]+a[15];
}

so that it has a ‘store’ in its IR.

IR before vectorization :

target datalayout = “e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128”
target triple = “x86_64-pc-linux-gnu”

; Function Attrs: nounwind
define void @foo(i32* nocapture readonly %a, i32* nocapture %sum) #0 {
entry:
%0 = load i32* %a, align 4, !tbaa !1
%arrayidx1 = getelementptr inbounds i32* %a, i32 1
%1 = load i32* %arrayidx1, align 4, !tbaa !1
%add = add nsw i32 %1, %0
%arrayidx2 = getelementptr inbounds i32* %a, i32 2
%2 = load i32* %arrayidx2, align 4, !tbaa !1
%add3 = add nsw i32 %add, %2
%arrayidx4 = getelementptr inbounds i32* %a, i32 3
%3 = load i32* %arrayidx4, align 4, !tbaa !1
%add5 = add nsw i32 %add3, %3
%arrayidx6 = getelementptr inbounds i32* %a, i32 4
%4 = load i32* %arrayidx6, align 4, !tbaa !1
%add7 = add nsw i32 %add5, %4
%arrayidx8 = getelementptr inbounds i32* %a, i32 5
%5 = load i32* %arrayidx8, align 4, !tbaa !1
%add9 = add nsw i32 %add7, %5
%arrayidx10 = getelementptr inbounds i32* %a, i32 6
%6 = load i32* %arrayidx10, align 4, !tbaa !1
%add11 = add nsw i32 %add9, %6
%arrayidx12 = getelementptr inbounds i32* %a, i32 7
%7 = load i32* %arrayidx12, align 4, !tbaa !1
%add13 = add nsw i32 %add11, %7
%arrayidx14 = getelementptr inbounds i32* %a, i32 8
%8 = load i32* %arrayidx14, align 4, !tbaa !1
%add15 = add nsw i32 %add13, %8
%arrayidx16 = getelementptr inbounds i32* %a, i32 9
%9 = load i32* %arrayidx16, align 4, !tbaa !1
%add17 = add nsw i32 %add15, %9
%arrayidx18 = getelementptr inbounds i32* %a, i32 10
%10 = load i32* %arrayidx18, align 4, !tbaa !1
%add19 = add nsw i32 %add17, %10
%arrayidx20 = getelementptr inbounds i32* %a, i32 11
%11 = load i32* %arrayidx20, align 4, !tbaa !1
%add21 = add nsw i32 %add19, %11
%arrayidx22 = getelementptr inbounds i32* %a, i32 12
%12 = load i32* %arrayidx22, align 4, !tbaa !1
%add23 = add nsw i32 %add21, %12
%arrayidx24 = getelementptr inbounds i32* %a, i32 13
%13 = load i32* %arrayidx24, align 4, !tbaa !1
%add25 = add nsw i32 %add23, %13
%arrayidx26 = getelementptr inbounds i32* %a, i32 14
%14 = load i32* %arrayidx26, align 4, !tbaa !1
%add27 = add nsw i32 %add25, %14
%arrayidx28 = getelementptr inbounds i32* %a, i32 15
%15 = load i32* %arrayidx28, align 4, !tbaa !1
%add29 = add nsw i32 %add27, %15
store i32 %add29, i32* %sum, align 4, !tbaa !1
ret void
}

IR after SLP vectorization with appropriate flags :

$ opt -S -slp-vectorizer -slp-vectorize-hor=1 -slp-vectorize-hor-store=1 test.ll -debug

(I hope i am passing the args correctly to opt)

Subtarget features: SSELevel 3, 3DNowLevel 0, 64bit 1
SLP: Analyzing blocks in foo.
SLP: Found 1 stores to vectorize.
SLP: Vectorizing a list of length = 2.
; ModuleID = ‘test.ll’
target datalayout = “e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128”
target triple = “x86_64-pc-linux-gnu”

; Function Attrs: nounwind
define void @foo(i32* nocapture readonly %a, i32* nocapture %sum) #0 {
entry:
%0 = load i32* %a, align 4, !tbaa !1
%arrayidx1 = getelementptr inbounds i32* %a, i32 1
%1 = load i32* %arrayidx1, align 4, !tbaa !1
%add = add nsw i32 %1, %0
%arrayidx2 = getelementptr inbounds i32* %a, i32 2
%2 = load i32* %arrayidx2, align 4, !tbaa !1
%add3 = add nsw i32 %add, %2
%arrayidx4 = getelementptr inbounds i32* %a, i32 3
%3 = load i32* %arrayidx4, align 4, !tbaa !1
%add5 = add nsw i32 %add3, %3
%arrayidx6 = getelementptr inbounds i32* %a, i32 4
%4 = load i32* %arrayidx6, align 4, !tbaa !1
%add7 = add nsw i32 %add5, %4
%arrayidx8 = getelementptr inbounds i32* %a, i32 5
%5 = load i32* %arrayidx8, align 4, !tbaa !1
%add9 = add nsw i32 %add7, %5
%arrayidx10 = getelementptr inbounds i32* %a, i32 6
%6 = load i32* %arrayidx10, align 4, !tbaa !1
%add11 = add nsw i32 %add9, %6
%arrayidx12 = getelementptr inbounds i32* %a, i32 7
%7 = load i32* %arrayidx12, align 4, !tbaa !1
%add13 = add nsw i32 %add11, %7
%arrayidx14 = getelementptr inbounds i32* %a, i32 8
%8 = load i32* %arrayidx14, align 4, !tbaa !1
%add15 = add nsw i32 %add13, %8
%arrayidx16 = getelementptr inbounds i32* %a, i32 9
%9 = load i32* %arrayidx16, align 4, !tbaa !1
%add17 = add nsw i32 %add15, %9
%arrayidx18 = getelementptr inbounds i32* %a, i32 10
%10 = load i32* %arrayidx18, align 4, !tbaa !1
%add19 = add nsw i32 %add17, %10
%arrayidx20 = getelementptr inbounds i32* %a, i32 11
%11 = load i32* %arrayidx20, align 4, !tbaa !1
%add21 = add nsw i32 %add19, %11
%arrayidx22 = getelementptr inbounds i32* %a, i32 12
%12 = load i32* %arrayidx22, align 4, !tbaa !1
%add23 = add nsw i32 %add21, %12
%arrayidx24 = getelementptr inbounds i32* %a, i32 13
%13 = load i32* %arrayidx24, align 4, !tbaa !1
%add25 = add nsw i32 %add23, %13
%arrayidx26 = getelementptr inbounds i32* %a, i32 14
%14 = load i32* %arrayidx26, align 4, !tbaa !1
%add27 = add nsw i32 %add25, %14
%arrayidx28 = getelementptr inbounds i32* %a, i32 15
%15 = load i32* %arrayidx28, align 4, !tbaa !1
%add29 = add nsw i32 %add27, %15
store i32 %add29, i32* %sum, align 4, !tbaa !1
ret void
}

As observed above, SLP vectorization doesn’t vectorize this code.

I tried debugging, just mentioning what i found -
the code hits vectorizeChainsInBlock() → if(ShouldStartVectorizeHorAtStore) →
tryToVectorize() → tryToVectorizePair() → tryToVectorizeList() - inside this function, it iterates in the

for loop for checking if scalar instructions are of same type. It exits for 2nd iteration in the loop as

OpCode does not match , one opcode is ‘add’ and second is ‘load’. So it exits from function
‘tryToVectorizeList()’, and comes back to function ‘tryToVectorize()’, but fails to satisfy any of the

further ‘if’ in that function and returns back without vectorizing any of the code.

I will try to dig more into it and try to understand where the code can be implemented.

Comments/suggestions/explanations are always welcomed !!

Hi Arnold, Nadav, Hal,
all,

Restarting this thread with few more observations on horizontal reduction store
(after using flags - slp-vectorize-hor-store and -slp-vectorize-hor, thanks Arnold for pointing out earlier).

The SLP vectorizer does vectorize straight line code :

(this code may result as part of loop unrolling as stated earlier in the thread.
Though for loop vectorization , trip count should be atleast 16, taking smaller
code here to put some findings.)

float a[4], sum;

void foo() {

int i;

for(i=0;i<4;i++);

sum += a[i];
}

after loop unrolling -

float a[4], sum;
void foo() {
sum = a[0]+a[1]+a[2]+a[3];
}

This code gets vectorized depending on how we organize the code, and subsequently

how the expression tree is formed.

case 1:

float a[4], sum;
void foo() {
sum = a[0]+a[1]+a[2]+a[3];
}

The expression tree for this will be

a[0] a[1]

\ /
\ /

  • a[2]

\ /
\ /

  • a[3]

\ /
\ /

Hi Suyog,

the horizontal reduction code needs work. It was written long ago and was never enabled so it has bit rotted somewhat and has lurking bugs.
For example, it could not handle reduction value nodes that were not binary operations.

It could vectorize

(+ (* x y) (*z a) (*b c) (*d e))

but not

(+ (load a) (load b) (load c) (load d))

With the attached patches:
0001-Test-hor-redux.patch
0002-Print-out-tree.patch

We would attempt to vectorize the tree stemming from your example:

float a[4], sum;
void foo() {
sum = a[0]+a[1]+a[2]+a[3];
}

but because we happen to end up with the vector tuple (a[1], a[0],a[2],a[3)) in need of gathering the existing code fails. Note, that I used fast-math because the horizontal reduction code assumes it can reassociate freely.

clang -O3 test_hor_redux.c -emit-llvm -S -o - -mllvm -slp-vectorize-hor-store -debug-only=SLP -ffast-math -mllvm -debug-only=SLP

...

Reduced val: %1 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 1), align 4, !tbaa !1
Reduced val: %0 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 0), align 16, !tbaa !1
Reduced val: %2 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 2), align 8, !tbaa !1
Reduced val: %3 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 3), align 4, !tbaa !1
Reduction ops: %add = fadd fast float %1, %0
Reduction ops: %add1 = fadd fast float %add, %2
Reduction ops: %add2 = fadd fast float %add1, %3
Assertion failed: (VectorizedValue && "Need to have a vectorized tree node"), function emitReduction, file ../lib/Transforms/Vectorize/SLPVectorizer.cpp, line 3518.

The reduction looks like

(((a[1] + a[0]) + a[2]) + a[3])

So we build a tree entry (load a[1], load a[0], load a[2], load a[3]) and end up with a vectorized tree entry of one who needs gathering.

The code does not handle this case correctly (in normal slp mode there is no point in "vectorizing" a single entry tree that needs gathering).
The patch 0003-We-don-t-correctly-vectorize-single-entry-gather-nod.patch fixes this by disabling vectorization in such a case but then we don't vectorize.
As you can see this needs more work.

* We could reassociate reductions like the one above.

(((a[1] + a[0]) + a[2]) + a[3])

=>

(((a[0] + a[1]) + a[2]) + a[3])

Then the reduction tree builder code would not have to be clever. That will only work if your initial reduction values are loads. If the loads are hidden behind a deeper tree that won't help. In such a case the next solution would not be so bad though.

* We could teach the tree code to do a vector load + permute operation if applicable. This will no generate the best code though. But for deeper trees the overhead of the permute probably does not matter much.

Best,
Arnold

0001-Test-hor-redux.patch (2.97 KB)

0002-Print-out-tree.patch (1.05 KB)

0003-We-don-t-correctly-vectorize-single-entry-gather-nod.patch (1.05 KB)

Hi Suyog,

Thanks for looking at this. This has recently got itself onto my TODO list too.

I am not sure how much all this will improve the code quality for horizontal reduction
(donno how frequently such pattern of horizontal reduction from same array occurs in real world/SPECS).

Actually the main loop of 470.lbm can be SLP vectorized like this. We have three parts to it: A fully unrolled reduction, a scatter/gather part (not really vectorizable) and a contiguous-load-scatter-store part which should be vectorized even when scatter stores are not available in HW.

Cheers,

James

Hi Arnold, James

Thanks for getting back on this. Arnold i will look into patches provided by you
and get back on it. Thanks for it.

Adding few more observations :

The expression tree is like

sum = (a[0] + a[2]) + (a[1] + a[3]); → which is vectorizable with current code.

If the array a if of type float/double then there is no disturbance/canonicalization
(precision issues??) in formed tree and we get a vectorized code.

However, if the array a is of type int, then re-associate pass (runs in O1)
re-organizes this into

sum = (((a[0] + a[2]) + a[1]) + a[3]).

This results in loss of vectorization opportunity as we do not vectorize this type
of tree in current code stated earlier in the thread.

Also, we do not build vectorization tree on return in current code (PR20035),
which does not vectorizes code like this :

return a[0] + a[1] + a[2] + a[3];

I will try to work on vectorization on return statement. Inputs are always welcomed.

Regards,
Suyog

Hi Arnold, James

Thanks for getting back on this. Arnold i will look into patches provided by you
and get back on it. Thanks for it.

Note, that they were just quickly put together to support building a a horizontal tree for your case. They definitely need testing and ironing out bugs.

Adding few more observations :

The expression tree is like

sum = (a[0] + a[2]) + (a[1] + a[3]); → which is vectorizable with current code.

If the array a if of type float/double then there is no disturbance/canonicalization
(precision issues??) in formed tree and we get a vectorized code.

You can’t reassociate floating point without enabling fast-math.

We get vectorized code but it is not ideal for float. We should be doing <4 x float> operations.

However, if the array a is of type int, then re-associate pass (runs in O1)
re-organizes this into

sum = (((a[0] + a[2]) + a[1]) + a[3]).

Right, the current code trys to build a vectorization tree at an add with the add’s operands as the root, which will not work well for a reduction like (+ ( + (+ a[0] a[1]) a[2]) a[3]),

Enabling recognizing horizontal reductions should recover this and improve performance if your vector len > 2 once we have solved the sequential load issue.

This results in loss of vectorization opportunity as we do not vectorize this type
of tree in current code stated earlier in the thread.

Also, we do not build vectorization tree on return in current code (PR20035),
which does not vectorizes code like this :

return a[0] + a[1] + a[2] + a[3];

I will try to work on vectorization on return statement. Inputs are always welcomed.

I thought we had added ReturnInst and CallInst operands as starting points of vectorization trees, … it seems not.

Thanks,
Arnold