loop vectorizer says Bad stride

Verifying function
running passes ...
LV: Checking a loop in "bar"
LV: Found a loop: L0
LV: Found an induction variable.
LV: We need to do 0 pointer comparisons.
LV: Checking memory dependencies
LV: Bad stride - Not an AddRecExpr pointer %13 = getelementptr float* %arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2)
LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2)(Induction step: 1)
LV: Distance for store float %11, float* %12 to store float %10, float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + {(-4 * (sext i32 %arg0 to i64)),+,-4}<%L0>)
Non-consecutive pointer access
LV: We don't need a runtime memory check.
LV: Can't vectorize due to memory conflicts
LV: Not vectorizing.

Here is the input IR:

; @bar: one loop that, per iteration, reads two floats from each of %arg3 and
; %arg4 (at index i and at index i + 256), adds them pairwise, and writes the
; two sums to %arg2 at index i and index i + 256.
;
;   %arg0  initial value of the i32 induction variable
;   %arg1  exit bound: the loop repeats while (i + 1) < %arg1 (signed compare)
;   %arg2  destination array (only stored to); marked noalias
;   %arg3  first source array (only loaded from); marked noalias
;   %arg4  second source array (only loaded from); marked noalias
;
; The loop is bottom-tested: the body executes once before %arg0 is ever
; compared against %arg1.
;
; NOTE(review): the vectorizer log quoted earlier in this message names %13 as
; a getelementptr on %arg2 with an i32 index, which does not match the
; numbering in this listing (%13 is an fadd here). The log may have been
; produced from a slightly different version of the IR -- confirm.
define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float* noalias %arg3, float* noalias %arg4) {
entrypoint:
   br label %L0

L0: ; preds = %L0, %entrypoint
   ; %0 = i: starts at %arg0, takes %16 (= i + 1) on the back edge.
   %0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ]
   ; %1 = i + 256, computed in 32 bits before any widening.
   %1 = add nsw i32 %0, 256
   ; %2 = i widened to i64; used as the index for the "low" accesses.
   %2 = sext i32 %0 to i64
   ; %4 = arg3[i]
   %3 = getelementptr float* %arg3, i64 %2
   %4 = load float* %3, align 4
   ; %5 = (i + 256) widened to i64; the sign-extend is applied to the result of
   ; the 32-bit add, not to i alone. Used as the index for the "high" accesses.
   %5 = sext i32 %1 to i64
   ; %7 = arg3[i + 256]
   %6 = getelementptr float* %arg3, i64 %5
   %7 = load float* %6, align 4
   ; %9 = arg4[i]
   %8 = getelementptr float* %arg4, i64 %2
   %9 = load float* %8, align 4
   ; %11 = arg4[i + 256]
   %10 = getelementptr float* %arg4, i64 %5
   %11 = load float* %10, align 4
   ; %12 = arg4[i + 256] + arg3[i + 256]
   %12 = fadd float %11, %7
   ; %13 = arg4[i] + arg3[i]
   %13 = fadd float %9, %4
   ; arg2[i] = %13
   %14 = getelementptr float* %arg2, i64 %2
   store float %13, float* %14, align 4
   ; arg2[i + 256] = %12
   %15 = getelementptr float* %arg2, i64 %5
   store float %12, float* %15, align 4
   ; Advance i and loop again while the incremented value is below %arg1.
   %16 = add nsw i32 %0, 1
   %17 = icmp slt i32 %16, %arg1
   br i1 %17, label %L0, label %L1

L1: ; preds = %L0
   ret void
}

This function is IMO equivalent to

/*
 * For every i in [start, end), stores a[i] + b[i] into c[i] and
 * a[256 + i] + b[256 + i] into c[256 + i].
 *
 * All three pointers are restrict-qualified. The body indexes each array at
 * both i and 256 + i, so each array is accessed at indices in [start, end)
 * and in [start + 256, end + 256).
 *
 * NOTE(review): the clang output quoted after this listing defines
 * @bar(c, a, b, start, end), so the file that was actually compiled
 * presumably used that name and parameter order -- confirm.
 */
void main(int start, int end, float * restrict c, float * restrict a, float * restrict b)
{
   /* Fixed offset between the two regions processed in each iteration. */
   const int width = 256;
   for (int i = start ; i < end ; ++i ) {
     c[ i ] = a[ i ] + b[ i ];
     c[ width + i ] = a[ width + i ] + b[ width + i ];
   }
}

With this version, the vectorizer doesn't complain about a bad stride and can parallelize the loop.

Here is the output from "clang -emit-llvm -S loop.c" which can be parallelized:

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; @bar as emitted by clang for the C loop (the command shown passes no
; optimization flag). Every parameter and local lives in its own alloca stack
; slot and is reloaded from memory at each use; nothing is kept in an SSA
; value across statements.
;
;   %c      destination array; marked noalias
;   %a, %b  source arrays; marked noalias
;   %start  initial value of i
;   %end    exclusive upper bound: the loop runs while i < %end (signed)
;
; Unlike a bottom-tested loop, the bound is checked in for.cond before the
; first execution of the body.
; Function Attrs: nounwind uwtable
define void @bar(float* noalias %c, float* noalias %a, float* noalias %b, i32 %start, i32 %end) #0 {
entry:
   ; One stack slot per parameter and per local variable (width, i).
   %c.addr = alloca float*, align 8
   %a.addr = alloca float*, align 8
   %b.addr = alloca float*, align 8
   %start.addr = alloca i32, align 4
   %end.addr = alloca i32, align 4
   %width = alloca i32, align 4
   %i = alloca i32, align 4
   ; Spill the incoming arguments to their slots.
   store float* %c, float** %c.addr, align 8
   store float* %a, float** %a.addr, align 8
   store float* %b, float** %b.addr, align 8
   store i32 %start, i32* %start.addr, align 4
   store i32 %end, i32* %end.addr, align 4
   ; width = 256. The slot is never loaded in this function; the adds below
   ; use the literal 256 instead.
   store i32 256, i32* %width, align 4
   ; i = start
   %0 = load i32* %start.addr, align 4
   store i32 %0, i32* %i, align 4
   br label %for.cond

for.cond: ; preds = %for.inc, %entry
   ; Loop test: continue into the body while i < end.
   %1 = load i32* %i, align 4
   %2 = load i32* %end.addr, align 4
   %cmp = icmp slt i32 %1, %2
   br i1 %cmp, label %for.body, label %for.end

for.body: ; preds = %for.cond
   ; %5 = a[i]
   %3 = load i32* %i, align 4
   %idxprom = sext i32 %3 to i64
   %4 = load float** %a.addr, align 8
   %arrayidx = getelementptr inbounds float* %4, i64 %idxprom
   %5 = load float* %arrayidx, align 4
   ; %8 = b[i]
   %6 = load i32* %i, align 4
   %idxprom1 = sext i32 %6 to i64
   %7 = load float** %b.addr, align 8
   %arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1
   %8 = load float* %arrayidx2, align 4
   ; c[i] = a[i] + b[i]
   %add = fadd float %5, %8
   %9 = load i32* %i, align 4
   %idxprom3 = sext i32 %9 to i64
   %10 = load float** %c.addr, align 8
   %arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3
   store float %add, float* %arrayidx4, align 4
   ; %13 = a[256 + i]; the 32-bit add happens first, then the sign-extend.
   %11 = load i32* %i, align 4
   %add5 = add nsw i32 256, %11
   %idxprom6 = sext i32 %add5 to i64
   %12 = load float** %a.addr, align 8
   %arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6
   %13 = load float* %arrayidx7, align 4
   ; %16 = b[256 + i]
   %14 = load i32* %i, align 4
   %add8 = add nsw i32 256, %14
   %idxprom9 = sext i32 %add8 to i64
   %15 = load float** %b.addr, align 8
   %arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9
   %16 = load float* %arrayidx10, align 4
   ; c[256 + i] = a[256 + i] + b[256 + i]
   %add11 = fadd float %13, %16
   %17 = load i32* %i, align 4
   %add12 = add nsw i32 256, %17
   %idxprom13 = sext i32 %add12 to i64
   %18 = load float** %c.addr, align 8
   %arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13
   store float %add11, float* %arrayidx14, align 4
   br label %for.inc

for.inc: ; preds = %for.body
   ; ++i, written back to the stack slot before re-testing the bound.
   %19 = load i32* %i, align 4
   %inc = add nsw i32 %19, 1
   store i32 %inc, i32* %i, align 4
   br label %for.cond

for.end: ; preds = %for.cond
   ret void
}

attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"}

Any ideas why the vectorizer doesn't like my code?

Frank

Frank,

It looks like the loop vectorizer is unable to tell that the two stores in your code never overlap. This is probably because of the sign-extend in your code. Can you extend the indices to 64 bits?

Thanks,
Nadav

Hi Nadav,

right! The sign-extend was the problem. Hmm.. Is this a bug or a feature?

Frank