loop vectorizer and storing to uniform addresses

I am trying my luck on this global reduction kernel:

/* Reduction kernel: for each i in [start, end), adds the four floats
 * A[i*4+0..3] into four separate accumulators sum[0..3], then returns
 * the total of the four accumulators. */
float foo( int start , int end , float * A )
{
   float sum[4] = {0.,0.,0.,0.};   /* one partial sum per value of q */
   for (int i = start ; i < end ; ++i ) {
     /* inner loop has a fixed trip count of 4 */
     for (int q = 0 ; q < 4 ; ++q )
       sum[q] += A[i*4+q];   /* load-add-store on sum[q] each iteration */
   }
   return sum[0]+sum[1]+sum[2]+sum[3];
}

LV: Checking a loop in "foo"
LV: Found a loop: for.cond1
LV: Found an induction variable.
LV: We don't allow storing to uniform addresses
LV: Can't vectorize due to memory conflicts
LV: Not vectorizing.

My interpretation would be that the loop vectorizer does not support loops which implement a reduction. Is this correct?

Frank

Can you attach the incoming IR ?

Yes, we support reductions: http://llvm.org/docs/Vectorizers.html

This is triggering because the vectorizer didn't recognize sum[q] as a
reduction variable during canVectorizeInstrs(), but did recognize that
sum[q] is loop invariant in canVectorizeMemory().

I'm guessing the nested loop was unrolled because of the low trip-count,
and removed, so it ended up as:

/* Hypothesised shape of foo after the 4-iteration inner q-loop has been
 * fully unrolled: the body of the i-loop performs four independent
 * accumulations, one per element of sum. */
float foo( int start , int end , float * A )
{
  float sum[4] = {0.,0.,0.,0.};
  for (int i = start ; i < end ; ++i ) {
    /* q = 0..3 written out explicitly */
    sum[0] += A[i*4+0];
    sum[1] += A[i*4+1];
    sum[2] += A[i*4+2];
    sum[3] += A[i*4+3];
  }
  return sum[0]+sum[1]+sum[2]+sum[3];
}

but, for some reason, sum[q] wasn't recognized as a reduction variable,
maybe because it was an array of reduction variables?

Having the IR would certainly help...

cheers,
--renato

I changed the input C to use a 64-bit type for the loop index (this eliminates the 'sext' instructions in the IR).

Here is the IR produced with clang -O0:

; IR for foo(start, end, A). Every variable (the arguments, sum, i, q) is
; held in an alloca and accessed through explicit loads and stores.
; Block layout: for.cond/for.body/for.inc6 form the outer loop over i;
; for.cond1/for.body3/for.inc form the inner loop over q.
define float @foo(i64 %start, i64 %end, float* %A) #0 {
entry:
   ; Stack slots for the arguments, the sum[4] array and both loop counters.
   %start.addr = alloca i64, align 8
   %end.addr = alloca i64, align 8
   %A.addr = alloca float*, align 8
   %sum = alloca [4 x float], align 16
   %i = alloca i64, align 8
   %q = alloca i64, align 8
   ; Copy the incoming arguments into their stack slots.
   store i64 %start, i64* %start.addr, align 8
   store i64 %end, i64* %end.addr, align 8
   store float* %A, float** %A.addr, align 8
   ; Zero-fill the 16 bytes of %sum.
   %0 = bitcast [4 x float]* %sum to i8*
   call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 16, i32 16, i1 false)
   ; i = start
   %1 = load i64* %start.addr, align 8
   store i64 %1, i64* %i, align 8
   br label %for.cond

; Outer loop test: continue while i < end (signed compare).
for.cond: ; preds = %for.inc6, %entry
   %2 = load i64* %i, align 8
   %3 = load i64* %end.addr, align 8
   %cmp = icmp slt i64 %2, %3
   br i1 %cmp, label %for.body, label %for.end8

; Outer loop body: q = 0, then enter the inner loop.
for.body: ; preds = %for.cond
   store i64 0, i64* %q, align 8
   br label %for.cond1

; Inner loop test: continue while q < 4 (signed compare).
for.cond1: ; preds = %for.inc, %for.body
   %4 = load i64* %q, align 8
   %cmp2 = icmp slt i64 %4, 4
   br i1 %cmp2, label %for.body3, label %for.end

; Inner loop body: sum[q] += A[i*4 + q].
for.body3: ; preds = %for.cond1
   ; %add = i*4 + q
   %5 = load i64* %i, align 8
   %mul = mul nsw i64 %5, 4
   %6 = load i64* %q, align 8
   %add = add nsw i64 %mul, %6
   ; %8 = A[i*4 + q]
   %7 = load float** %A.addr, align 8
   %arrayidx = getelementptr inbounds float* %7, i64 %add
   %8 = load float* %arrayidx, align 4
   ; %arrayidx4 = &sum[q]; load the old value, add, store back to the same address
   %9 = load i64* %q, align 8
   %arrayidx4 = getelementptr inbounds [4 x float]* %sum, i32 0, i64 %9
   %10 = load float* %arrayidx4, align 4
   %add5 = fadd float %10, %8
   store float %add5, float* %arrayidx4, align 4
   br label %for.inc

; Inner loop increment: q = q + 1.
for.inc: ; preds = %for.body3
   %11 = load i64* %q, align 8
   %inc = add nsw i64 %11, 1
   store i64 %inc, i64* %q, align 8
   br label %for.cond1

; Inner loop exit: fall through to the outer increment.
for.end: ; preds = %for.cond1
   br label %for.inc6

; Outer loop increment: i = i + 1.
for.inc6: ; preds = %for.end
   %12 = load i64* %i, align 8
   %inc7 = add nsw i64 %12, 1
   store i64 %inc7, i64* %i, align 8
   br label %for.cond

; Outer loop exit: return ((sum[0] + sum[1]) + sum[2]) + sum[3].
for.end8: ; preds = %for.cond
   %arrayidx9 = getelementptr inbounds [4 x float]* %sum, i32 0, i64 0
   %13 = load float* %arrayidx9, align 4
   %arrayidx10 = getelementptr inbounds [4 x float]* %sum, i32 0, i64 1
   %14 = load float* %arrayidx10, align 4
   %add11 = fadd float %13, %14
   %arrayidx12 = getelementptr inbounds [4 x float]* %sum, i32 0, i64 2
   %15 = load float* %arrayidx12, align 4
   %add13 = fadd float %add11, %15
   %arrayidx14 = getelementptr inbounds [4 x float]* %sum, i32 0, i64 3
   %16 = load float* %arrayidx14, align 4
   %add15 = fadd float %add13, %16
   ret float %add15
}

Thus, the inner loop is not unrolled.

opt -basicaa -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count=4 -S sum.ll

LV: Checking a loop in "foo"
LV: Found a loop: for.cond1
LV: SCEV could not compute the loop exit count.
LV: Not vectorizing.

opt -basicaa -gvn -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count=4 -S sum.ll

LV: Checking a loop in "foo"
LV: Found a loop: for.cond1
LV: Found an induction variable.
LV: We don't allow storing to uniform addresses
LV: Can't vectorize due to memory conflicts
LV: Not vectorizing.

Frank

You did not run mem2reg before running the vectorizer. All of your variables are still in allocas. You need to run the standard llvm optimization pipe (or an approximation of it) before running the vectorizer.