Understanding Loop Vectorized IR

Hello,
For the C code given below:

#include<stdio.h>
int a=0;
/* Reproducer: steps the global `a` from 0 through 8 with an empty loop body,
 * so `a` holds 9 once the condition `a <= 8` fails; always returns 2. */
int d() {
  int e = 2;  /* never modified; this is the value returned */
  for (a = 0; a <= 8; a++)
    ;  /* empty body: the loop's only effect is on the global `a` */
  return e;
}
/* Calls d() and prints the resulting value of the global `a` plus a newline.
 * NOTE(review): `void main` is not the standard `int main` signature; it is
 * left unchanged so the reproducer stays as posted. */
void main() {
  int f = 0;  /* unused */
  d();  /* return value ignored; called for its effect on `a` */
  printf("%d\n",a);
}
$clang -O3 -c -emit-llvm  -mllvm -disable-llvm-optzns small.c

$opt -gvn -licm -loop-rotate -loop-vectorize   small.bc -o small-opt.bc

I see the vectorized IR as follows:
; Vectorized form of d(): stores 0 to @a, runs the 16-wide vector body once,
; then stores lane 14 of %17 (= 14 + 1 = 15) to @a and returns 2.
; Function Attrs: nounwind uwtable
define dso_local i32 @d() #0 {
entry:
  %e = alloca i32, align 4
  %0 = bitcast i32* %e to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
  store i32 2, i32* %e, align 4, !tbaa !2
  store i32 0, i32* @a, align 4, !tbaa !2
  %a.promoted = load i32, i32* @a, align 4, !tbaa !2
  ; The condition is the constant false, so control always goes to vector.ph.
  br i1 false, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %entry
  %vector.recur.init = insertelement <16 x i32> undef, i32 %a.promoted, i32 15
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vector.recur = phi <16 x i32> [ %vector.recur.init, %vector.ph ], [ %17, %vector.body ]
  %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  ; %1 .. %16 (%index plus lane offsets 0 .. 15): none of these results is used.
  %1 = add i32 %index, 0
  %2 = add i32 %index, 1
  %3 = add i32 %index, 2
  %4 = add i32 %index, 3
  %5 = add i32 %index, 4
  %6 = add i32 %index, 5
  %7 = add i32 %index, 6
  %8 = add i32 %index, 7
  %9 = add i32 %index, 8
  %10 = add i32 %index, 9
  %11 = add i32 %index, 10
  %12 = add i32 %index, 11
  %13 = add i32 %index, 12
  %14 = add i32 %index, 13
  %15 = add i32 %index, 14
  %16 = add i32 %index, 15
  %17 = add nsw <16 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ; %18 is never used.
  %18 = shufflevector <16 x i32> %vector.recur, <16 x i32> %17, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
  ; %19 is never used.
  %19 = icmp ule <16 x i32> %vec.ind, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %index.next = add i32 %index, 16
  %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ; %index starts at 0, so %index.next is 16 on the first pass and the body runs exactly once.
  %20 = icmp eq i32 %index.next, 16
  br i1 %20, label %middle.block, label %vector.body, !llvm.loop !6

middle.block:                                     ; preds = %vector.body
  %vector.recur.extract = extractelement <16 x i32> %17, i32 15
  ; Lane 14 of %17 is 14 + 1 = 15; this is the value that reaches the store in for.end.
  %vector.recur.extract.for.phi = extractelement <16 x i32> %17, i32 14
  ; The condition is the constant true, so the scalar loop below is skipped.
  br i1 true, label %for.end, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %entry
  %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %a.promoted, %entry ]
  %bc.resume.val = phi i32 [ 16, %middle.block ], [ 0, %entry ]
  br label %for.cond

for.cond:                                         ; preds = %for.cond, %scalar.ph
  %scalar.recur = phi i32 [ %inc, %for.cond ], [ %scalar.recur.init, %scalar.ph ]
  %21 = phi i32 [ %inc, %for.cond ], [ %bc.resume.val, %scalar.ph ]
  %cmp = icmp sle i32 %21, 8
  %inc = add nsw i32 %21, 1
  br i1 %cmp, label %for.cond, label %for.end, !llvm.loop !8

for.end:                                          ; preds = %middle.block, %for.cond
  %inc1.lcssa = phi i32 [ %scalar.recur, %for.cond ], [ %vector.recur.extract.for.phi, %middle.block ]
  ; Final value written back to the global @a.
  store i32 %inc1.lcssa, i32* @a, align 4, !tbaa !2
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3
  ret i32 2
}

As highlighted above, the results of a few instructions (for example %1 through %16, %18 and %19 in vector.body) are never used. Can someone explain why?

Also, this gives the output 15 (the value of variable a), which is incorrect; the output should be 9. However, I don't see any problem with the vectorized code as such, and hence a = 15 is not surprising.

So is the solution to this problem that the loop should never have been vectorized?

-Vivek

I’m no loop vectorizer expert, but it seems that it relies on instcombine to remove redundant instructions. Adding -instcombine at the end of the opt pipeline gets rid of these instructions.

Alexey