loop vectorizer: Unexpected extract/insertelement

The following IR implements the following nested loop:

for (int i = start ; i < end ; ++i )
   for (int p = 0 ; p < 4 ; ++p )
     a[i*4+p] = b[i*4+p] + c[i*4+p];

define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) {
entrypoint:
   br i1 %arg2, label %L0, label %L1

L0: ; preds = %entrypoint
   %0 = add nsw i64 %arg0, %arg3
   %1 = add nsw i64 %arg1, %arg3
   br label %L2

L1: ; preds = %entrypoint
   br label %L2

L2: ; preds = %L0, %L1
   %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ]
   %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ]
   %4 = sdiv i64 %2, 4
   %5 = sdiv i64 %3, 4
   br label %L5

L3: ; preds = %L3, %L5
   %6 = phi i64 [ %15, %L3 ], [ 0, %L5 ]
   %7 = mul i64 %19, 4
   %8 = add nsw i64 %7, %6
   %9 = getelementptr float* %arg5, i64 %8
   %10 = load float* %9
   %11 = getelementptr float* %arg6, i64 %8
   %12 = load float* %11
   %13 = fadd float %12, %10
   %14 = getelementptr float* %arg4, i64 %8
   store float %13, float* %14
   %15 = add nsw i64 %6, 1
   %16 = icmp sge i64 %15, 4
   br i1 %16, label %L4, label %L3

L4: ; preds = %L3
   %17 = add nsw i64 %19, 1
   %18 = icmp sge i64 %17, %5
   br i1 %18, label %L6, label %L5

L5: ; preds = %L4, %L2
   %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ]
   br label %L3

L6: ; preds = %L4
   ret void
}

L3 is the inner loop with constant trip count 4.

When calling the loop vectorizer,

opt -loop-vectorize -debug-only=loop-vectorize -vectorizer-min-trip-count 4 loop4.ll -S

LV:
Checking a loop in "main"
LV: Found a loop: L3
LV: Found an induction variable.
LV: We need to do 0 pointer comparisons.
LV: We don't need a runtime memory check.
LV: We can vectorize this loop!
LV: Found trip count: 4
LV: The Widest type: 32 bits.
LV: The Widest register is: 128 bits.
....
LV: Vector loop of width 4 costs: 7.
LV: Selecting VF = : 4.
LV: Found a vectorizable loop (4) in loop4.ll
LV: Unroll Factor is 1

we can see that the loop was vectorized. However, the code that is produced doesn't look too good:

define void @main(i64 %arg0, i64 %arg1, i1 %arg2, i64 %arg3, float* noalias %arg4, float* noalias %arg5, float* noalias %arg6) {
entrypoint:
   br i1 %arg2, label %L0, label %L1

L0: ; preds = %entrypoint
   %0 = add nsw i64 %arg0, %arg3
   %1 = add nsw i64 %arg1, %arg3
   br label %L2

L1: ; preds = %entrypoint
   br label %L2

L2: ; preds = %L1, %L0
   %2 = phi i64 [ %arg0, %L1 ], [ %0, %L0 ]
   %3 = phi i64 [ %arg1, %L1 ], [ %1, %L0 ]
   %4 = sdiv i64 %2, 4
   %5 = sdiv i64 %3, 4
   br label %L5

L3: ; preds = %scalar.ph, %L3
   %6 = phi i64 [ %15, %L3 ], [ %trunc.resume.val, %scalar.ph ]
   %7 = mul i64 %19, 4
   %8 = add nsw i64 %7, %6
   %9 = getelementptr float* %arg5, i64 %8
   %10 = load float* %9
   %11 = getelementptr float* %arg6, i64 %8
   %12 = load float* %11
   %13 = fadd float %12, %10
   %14 = getelementptr float* %arg4, i64 %8
   store float %13, float* %14
   %15 = add nsw i64 %6, 1
   %16 = icmp sge i64 %15, 4
   br i1 %16, label %L4, label %L3, !llvm.loop !0

L4: ; preds = %middle.block, %L3
   %17 = add nsw i64 %19, 1
   %18 = icmp sge i64 %17, %5
   br i1 %18, label %L6, label %L5

L5: ; preds = %L4, %L2
   %19 = phi i64 [ %17, %L4 ], [ %4, %L2 ]
   br i1 false, label %middle.block, label %vector.ph

vector.ph: ; preds = %L5
   %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0
   %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer
   br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0
   %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
   %20 = mul <4 x i64> %broadcast.splat2, <i64 4, i64 4, i64 4, i64 4>
   %21 = add nsw <4 x i64> %20, %induction
   %22 = extractelement <4 x i64> %21, i32 0
   %23 = getelementptr float* %arg5, i64 %22
   %24 = insertelement <4 x float*> undef, float* %23, i32 0
   %25 = extractelement <4 x i64> %21, i32 1
   %26 = getelementptr float* %arg5, i64 %25
   %27 = insertelement <4 x float*> %24, float* %26, i32 1
   %28 = extractelement <4 x i64> %21, i32 2
   %29 = getelementptr float* %arg5, i64 %28
   %30 = insertelement <4 x float*> %27, float* %29, i32 2
   %31 = extractelement <4 x i64> %21, i32 3
   %32 = getelementptr float* %arg5, i64 %31
   %33 = insertelement <4 x float*> %30, float* %32, i32 3
   %34 = extractelement <4 x i64> %21, i32 0
   %35 = getelementptr float* %arg5, i64 %34
   %36 = getelementptr float* %35, i32 0
   %37 = bitcast float* %36 to <4 x float>*
   %wide.load = load <4 x float>* %37
   %38 = extractelement <4 x i64> %21, i32 0
   %39 = getelementptr float* %arg6, i64 %38
   %40 = insertelement <4 x float*> undef, float* %39, i32 0
   %41 = extractelement <4 x i64> %21, i32 1
   %42 = getelementptr float* %arg6, i64 %41
   %43 = insertelement <4 x float*> %40, float* %42, i32 1
   %44 = extractelement <4 x i64> %21, i32 2
   %45 = getelementptr float* %arg6, i64 %44
   %46 = insertelement <4 x float*> %43, float* %45, i32 2
   %47 = extractelement <4 x i64> %21, i32 3
   %48 = getelementptr float* %arg6, i64 %47
   %49 = insertelement <4 x float*> %46, float* %48, i32 3
   %50 = extractelement <4 x i64> %21, i32 0
   %51 = getelementptr float* %arg6, i64 %50
   %52 = getelementptr float* %51, i32 0
   %53 = bitcast float* %52 to <4 x float>*
   %wide.load3 = load <4 x float>* %53
   %54 = fadd <4 x float> %wide.load3, %wide.load
   %55 = extractelement <4 x i64> %21, i32 0
   %56 = getelementptr float* %arg4, i64 %55
   %57 = insertelement <4 x float*> undef, float* %56, i32 0
   %58 = extractelement <4 x i64> %21, i32 1
   %59 = getelementptr float* %arg4, i64 %58
   %60 = insertelement <4 x float*> %57, float* %59, i32 1
   %61 = extractelement <4 x i64> %21, i32 2
   %62 = getelementptr float* %arg4, i64 %61
   %63 = insertelement <4 x float*> %60, float* %62, i32 2
   %64 = extractelement <4 x i64> %21, i32 3
   %65 = getelementptr float* %arg4, i64 %64
   %66 = insertelement <4 x float*> %63, float* %65, i32 3
   %67 = extractelement <4 x i64> %21, i32 0
   %68 = getelementptr float* %arg4, i64 %67
   %69 = getelementptr float* %68, i32 0
   %70 = bitcast float* %69 to <4 x float>*
   store <4 x float> %54, <4 x float>* %70
   %71 = add nsw <4 x i64> %induction, <i64 1, i64 1, i64 1, i64 1>
   %72 = icmp sge <4 x i64> %71, <i64 4, i64 4, i64 4, i64 4>
   %index.next = add i64 %index, 4
   %73 = icmp eq i64 %index.next, 4
   br i1 %73, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body, %L5
   %resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ]
   %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ]
   %cmp.n = icmp eq i64 4, %resume.val
   br i1 %cmp.n, label %L4, label %scalar.ph

scalar.ph: ; preds = %middle.block
   br label %L3

L6: ; preds = %L4
   ret void
}

Why did the loop vectorizer produced so many insertelement and extractelement instructions?

I don't remember that those instructions entered when vectorizing other loops. Is this harmless? Which pass can clean up these instructions?

Frank

The loop vectorizer relies on cleanup passes to be run after it:

from Transforms/IPO/PassManagerBuilder.cpp:

    // Add the various vectorization passes and relevant cleanup passes for
    // them since we are no longer in the middle of the main scalar pipeline.
    MPM.add(createLoopVectorizePass(DisableUnrollLoops));
    MPM.add(createInstructionCombiningPass());
    MPM.add(createCFGSimplificationPass());

The instcombine pass cleans up a lot.

Any idea why there are still shufflevector, insertelement, *and* bitcast (!!) etc. instructions left? The original loop is so clean, a textbook example I'd say. There is no need to shuffle anything.At least I don't see it.

Frank

vector.ph: ; preds = %L5
   %broadcast.splatinsert1 = insertelement <4 x i64> undef, i64 %19, i32 0
   %broadcast.splat2 = shufflevector <4 x i64> %broadcast.splatinsert1, <4 x i64> undef, <4 x i32> zeroinitializer
   br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0
   %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
   %20 = shl <4 x i64> %broadcast.splat2, <i64 2, i64 2, i64 2, i64 2>
   %21 = add nsw <4 x i64> %20, %induction
   %22 = extractelement <4 x i64> %21, i32 0
   %23 = getelementptr float* %arg5, i64 %22
   %24 = bitcast float* %23 to <4 x float>*
   %wide.load = load <4 x float>* %24, align 16
   %25 = extractelement <4 x i64> %21, i32 0
   %26 = getelementptr float* %arg6, i64 %25
   %27 = bitcast float* %26 to <4 x float>*
   %wide.load3 = load <4 x float>* %27, align 16
   %28 = fadd <4 x float> %wide.load3, %wide.load
   %29 = extractelement <4 x i64> %21, i32 0
   %30 = getelementptr float* %arg4, i64 %29
   %31 = bitcast float* %30 to <4 x float>*
   store <4 x float> %28, <4 x float>* %31, align 16
   %index.next = add i64 %index, 4
   %32 = icmp eq i64 %index, 0
   br i1 %32, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body, %L5
   %resume.val = phi i1 [ false, %L5 ], [ true, %vector.body ]
   %trunc.resume.val = phi i64 [ 0, %L5 ], [ 4, %vector.body ]
   br i1 %resume.val, label %L4, label %scalar.ph

scalar.ph: ; preds = %middle.block
   br label %L3

Yes, you need the latest ToT version of llvm or you run

-loop-vectorize -earlycse -instcombine -simplifycfg

The bitcast essentially is a noop to satisfy the type system.

This is how your example looks like for me:

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %.lhs = shl i64 %6, 2
  %7 = add i64 %.lhs, %index
  %8 = getelementptr float* %arg5, i64 %7
  %9 = bitcast float* %8 to <4 x float>*
  %wide.load = load <4 x float>* %9, align 16
  %10 = getelementptr float* %arg6, i64 %7
  %11 = bitcast float* %10 to <4 x float>*
  %wide.load3 = load <4 x float>* %11, align 16
  %12 = fadd <4 x float> %wide.load3, %wide.load
  %13 = getelementptr float* %arg4, i64 %7
  %14 = bitcast float* %13 to <4 x float>*
  store <4 x float> %12, <4 x float>* %14, align 16
  %index.next = add i64 %index, 4
  %15 = icmp eq i64 %index, 0
  br i1 %15, label %middle.block, label %vector.body

Even if the subsequent passes don't clean up everything, the back-ends will
understand those patterns as specific vector instructions. Actually, if the
casts and shuffles are not there, you'll end up with the wrong instruction
being selected (or just broken IR).

cheers,
--renato