Enabling vectorization with LLVM 3.3 for a DSL emitting LLVM IR

Hi,

Our DSL can generate C or directly generate LLVM IR. With LLVM 3.3, we can vectorize the C produced code using clang with -O3, or clang with -O1 then opt -O3 -vectorize-loops. But the same program generating LLVM IR version cannot be vectorized with opt -O3 -vectorize-loops. So our guess is that our generated LLVM IR lacks some informations that are needed by the vectorization passes to correctly work.

Any idea of what could be lacking?

Thanks

Stéphane Letz

Without any knowledge about the code guessing is hard. You may miss the 'noalias' keyword or nsw/nuw flags, but there are many possibilities.

If you add '-debug' to opt you may get some hints. Also, if you have a small test case, posting the LLVM-IR may help.

Cheers,
Tobias

Hi Tobias,

1) Here is a simple C loop generated by our C backend:

void computemydsp(mydsp* dsp, int count, float** inputs, float** outputs) {
  float* input0 = inputs[0];
  float* input1 = inputs[1];
  float* output0 = outputs[0];
  /* C99 loop */
  {
    int i;
    for (i = 0; (i < count); i = (i + 1)) {
      output0[i] = (float)((float)input0[i] + (float)input1[i]);
      
    }
  }
}

2) Compiling it with "clang -O3" vectorize it directly:

define void @computemydsp(%struct.mydsp* nocapture %dsp, i32 %count, float** nocapture %inputs, float** nocapture %outputs) #0 {
entry:
  %0 = load float** %inputs, align 8, !tbaa !3
  %arrayidx1 = getelementptr inbounds float** %inputs, i64 1
  %1 = load float** %arrayidx1, align 8, !tbaa !3
  %2 = load float** %outputs, align 8, !tbaa !3
  %cmp14 = icmp sgt i32 %count, 0
  br i1 %cmp14, label %for.body.lr.ph, label %for.end

for.body.lr.ph: ; preds = %entry
  %cnt.cast = zext i32 %count to i64
  %n.vec = and i64 %cnt.cast, 4294967288
  %cmp.zero = icmp eq i64 %n.vec, 0
  %3 = add i32 %count, -1
  %4 = zext i32 %3 to i64
  %scevgep = getelementptr float* %2, i64 %4
  br i1 %cmp.zero, label %middle.block, label %vector.memcheck

vector.memcheck: ; preds = %for.body.lr.ph
  %scevgep19 = getelementptr float* %1, i64 %4
  %scevgep17 = getelementptr float* %0, i64 %4
  %bound122 = icmp ule float* %1, %scevgep
  %bound021 = icmp ule float* %2, %scevgep19
  %bound1 = icmp ule float* %0, %scevgep
  %bound0 = icmp ule float* %2, %scevgep17
  %found.conflict23 = and i1 %bound021, %bound122
  %found.conflict = and i1 %bound0, %bound1
  %conflict.rdx = or i1 %found.conflict, %found.conflict23
  br i1 %conflict.rdx, label %middle.block, label %vector.body

vector.body: ; preds = %vector.memcheck, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.memcheck ]
  %5 = getelementptr inbounds float* %0, i64 %index
  %6 = bitcast float* %5 to <4 x float>*
  %wide.load = load <4 x float>* %6, align 4
  %.sum32 = or i64 %index, 4
  %7 = getelementptr float* %0, i64 %.sum32
  %8 = bitcast float* %7 to <4 x float>*
  %wide.load25 = load <4 x float>* %8, align 4
  %9 = getelementptr inbounds float* %1, i64 %index
  %10 = bitcast float* %9 to <4 x float>*
  %wide.load26 = load <4 x float>* %10, align 4
  %.sum33 = or i64 %index, 4
  %11 = getelementptr float* %1, i64 %.sum33
  %12 = bitcast float* %11 to <4 x float>*
  %wide.load27 = load <4 x float>* %12, align 4
  %13 = fadd <4 x float> %wide.load, %wide.load26
  %14 = fadd <4 x float> %wide.load25, %wide.load27
  %15 = getelementptr inbounds float* %2, i64 %index
  %16 = bitcast float* %15 to <4 x float>*
  store <4 x float> %13, <4 x float>* %16, align 4
  %.sum34 = or i64 %index, 4
  %17 = getelementptr float* %2, i64 %.sum34
  %18 = bitcast float* %17 to <4 x float>*
  store <4 x float> %14, <4 x float>* %18, align 4
  %index.next = add i64 %index, 8
  %19 = icmp eq i64 %index.next, %n.vec
  br i1 %19, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body, %vector.memcheck, %for.body.lr.ph
  %resume.val = phi i64 [ 0, %for.body.lr.ph ], [ 0, %vector.memcheck ], [ %n.vec, %vector.body ]
  %cmp.n = icmp eq i64 %cnt.cast, %resume.val
  br i1 %cmp.n, label %for.end, label %for.body

for.body: ; preds = %middle.block, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %resume.val, %middle.block ]
  %arrayidx3 = getelementptr inbounds float* %0, i64 %indvars.iv
  %20 = load float* %arrayidx3, align 4, !tbaa !4
  %arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
  %21 = load float* %arrayidx5, align 4, !tbaa !4
  %add = fadd float %20, %21
  %arrayidx7 = getelementptr inbounds float* %2, i64 %indvars.iv
  store float %add, float* %arrayidx7, align 4, !tbaa !4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %count
  br i1 %exitcond, label %for.end, label %for.body, !llvm.vectorizer.already_vectorized !5

for.end: ; preds = %middle.block, %for.body, %entry
  ret void
}

; Function Attrs: nounwind ssp uwtable
define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
entry:
  ret i32 0
}

3) compiling it with "clang -O1"

; Function Attrs: nounwind ssp uwtable
define void @computemydsp(%struct.mydsp* nocapture %dsp, i32 %count, float** nocapture %inputs, float** nocapture %outputs) #0 {
entry:
  %0 = load float** %inputs, align 8, !tbaa !3
  %arrayidx1 = getelementptr inbounds float** %inputs, i64 1
  %1 = load float** %arrayidx1, align 8, !tbaa !3
  %2 = load float** %outputs, align 8, !tbaa !3
  %cmp14 = icmp sgt i32 %count, 0
  br i1 %cmp14, label %for.body, label %for.end

for.body: ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx3 = getelementptr inbounds float* %0, i64 %indvars.iv
  %3 = load float* %arrayidx3, align 4, !tbaa !4
  %arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
  %4 = load float* %arrayidx5, align 4, !tbaa !4
  %add = fadd float %3, %4
  %arrayidx7 = getelementptr inbounds float* %2, i64 %indvars.iv
  store float %add, float* %arrayidx7, align 4, !tbaa !4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %count
  br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
  ret void
}

4) then using "opt -o3 -vectorize-loops" vectorize it:

; Function Attrs: nounwind ssp uwtable
define void @computemydsp(%struct.mydsp* nocapture %dsp, i32 %count, float** nocapture %inputs, float** nocapture %outputs) #0 {
entry:
  %0 = load float** %inputs, align 8, !tbaa !3
  %arrayidx1 = getelementptr inbounds float** %inputs, i64 1
  %1 = load float** %arrayidx1, align 8, !tbaa !3
  %2 = load float** %outputs, align 8, !tbaa !3
  %cmp14 = icmp sgt i32 %count, 0
  br i1 %cmp14, label %for.body.preheader, label %for.end

for.body.preheader: ; preds = %entry
  %cnt.cast = zext i32 %count to i64
  %3 = urem i32 %count, 24
  %n.mod.vf = zext i32 %3 to i64
  %n.vec = sub i64 %cnt.cast, %n.mod.vf
  %cmp.zero = icmp eq i32 %3, %count
  %4 = add i32 %count, -1
  %5 = zext i32 %4 to i64
  %scevgep = getelementptr float* %2, i64 %5
  br i1 %cmp.zero, label %middle.block, label %vector.memcheck

vector.memcheck: ; preds = %for.body.preheader
  %scevgep6 = getelementptr float* %1, i64 %5
  %scevgep4 = getelementptr float* %0, i64 %5
  %bound19 = icmp ule float* %1, %scevgep
  %bound08 = icmp ule float* %2, %scevgep6
  %bound1 = icmp ule float* %0, %scevgep
  %bound0 = icmp ule float* %2, %scevgep4
  %found.conflict10 = and i1 %bound08, %bound19
  %found.conflict = and i1 %bound0, %bound1
  %conflict.rdx = or i1 %found.conflict, %found.conflict10
  br i1 %conflict.rdx, label %middle.block, label %vector.body

vector.body: ; preds = %vector.memcheck, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.memcheck ]
  %6 = getelementptr inbounds float* %0, i64 %index
  %7 = bitcast float* %6 to <8 x float>*
  %wide.load = load <8 x float>* %7, align 4
  %.sum = add i64 %index, 8
  %8 = getelementptr float* %0, i64 %.sum
  %9 = bitcast float* %8 to <8 x float>*
  %wide.load13 = load <8 x float>* %9, align 4
  %.sum23 = add i64 %index, 16
  %10 = getelementptr float* %0, i64 %.sum23
  %11 = bitcast float* %10 to <8 x float>*
  %wide.load14 = load <8 x float>* %11, align 4
  %12 = getelementptr inbounds float* %1, i64 %index
  %13 = bitcast float* %12 to <8 x float>*
  %wide.load15 = load <8 x float>* %13, align 4
  %.sum24 = add i64 %index, 8
  %14 = getelementptr float* %1, i64 %.sum24
  %15 = bitcast float* %14 to <8 x float>*
  %wide.load16 = load <8 x float>* %15, align 4
  %.sum25 = add i64 %index, 16
  %16 = getelementptr float* %1, i64 %.sum25
  %17 = bitcast float* %16 to <8 x float>*
  %wide.load17 = load <8 x float>* %17, align 4
  %18 = fadd <8 x float> %wide.load, %wide.load15
  %19 = fadd <8 x float> %wide.load13, %wide.load16
  %20 = fadd <8 x float> %wide.load14, %wide.load17
  %21 = getelementptr inbounds float* %2, i64 %index
  %22 = bitcast float* %21 to <8 x float>*
  store <8 x float> %18, <8 x float>* %22, align 4
  %.sum26 = add i64 %index, 8
  %23 = getelementptr float* %2, i64 %.sum26
  %24 = bitcast float* %23 to <8 x float>*
  store <8 x float> %19, <8 x float>* %24, align 4
  %.sum27 = add i64 %index, 16
  %25 = getelementptr float* %2, i64 %.sum27
  %26 = bitcast float* %25 to <8 x float>*
  store <8 x float> %20, <8 x float>* %26, align 4
  %index.next = add i64 %index, 24
  %27 = icmp eq i64 %index.next, %n.vec
  br i1 %27, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body, %vector.memcheck, %for.body.preheader
  %resume.val = phi i64 [ 0, %for.body.preheader ], [ 0, %vector.memcheck ], [ %n.vec, %vector.body ]
  %cmp.n = icmp eq i64 %cnt.cast, %resume.val
  br i1 %cmp.n, label %for.end, label %for.body

for.body: ; preds = %middle.block, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %resume.val, %middle.block ]
  %arrayidx3 = getelementptr inbounds float* %0, i64 %indvars.iv
  %28 = load float* %arrayidx3, align 4, !tbaa !4
  %arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
  %29 = load float* %arrayidx5, align 4, !tbaa !4
  %add = fadd float %28, %29
  %arrayidx7 = getelementptr inbounds float* %2, i64 %indvars.iv
  store float %add, float* %arrayidx7, align 4, !tbaa !4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv1 = trunc i64 %indvars.iv.next to i32
  %exitcond2 = icmp eq i32 %lftr.wideiv1, %count
  br i1 %exitcond2, label %for.end, label %for.body, !llvm.vectorizer.already_vectorized !5

for.end: ; preds = %middle.block, %for.body, %entry
  ret void
}

5) producing LLVM IR with our LLVM backend :

define void @compute_mydsp(%struct.dsp_mydsp* %dsp, i32 %fullcount, float** noalias %inputs, float** noalias %outputs) {
block_code:
  br label %code_block

code_block: ; preds = %block_code
  %0 = getelementptr inbounds float** %inputs, i32 0
  %1 = load float** %0
  %2 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 0
  store float* %1, float** %2
  %fInput0 = alloca float*
  %3 = getelementptr inbounds float** %inputs, i32 1
  %4 = load float** %3
  %5 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 1
  store float* %4, float** %5
  %fInput1 = alloca float*
  %6 = getelementptr inbounds float** %outputs, i32 0
  %7 = load float** %6
  %8 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 2
  store float* %7, float** %8
  %fOutput0 = alloca float*
  br label %init_block

init_block: ; preds = %code_block
  %index = alloca i32
  store i32 0, i32* %index
  br label %exec_block

exec_block: ; preds = %exit_block6, %init_block
  %index1 = phi i32 [ 0, %init_block ], [ %next_index9, %exit_block6 ]
  %9 = load i32* %index
  %10 = icmp slt i32 %9, %fullcount
  %11 = select i1 %10, i32 1, i32 0
  %12 = trunc i32 %11 to i1
  br i1 %12, label %loop_body_block, label %exit_block

loop_body_block: ; preds = %exec_block
  br label %code_block2

exit_block: ; preds = %exec_block
  br label %return

code_block2: ; preds = %loop_body_block
  %13 = load i32* %index
  %14 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 0
  %15 = load float** %14
  %16 = getelementptr inbounds float* %15, i32 %13
  store float* %16, float** %fInput0
  %17 = load i32* %index
  %18 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 1
  %19 = load float** %18
  %20 = getelementptr inbounds float* %19, i32 %17
  store float* %20, float** %fInput1
  %21 = load i32* %index
  %22 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 2
  %23 = load float** %22
  %24 = getelementptr inbounds float* %23, i32 %21
  store float* %24, float** %fOutput0
  %count = alloca i32
  %25 = load i32* %index
  %26 = sub i32 %fullcount, %25
  %27 = icmp slt i32 32, %26
  %28 = select i1 %27, i32 32, i32 %26
  store i32 %28, i32* %count
  br label %init_block3

init_block3: ; preds = %code_block2
  %i = alloca i32
  store i32 0, i32* %i
  br label %exec_block4

exec_block4: ; preds = %code_block8, %init_block3
  %i7 = phi i32 [ 0, %init_block3 ], [ %next_index, %code_block8 ]
  %29 = load i32* %i
  %30 = load i32* %count
  %31 = icmp slt i32 %29, %30
  %32 = select i1 %31, i32 1, i32 0
  %33 = trunc i32 %32 to i1
  br i1 %33, label %loop_body_block5, label %exit_block6

loop_body_block5: ; preds = %exec_block4
  br label %code_block8

exit_block6: ; preds = %exec_block4
  %34 = load i32* %index
  %next_index9 = add i32 %34, 32
  store i32 %next_index9, i32* %index
  br label %exec_block

code_block8: ; preds = %loop_body_block5
  %35 = load i32* %i
  %36 = load float** %fOutput0
  %37 = getelementptr inbounds float* %36, i32 %35
  %38 = load i32* %i
  %39 = load float** %fInput0
  %40 = getelementptr inbounds float* %39, i32 %38
  %41 = load float* %40
  %42 = load i32* %i
  %43 = load float** %fInput1
  %44 = getelementptr inbounds float* %43, i32 %42
  %45 = load float* %44
  %46 = fadd float %41, %45
  store float %46, float* %37
  %47 = load i32* %i
  %next_index = add i32 %47, 1
  store i32 %next_index, i32* %i
  br label %exec_block4

return: ; preds = %exit_block
  ret void
}

6) Then using "opt -o3 -vectorize-loops" *does not* vectorize it:

; Function Attrs: nounwind
define void @compute_mydsp(%struct.dsp_mydsp* nocapture %dsp, i32 %fullcount, float** noalias nocapture %inputs, float** noalias nocapture %outputs) #0 {
block_code:
  %0 = load float** %inputs
  %1 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 0
  store float* %0, float** %1
  %2 = getelementptr inbounds float** %inputs, i32 1
  %3 = load float** %2
  %4 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 1
  store float* %3, float** %4
  %5 = load float** %outputs
  %6 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i32 0, i32 2
  store float* %5, float** %6
  %7 = icmp sgt i32 %fullcount, 0
  br i1 %7, label %code_block2.lr.ph, label %return

code_block2.lr.ph: ; preds = %block_code
  %8 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 0
  %9 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 1
  %10 = getelementptr inbounds %struct.dsp_mydsp* %dsp, i64 0, i32 2
  br label %code_block2

code_block2: ; preds = %exit_block6, %code_block2.lr.ph
  %next_index95 = phi i32 [ 0, %code_block2.lr.ph ], [ %next_index9, %exit_block6 ]
  %11 = load float** %8
  %12 = load float** %9
  %13 = load float** %10
  %14 = sub i32 %fullcount, %next_index95
  %15 = icmp sgt i32 %14, 32
  %16 = select i1 %15, i32 32, i32 %14
  %17 = icmp sgt i32 %16, 0
  br i1 %17, label %code_block8, label %exit_block6

exit_block6: ; preds = %code_block8, %code_block2
  %next_index9 = add i32 %next_index95, 32
  %18 = icmp slt i32 %next_index9, %fullcount
  br i1 %18, label %code_block2, label %return

code_block8: ; preds = %code_block2, %code_block8
  %next_index3 = phi i32 [ %next_index, %code_block8 ], [ 0, %code_block2 ]
  %.sum = add i32 %next_index95, %next_index3
  %19 = getelementptr inbounds float* %13, i32 %.sum
  %.sum8 = add i32 %next_index95, %next_index3
  %20 = getelementptr inbounds float* %11, i32 %.sum8
  %21 = load float* %20
  %22 = getelementptr inbounds float* %12, i32 %.sum
  %23 = load float* %22
  %24 = fadd float %21, %23
  store float %24, float* %19
  %next_index = add i32 %next_index3, 1
  %25 = icmp slt i32 %next_index, %16
  br i1 %25, label %code_block8, label %exit_block6

return: ; preds = %exit_block6, %block_code
  ret void
}

Any idea what is wrong then?

Thanks

Stéphane Letz

I did some progress:

1) adding a DataLayout in my generated LLVM Module, explicitly as a string. BTW is there any notion of "default" DataLayout that could be used? How is a LLVM Module supposed to know which DataLayout to use (in general) ?

2) next the resulting module could not be vectorized with "opt -O3 -vectorize-loops -debug -S m1.ll -o m2.ll", but if I do in "two steps" like:

opt -O3 -vectorize-loops -debug S m1.ll -o m2.ll

opt -O3 -vectorize-loops -debug S m2.ll -o m3.ll

then it works….

Any idea?

Thanks.

Stéphane Letz

Hi Stephane,

Move the alloca for “i" into the entry block.

The IR coming into the loop vectorizer looks something like the following. The loop vectorizer can't recognize one of the phis as an induction or reduction, so it gives up.

The reason why you have this “odd” phi is because SROA (which transforms allocas into SSA variables) does not get rid of the “i” variable (later passes do but leave this odd IR around) because “i”’s alloca is not in the entry block - it only works on allocas in the entry block.

opt -O3 -vectorize-loops -debug-only=loop-vectorize < test.ll

LV: Found a loop: code_block8
LV: Found an induction variable.
LV: PHI is not a poly recurrence.
LV: Found an unidentified PHI. %storemerge8 = phi i32 [ 0, %code_block8.lr.ph ], [ %next_index, %code_block8 ]
LV: Can't vectorize the instructions or CFG
LV: Not vectorizing.

IR coming into the vectorizer:

code_block8: ; preds = %code_block8.lr.ph, %code_block8
  %next_index10 = phi i32 [ %i.promoted, %code_block8.lr.ph ], [ %next_index, %code_block8 ]
  %storemerge8 = phi i32 [ 0, %code_block8.lr.ph ], [ %next_index, %code_block8 ] ; <<< THIS phi is the problem.
  %20 = sext i32 %storemerge8 to i64
  %.sum = add i64 %20, %9
  %21 = getelementptr inbounds float* %11, i64 %.sum
  %22 = getelementptr inbounds float* %8, i64 %.sum
  %23 = load float* %22, align 4
  %24 = getelementptr inbounds float* %10, i64 %.sum
  %25 = load float* %24, align 4
  %26 = fadd float %23, %25
  store float %26, float* %21, align 4
  %next_index = add i32 %next_index10, 1
  %27 = icmp slt i32 %next_index, %16
  br i1 %27, label %code_block8, label %exec_block4.exit_block6_crit_edge

exec_block.return_crit_edge: ; preds = %exit_block6
  br label %return

return: ; preds = %exec_block.return_crit_edge, %block_code
  ret void
}

1) "entry" block is the first block of the function right?

2) do you mean *all* "alloca" in a function always have to be in the fist entry block?

Thanks.

Stéphane

1) "entry" block is the first block of the function right?

Yes.

2) do you mean *all* "alloca" in a function always have to be in the fist entry block?

If you want them converted into ssa variables early on, yes.

1) "entry" block is the first block of the function right?

Yes.

OK

2) do you mean *all* "alloca" in a function always have to be in the fist entry block?

If you want them converted into ssa variables early on, yes.

It this documented somewhere?

Thanks.

Stéphane

Sort of, here:

http://llvm.org/docs/tutorial/LangImpl7.html#memory-in-llvm

"The mem2reg pass implements the standard “iterated dominance frontier” algorithm for constructing SSA form and has a number of optimizations that speed up (very common) degenerate cases. The mem2reg optimization pass is the answer to dealing with mutable variables, and we highly recommend that you depend on it. Note that mem2reg only works on variables in certain circumstances:

  • mem2reg is alloca-driven: it looks for allocas and if it can handle them, it promotes them. It does not apply to global variables or heap allocations.
  • mem2reg only looks for alloca instructions in the entry block of the function. Being in the entry block guarantees that the alloca is only executed once, which makes analysis simpler.”