simplifycfg not happening?

The following function compiles with -O3 into the following IR. http://llvm.org/docs/Passes.html#simplifycfg-simplify-the-cfg says

  • Eliminates a basic block that only contains an unconditional branch.

But the first and third blocks in the compiled function contain only an unconditional branch, so I would have expected them to be eliminated. What am I missing?

// Doubles a[0..999] in place in two separate, identical passes, then
// returns the sum of the first two elements.
double f(double *a) {
// First pass: multiply each of the 1000 elements by 2.
for (int i = 0; i < 1000; i++)
a[i] *= 2;
// Second pass: the same loop again over the same range.
for (int i = 0; i < 1000; i++)
a[i] *= 2;
return a[0] + a[1];
}

; Function Attrs: nounwind uwtable
; -O3 output for f(double *a). Each source loop has become a vector loop
; that processes <2 x double> values, four vector load/fmul/store groups
; (8 doubles) per iteration, and exits when the index reaches 1000.
define double @"\01?f@@YANPEAN@Z"(double* nocapture %a) #1 {
; Entry block: contains only an unconditional branch into the first loop.
overflow.checked:
  br label %vector.body, !dbg !18

; First loop. %index advances by 8 per iteration (see %index.next.1).
vector.body:                                      ; preds = %vector.body, %overflow.checked
  %index = phi i64 [ 0, %overflow.checked ], [ %index.next.1, %vector.body ], !dbg !18
  ; Elements [index, index+4): two <2 x double> loads, doubled, stored back.
  %0 = getelementptr inbounds double, double* %a, i64 %index, !dbg !19
  %1 = bitcast double* %0 to <2 x double>*, !dbg !20
  %wide.load = load <2 x double>, <2 x double>* %1, align 8, !dbg !20
  %2 = getelementptr double, double* %0, i64 2, !dbg !20
  %3 = bitcast double* %2 to <2 x double>*, !dbg !20
  %wide.load8 = load <2 x double>, <2 x double>* %3, align 8, !dbg !20
  %4 = fmul <2 x double> %wide.load, <double 2.000000e+00, double 2.000000e+00>, !dbg !20
  %5 = fmul <2 x double> %wide.load8, <double 2.000000e+00, double 2.000000e+00>, !dbg !20
  %6 = bitcast double* %0 to <2 x double>*, !dbg !20
  store <2 x double> %4, <2 x double>* %6, align 8, !dbg !20
  %7 = bitcast double* %2 to <2 x double>*, !dbg !20
  store <2 x double> %5, <2 x double>* %7, align 8, !dbg !20
  ; Second copy of the body (".1" names), addressed from %index.next = index | 4.
  %index.next = or i64 %index, 4, !dbg !18
  %8 = getelementptr inbounds double, double* %a, i64 %index.next, !dbg !19
  %9 = bitcast double* %8 to <2 x double>*, !dbg !20
  %wide.load.1 = load <2 x double>, <2 x double>* %9, align 8, !dbg !20
  %10 = getelementptr double, double* %8, i64 2, !dbg !20
  %11 = bitcast double* %10 to <2 x double>*, !dbg !20
  %wide.load8.1 = load <2 x double>, <2 x double>* %11, align 8, !dbg !20
  %12 = fmul <2 x double> %wide.load.1, <double 2.000000e+00, double 2.000000e+00>, !dbg !20
  %13 = fmul <2 x double> %wide.load8.1, <double 2.000000e+00, double 2.000000e+00>, !dbg !20
  %14 = bitcast double* %8 to <2 x double>*, !dbg !20
  store <2 x double> %12, <2 x double>* %14, align 8, !dbg !20
  %15 = bitcast double* %10 to <2 x double>*, !dbg !20
  store <2 x double> %13, <2 x double>* %15, align 8, !dbg !20
  %index.next.1 = add nsw i64 %index, 8, !dbg !18
  %16 = icmp eq i64 %index.next.1, 1000, !dbg !18
  br i1 %16, label %vector.body10.preheader, label %vector.body, !dbg !18, !llvm.loop !21

; Contains only an unconditional branch into the second loop.
vector.body10.preheader:                          ; preds = %vector.body
  br label %vector.body10, !dbg !24

; Second loop: same structure as the first, again stepping %index13 by 8.
vector.body10:                                    ; preds = %vector.body10, %vector.body10.preheader
  %index13 = phi i64 [ 0, %vector.body10.preheader ], [ %index.next21.1, %vector.body10 ], !dbg !25
  %17 = getelementptr inbounds double, double* %a, i64 %index13, !dbg !24
  %18 = bitcast double* %17 to <2 x double>*, !dbg !26
  %wide.load26 = load <2 x double>, <2 x double>* %18, align 8, !dbg !26
  %19 = getelementptr double, double* %17, i64 2, !dbg !26
  %20 = bitcast double* %19 to <2 x double>*, !dbg !26
  %wide.load27 = load <2 x double>, <2 x double>* %20, align 8, !dbg !26
  %21 = fmul <2 x double> %wide.load26, <double 2.000000e+00, double 2.000000e+00>, !dbg !26
  %22 = fmul <2 x double> %wide.load27, <double 2.000000e+00, double 2.000000e+00>, !dbg !26
  %23 = bitcast double* %17 to <2 x double>*, !dbg !26
  store <2 x double> %21, <2 x double>* %23, align 8, !dbg !26
  %24 = bitcast double* %19 to <2 x double>*, !dbg !26
  store <2 x double> %22, <2 x double>* %24, align 8, !dbg !26
  ; Second copy of the body, addressed from %index.next21 = index13 | 4.
  %index.next21 = or i64 %index13, 4, !dbg !25
  %25 = getelementptr inbounds double, double* %a, i64 %index.next21, !dbg !24
  %26 = bitcast double* %25 to <2 x double>*, !dbg !26
  %wide.load26.1 = load <2 x double>, <2 x double>* %26, align 8, !dbg !26
  %27 = getelementptr double, double* %25, i64 2, !dbg !26
  %28 = bitcast double* %27 to <2 x double>*, !dbg !26
  %wide.load27.1 = load <2 x double>, <2 x double>* %28, align 8, !dbg !26
  %29 = fmul <2 x double> %wide.load26.1, <double 2.000000e+00, double 2.000000e+00>, !dbg !26
  %30 = fmul <2 x double> %wide.load27.1, <double 2.000000e+00, double 2.000000e+00>, !dbg !26
  %31 = bitcast double* %25 to <2 x double>*, !dbg !26
  store <2 x double> %29, <2 x double>* %31, align 8, !dbg !26
  %32 = bitcast double* %27 to <2 x double>*, !dbg !26
  store <2 x double> %30, <2 x double>* %32, align 8, !dbg !26
  %index.next21.1 = add nsw i64 %index13, 8, !dbg !25
  %33 = icmp eq i64 %index.next21.1, 1000, !dbg !25
  br i1 %33, label %middle.block11, label %vector.body10, !dbg !25, !llvm.loop !27

; Exit block: return a[0] + a[1].
middle.block11:                                   ; preds = %vector.body10
  %34 = load double, double* %a, align 8, !dbg !28
  %35 = getelementptr inbounds double, double* %a, i64 1, !dbg !29
  %36 = load double, double* %35, align 8, !dbg !29
  %37 = fadd double %34, %36, !dbg !30
  ret double %37, !dbg !31
}

Hi,

if you use opt -simplifycfg, the third BB can be eliminated.

You’re right, it can indeed.

Is there a reason -O3 doesn’t do this? I had been expecting -O3 to perform full optimization.

The first block still remains in any case. Is the first block needed for some purpose I’m not taking into account?

Oh wait, is this because there cannot be any branches to the first block in a function?

I’m not sure about the first BB. Maybe the label name explains it? My guess is that some fast overflow check happens before this function can be executed. It's just a guess; I hope other people can help you.