complex branching generation

LLVM seems to be generating overly complex branching as a result of the
short-circuit optimization. The code in question is as follows:

define void @ test_fc_while_and(float %x, float %y, float addrspace(11)*
%result) nounwind {

entry:

        %tobool3 = fcmp une float %x, 0.000000e+000 ; <i1>
[#uses=1]

        %tobool24 = fcmp une float %y, 0.000000e+000 ; <i1>
[#uses=2]

        %or.cond5 = and i1 %tobool3, %tobool24 ; <i1> [#uses=1]

        br i1 %or.cond5, label %bb.nph, label %whileexit

bb.nph: ; preds = %entry

        br i1 %tobool24, label %whilebody.us, label %whilebody

whilebody.us: ; preds = %whilebody.us, %bb.nph

...code here...

        br i1 %phitmp, label %whilebody.us, label %whileexit

whilebody: ; preds = %bb.nph

...code here...

        br label %whileexit

whileexit: ; preds = %whilebody, %whilebody.us, %entry

        %z.0.lcssa = phi float [ 0.000000e+000, %entry ], [ %add,
%whilebody ], [ %add.us, %whilebody.us ] ; <float>
[#uses=1]

        store float %z.0.lcssa, float addrspace(11)* %result

        ret void

}

based on original code of:

/*
 * Sums x * y for as long as both x and y are nonzero, incrementing x after
 * each iteration, and stores the total through result.
 *
 * x      - starting value of the loop variable; incremented each iteration.
 * y      - multiplier; also part of the loop condition.
 * result - output location; receives the accumulated sum (0 if the loop
 *          never runs).
 */
void test_fc_while_and(float x, float y, float* result)
{
        float z = (float)0;

        /* y is never modified in the body, so once the loop has been
         * entered only x becoming zero can terminate it. */
        while (x && y) {
                z += (x * y);
                ++x;
        }

        *result = z;
}

Now the problem is with the while statement, "while (x && y)" (shown in
bold in the original message). The two comparisons and the and instruction
are what that while statement is mapped to. What I am trying to figure out
is why the bb.nph branch is even required and how I can disable it from
being generated. The first branch instruction correctly handles the
condition that I wanted, so there should be no reason that bb.nph is
generated. The same goes for whilebody, as it shouldn't be there.

Any ideas?

Thanks,

Micah Villmow

Systems Engineer

Advanced Technology & Performance

Advanced Micro Devices Inc.

4555 Great America Pkwy,

Santa Clara, CA. 95054

P: 408-572-6219

F: 408-572-6596

What optimization level are you running at? Here's what I get at -Os:

llvm-gcc -S -emit-llvm -o - a.c -Os
; ModuleID = 'a.c'
target datalayout =
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.5"

define void @test_fc_while_and(float %x, float %y, float* %result)
nounwind optsize {
entry:
  %.not7 = fcmp une float %x, 0.000000e+00 ; <i1> [#uses=1]
  %0 = fcmp une float %y, 0.000000e+00 ; <i1> [#uses=2]
  %or.cond8 = and i1 %.not7, %0 ; <i1> [#uses=1]
  br i1 %or.cond8, label %bb, label %bb4

bb: ; preds = %bb, %entry
  %x_addr.06 = phi float [ %x, %entry ], [ %3, %bb ] ; <float> [#uses=2]
  %z.05 = phi float [ 0.000000e+00, %entry ], [ %2, %bb ] ; <float> [#uses=1]
  %1 = mul float %x_addr.06, %y ; <float> [#uses=1]
  %2 = add float %z.05, %1 ; <float> [#uses=2]
  %3 = add float %x_addr.06, 1.000000e+00 ; <float> [#uses=2]
  %phitmp = fcmp une float %3, 0.000000e+00 ; <i1> [#uses=1]
  %or.cond = and i1 %phitmp, %0 ; <i1> [#uses=1]
  br i1 %or.cond, label %bb, label %bb4

bb4: ; preds = %bb, %entry
  %z.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %2, %bb ] ;
<float> [#uses=1]
  store float %z.0.lcssa, float* %result, align 4
  ret void
}

-Os runs these passes in addition to what -O2 runs: -domfrontier -lcssa -loop-unroll

-bw

Are you sure? That looks wrong; I don't think -Os should be doing loop unrolling. (Perhaps it should be on at -O2, as well. gcc's loop unrolling is not turned on by -Oanything, you have to use the switch explicitly, and that's how it's documented. But I think that's because its loop unrolling heuristics aren't all that good, and I don't think we should feel constrained to follow this if ours shows a win at -O2.)

Strike that. Reverse it. It's -O2 that does those, not -Os. Sorry for
the confusion. :-)

-bw