LoopStrengthReduction generates false code

Hi.

In my backend I get incorrect code after using LoopStrengthReduction. In the generated code the loop index variable is multiplied by 8 (correct, everything is 64 bit aligned) to get an address offset, and the index variable is incremented by 1*8, which is not correct. It should be incremented by 1 only. The factor 8 appears again.

I compared the debug output (-debug-only=scalar-evolution,loop-reduce) for my backend and the ARM backend, but simply can't read/understand it. They differ in factors 4 vs 8 (ok), but there are more differences, probably caused by the implementation of TargetTransformInfo for ARM, while I haven't implemented it for my arch, yet.

How can I debug this further? In my arch everything is 64 bit aligned (factor 8 in many passes) and the memory is not byte addressed.

Thanks,
Boris

----8<----

LLVM assembly:

@buffer = common dso_local global [10 x i32] zeroinitializer, align 4

; Function Attrs: nounwind
define dso_local void @some_main(i32* %result) local_unnamed_addr #0 {
entry:
  tail call void @fill_array(i32* getelementptr inbounds ([10 x i32], [10 x i32]* @buffer, i32 0, i32 0)) #2
  br label %while.body

while.body: ; preds = %entry, %while.body
  %i.010 = phi i32 [ 0, %entry ], [ %inc, %while.body ]
  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* @buffer, i32 0, i32 %i.010
  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
  %cmp1 = icmp ne i32 %0, -559038737
  %inc = add nuw nsw i32 %i.010, 1
  %cmp11 = icmp eq i32 %i.010, 0
  %cmp = or i1 %cmp11, %cmp1
  br i1 %cmp, label %while.body, label %while.end

while.end: ; preds = %while.body
  %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* @buffer, i32 0, i32 %i.010
  %1 = load i32, i32* %arrayidx2, align 4, !tbaa !2
  store volatile i32 %1, i32* %result, align 4, !tbaa !2
  ret void
}

declare dso_local void @fill_array(i32*) local_unnamed_addr #1

attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 7.0.1 (tags/RELEASE_701/final)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}

(-debug-only=scalar-evolution,loop-reduce) for my arch:

LSR on loop %while.body:
Collecting IV Chains.
IV Chain#0 Head: ( %0 = load i32, i32* %arrayidx, align 4, !tbaa !2) IV={@buffer,+,8}<nsw><%while.body>
IV Chain#1 Head: ( %cmp11 = icmp eq i32 %i.010, 0) IV={0,+,1}<nuw><nsw><%while.body>
IV Chain#1 Inc: ( %i.010 = phi i32 [ 0, %entry ], [ %inc, %while.body ]) IV+1
Chain: %cmp11 = icmp eq i32 %i.010, 0 Cost: 0
LSR has identified the following interesting factors and types: *8
LSR is examining the following fixup sites:
  UserInst=%cmp11, OperandValToReplace=%i.010
  UserInst=%0, OperandValToReplace=%arrayidx
LSR found 2 uses:
LSR is examining the following uses:
  LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
    reg({0,+,-1}<nw><%while.body>)
  LSR Use: Kind=Address of i32 in addrspace(0), Offsets={0}, widest fixup type: i32*
    reg({@buffer,+,8}<nsw><%while.body>)

After generating reuse formulae:
LSR is examining the following uses:
  LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
    reg({0,+,-1}<nw><%while.body>)
    reg({0,+,8}<nuw><nsw><%while.body>)
    reg({0,+,1}<nuw><nsw><%while.body>)
  LSR Use: Kind=Address of i32 in addrspace(0), Offsets={0}, widest fixup type: i32*
    reg({@buffer,+,8}<nsw><%while.body>)
    reg(@buffer) + 1*reg({0,+,8}<nuw><nsw><%while.body>)
Filtering for use LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
  Filtering out formula reg({0,+,1}<nuw><nsw><%while.body>)
    in favor of formula reg({0,+,-1}<nw><%while.body>)
Filtering for use LSR Use: Kind=Address of i32 in addrspace(0), Offsets={0}, widest fixup type: i32*

After filtering out undesirable candidates:
LSR is examining the following uses:
  LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
    reg({0,+,-1}<nw><%while.body>)
    reg({0,+,8}<nuw><nsw><%while.body>)
  LSR Use: Kind=Address of i32 in addrspace(0), Offsets={0}, widest fixup type: i32*
    reg({@buffer,+,8}<nsw><%while.body>)
    reg(@buffer) + 1*reg({0,+,8}<nuw><nsw><%while.body>)
New best at 2 instructions 2 regs, with addrec cost 2.
Regs: {0,+,-1}<nw><%while.body> {@buffer,+,8}<nsw><%while.body>
New best at 2 instructions 2 regs, with addrec cost 1, plus 1 base add.
Regs: {0,+,8}<nuw><nsw><%while.body> @buffer

The chosen solution requires 2 instructions 2 regs, with addrec cost 1, plus 1 base add:
  LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
    reg({0,+,8}<nuw><nsw><%while.body>)
  LSR Use: Kind=Address of i32 in addrspace(0), Offsets={0}, widest fixup type: i32*
    reg(@buffer) + 1*reg({0,+,8}<nuw><nsw><%while.body>)

(-debug-only=scalar-evolution,loop-reduce) for ARM:

LSR on loop %while.body:
Collecting IV Chains.
IV Chain#0 Head: ( %0 = load i32, i32* %arrayidx, align 4, !tbaa !2) IV={@buffer,+,4}<nsw><%while.body>
IV Chain#1 Head: ( %cmp11 = icmp eq i32 %i.010, 0) IV={0,+,1}<nuw><nsw><%while.body>
IV Chain#1 Inc: ( %i.010 = phi i32 [ 0, %entry ], [ %inc, %while.body ]) IV+1
Chain: %cmp11 = icmp eq i32 %i.010, 0 Cost: 0
LSR has identified the following interesting factors and types: *4
LSR is examining the following fixup sites:
  UserInst=%cmp11, OperandValToReplace=%i.010
  UserInst=%0, OperandValToReplace=%arrayidx
LSR found 2 uses:
LSR is examining the following uses:
  LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
    reg({0,+,-1}<nw><%while.body>)
  LSR Use: Kind=Address of i32 in addrspace(0), Offsets={0}, widest fixup type: i32*
    reg({@buffer,+,4}<nsw><%while.body>)

After generating reuse formulae:
LSR is examining the following uses:
  LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
    reg({0,+,-1}<nw><%while.body>)
    reg({0,+,4}<nuw><nsw><%while.body>)
    reg({0,+,1}<nuw><nsw><%while.body>)
  LSR Use: Kind=Address of i32 in addrspace(0), Offsets={0}, widest fixup type: i32*
    reg({@buffer,+,4}<nsw><%while.body>)
    reg(@buffer) + 1*reg({0,+,4}<nuw><nsw><%while.body>)
    -1*reg({(-1 * @buffer),+,-4}<nw><%while.body>)
    reg(@buffer) + 4*reg({0,+,1}<nuw><nsw><%while.body>)
    reg(@buffer) + -4*reg({0,+,-1}<nw><%while.body>)
    reg(@buffer) + -1*reg({0,+,-4}<nw><%while.body>)
Filtering for use LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
Filtering for use LSR Use: Kind=Address of i32 in addrspace(0), Offsets={0}, widest fixup type: i32*
  Filtering out formula -1*reg({(-1 * @buffer),+,-4}<nw><%while.body>)
    in favor of formula reg({@buffer,+,4}<nsw><%while.body>)
  Filtering out formula reg(@buffer) + -1*reg({0,+,-4}<nw><%while.body>)
    in favor of formula reg({@buffer,+,4}<nsw><%while.body>)

After filtering out undesirable candidates:
LSR is examining the following uses:
  LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
    reg({0,+,-1}<nw><%while.body>)
    reg({0,+,4}<nuw><nsw><%while.body>)
    reg({0,+,1}<nuw><nsw><%while.body>)
  LSR Use: Kind=Address of i32 in addrspace(0), Offsets={0}, widest fixup type: i32*
    reg({@buffer,+,4}<nsw><%while.body>)
    reg(@buffer) + 1*reg({0,+,4}<nuw><nsw><%while.body>)
    reg(@buffer) + -4*reg({0,+,-1}<nw><%while.body>)
    reg(@buffer) + 4*reg({0,+,1}<nuw><nsw><%while.body>)
New best at 1 instruction 2 regs, with addrec cost 1.
Regs: {0,+,-1}<nw><%while.body> @buffer

The chosen solution requires 1 instruction 2 regs, with addrec cost 1:
  LSR Use: Kind=ICmpZero, Offsets={0}, widest fixup type: i32
    reg({0,+,-1}<nw><%while.body>)
  LSR Use: Kind=Address of i32 in addrspace(0), Offsets={0}, widest fixup type: i32*
    reg(@buffer) + -4*reg({0,+,-1}<nw><%while.body>)

Blindly guessing here, "memory is not byte addressed", but you never fixed ScalarEvolution to handle that, so it's modeling the GEP in a way you're not expecting.

-Eli

Hm, no. I expect byte addresses - everywhere. The compiler should not know that the arch needs word addresses. During lowering LOAD and STORE get explicit conversion operations for the memory address. Even if my arch was byte addressed the code would be false/illegal.

Boris

Hmm. Then I'm not sure; at first glance, the debug output looks fine either way. Could you show the IR after LSR, and explain why it's wrong?

-Eli

The IR after LSR is:

*** IR Dump After Loop Strength Reduction ***
; Preheader:
entry:
  tail call void @fill_array(i32* getelementptr inbounds ([10 x i32], [10 x i32]* @buffer, i32 0, i32 0)) #2
  br label %while.body

; Loop:
while.body: ; preds = %while.body, %entry
  %lsr.iv = phi i32 [ %lsr.iv.next, %while.body ], [ 0, %entry ]
  %uglygep = getelementptr i8, i8* bitcast ([10 x i32]* @buffer to i8*), i32 %lsr.iv
  %uglygep1 = bitcast i8* %uglygep to i32*
  %0 = load i32, i32* %uglygep1, align 4, !tbaa !2
  %cmp1 = icmp ne i32 %0, -559038737
  %cmp11 = icmp eq i32 %lsr.iv, 0
  %cmp = or i1 %cmp11, %cmp1
  %lsr.iv.next = add nuw i32 %lsr.iv, 8
  br i1 %cmp, label %while.body, label %while.end

; Exit blocks
while.end: ; preds = %while.body
  store volatile i32 %0, i32* %result, align 4, !tbaa !2
  ret void

I guess "%uglygep = getelementptr.." will be lowered to @buffer + (%lsr.iv * StoreSize(i32)). That's what I see in the final code. But then %lsr.iv.next should be incremented by 1; BUT it is incremented by 8.

Incrementing %lsr.iv.next by 8 would make sense if getelementptr were lowered to @buffer + %lsr.iv.

Thanks for your help,
Boris

"getelementptr i8" is a GEP over byte-size elements, so it is in fact just "@buffer + %lsr.iv". Note we bitcast the operand to i8*, then bitcast the result from i8* to i32*.

-Eli

Sorry, no.

First, I made some small changes to my backend and the code is slightly different. But I think it is irrelevant: the while loop is unchanged, but the end block is different. (generated IR code below)

I added some debug printing in SelectionDAGBuilder::visitGetElementPtr() and the 3 GEPs are lowered to two base + %lsr.iv * 8 and one base + offset.

Also, if I align i32 to 32 bits (which is illegal on this arch!), then the expected code is generated.

I'll have a closer look at SelectionDAGBuilder::visitGetElementPtr().

Boris

new code:

@buffer = common dso_local global [10 x i32] zeroinitializer, align 4

; Function Attrs: nounwind
define dso_local void @some_main(i32* %result) local_unnamed_addr #0 {
entry:
  tail call void @fill_array(i32* getelementptr inbounds ([10 x i32], [10 x i32]* @buffer, i32 0, i32 0)) #2
  br label %while.body

while.body: ; preds = %while.body, %entry
  %i.010 = phi i32 [ 0, %entry ], [ %inc, %while.body ]
  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* @buffer, i32 0, i32 %i.010
  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
  %cmp1 = icmp ne i32 %0, -559038737
  %inc = add nuw nsw i32 %i.010, 1
  %cmp11 = icmp eq i32 %i.010, 0
  %cmp = or i1 %cmp11, %cmp1
  br i1 %cmp, label %while.body, label %while.end

while.end: ; preds = %while.body
  %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* @buffer, i32 0, i32 %i.010
  %1 = load i32, i32* %arrayidx2, align 4, !tbaa !2
  store volatile i32 %1, i32* %result, align 4, !tbaa !2
  ret void
}

declare dso_local void @fill_array(i32*) local_unnamed_addr #1

*** IR Dump After Loop Strength Reduction ***
; Preheader:
entry:
  tail call void @fill_array(i32* getelementptr inbounds ([10 x i32], [10 x i32]* @buffer, i32 0, i32 0)) #2
  br label %while.body

; Loop:
while.body: ; preds = %while.body, %entry
  %lsr.iv = phi i32 [ %lsr.iv.next, %while.body ], [ 0, %entry ]
  %uglygep2 = getelementptr i8, i8* bitcast ([10 x i32]* @buffer to i8*), i32 %lsr.iv
  %uglygep23 = bitcast i8* %uglygep2 to i32*
  %0 = load i32, i32* %uglygep23, align 4, !tbaa !2
  %cmp1 = icmp ne i32 %0, -559038737
  %cmp11 = icmp eq i32 %lsr.iv, 0
  %cmp = or i1 %cmp11, %cmp1
  %lsr.iv.next = add nuw i32 %lsr.iv, 8
  br i1 %cmp, label %while.body, label %while.end

; Exit blocks
while.end: ; preds = %while.body
  %uglygep = getelementptr i8, i8* bitcast ([10 x i32]* @buffer to i8*), i32 %lsr.iv.next
  %uglygep1 = bitcast i8* %uglygep to i32*
  %scevgep = getelementptr i32, i32* %uglygep1, i32 -1
  %1 = load i32, i32* %scevgep, align 4, !tbaa !2
  store volatile i32 %1, i32* %result, align 4, !tbaa !2
  ret void

Ok, I got it. By bitcasting to i8* (and back from i8*), and by assuming i8 is aligned to 8 bits, getelementptr assumes it has a byte-addressed memory.

Thanks,
Boris