When evaluating loop peeling opportunities in llvm::computePeelCount
, the countToEliminateCompares
function computes “the number of iterations to peel off that make conditions in the body true/false”. However, the function does not in fact exactly do this, possibly leading to what looks like suboptimal peeling decisions that increase code size without reducing the number of conditions evaluated during execution.
Consider the following toy example.
void example(int mem[16][16], bool cond) {
for (int i = 0 ; i < 16 ; ++i) {
for (int j = i ; j < 16 ; ++j) {
if (cond && j > i) {
mem[i][j] = 0;
}
}
}
}
It is immediately obvious that, at every iteration of the outer loop, j > i
is only false on the first iteration of the inner loop, which countToEliminateCompares
identifies correctly. However, this does not mean that the if’s then branch can be inlined inside the nested loop if the first inner loop iteration were to be peeled; the value of cond
remains unknown, making the if’s condition unknown as well. Despite this fact, countToEliminateCompares
still returns 1, causing the first inner loop iteration to be peeled as part of the general unrolling pass.
This is an IR representing the same program.
; example.ll
define void @example(ptr noundef %mem, i1 noundef zeroext %cond) {
entry:
br label %outer.header
outer.header: ; preds = %entry, %outer.latch_exiting
%outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
br label %inner.header
inner.header: ; preds = %outer.header, %inner.latch_exiting
%inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.latch_exiting ]
%not_first_it = icmp ugt i32 %inner.iv, %outer.iv
%inner.ifcond = and i1 %cond, %not_first_it
br i1 %inner.ifcond, label %inner.if, label %inner.latch_exiting
inner.if: ; preds = %inner.header
%outer.iv.ext = zext nneg i32 %outer.iv to i64
%idx_part = mul nuw nsw i64 %outer.iv.ext, 16
%inner.iv.ext = zext nneg i32 %inner.iv to i64
%idx = add nuw nsw i64 %idx_part, %inner.iv.ext
%addr = getelementptr inbounds [16 x [16 x i32]], ptr %mem, i64 %idx
store i32 0, ptr %addr, align 4
br label %inner.latch_exiting
inner.latch_exiting: ; preds = %inner.header, %inner.if
%inner.iv_next = add nuw nsw i32 %inner.iv, 1
%inner.cond = icmp ult i32 %inner.iv_next, 16
br i1 %inner.cond, label %inner.header, label %outer.latch_exiting
outer.latch_exiting: ; preds = %inner.latch_exiting
%outer.iv_next = add nuw nsw i32 %outer.iv, 1
%outer.cond = icmp ult i32 %outer.iv_next, 16
br i1 %outer.cond, label %outer.header, label %end
end: ; preds = %outer.header
ret void
}
This is the result of running the loop-unroll pass on that IR.
; opt example.ll -S -passes=loop-unroll
define void @example(ptr noundef %mem, i1 noundef zeroext %cond) {
entry:
br label %outer.header
outer.header: ; preds = %outer.latch_exiting, %entry
%outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
br label %inner.header.peel.begin
inner.header.peel.begin: ; preds = %outer.header
br label %inner.header.peel
inner.header.peel: ; preds = %inner.header.peel.begin
%not_first_it.peel = icmp ugt i32 %outer.iv, %outer.iv
%inner.ifcond.peel = and i1 %cond, %not_first_it.peel
br i1 %inner.ifcond.peel, label %inner.if.peel, label %inner.latch_exiting.peel
inner.if.peel: ; preds = %inner.header.peel
%outer.iv.ext.peel = zext nneg i32 %outer.iv to i64
%idx_part.peel = mul nuw nsw i64 %outer.iv.ext.peel, 16
%inner.iv.ext.peel = zext nneg i32 %outer.iv to i64
%idx.peel = add nuw nsw i64 %idx_part.peel, %inner.iv.ext.peel
%addr.peel = getelementptr inbounds [16 x [16 x i32]], ptr %mem, i64 %idx.peel
store i32 0, ptr %addr.peel, align 4
br label %inner.latch_exiting.peel
inner.latch_exiting.peel: ; preds = %inner.if.peel, %inner.header.peel
%inner.iv_next.peel = add nuw nsw i32 %outer.iv, 1
%inner.cond.peel = icmp ult i32 %inner.iv_next.peel, 16
br i1 %inner.cond.peel, label %inner.header.peel.next, label %outer.latch_exiting
inner.header.peel.next: ; preds = %inner.latch_exiting.peel
br label %inner.header.peel.next1
inner.header.peel.next1: ; preds = %inner.header.peel.next
br label %outer.header.peel.newph
outer.header.peel.newph: ; preds = %inner.header.peel.next1
br label %inner.header
inner.header: ; preds = %inner.latch_exiting, %outer.header.peel.newph
%inner.iv = phi i32 [ %inner.iv_next.peel, %outer.header.peel.newph ], [ %inner.iv_next, %inner.latch_exiting ]
%not_first_it = icmp ugt i32 %inner.iv, %outer.iv
%inner.ifcond = and i1 %cond, %not_first_it
br i1 %inner.ifcond, label %inner.if, label %inner.latch_exiting
inner.if: ; preds = %inner.header
%outer.iv.ext = zext nneg i32 %outer.iv to i64
%idx_part = mul nuw nsw i64 %outer.iv.ext, 16
%inner.iv.ext = zext nneg i32 %inner.iv to i64
%idx = add nuw nsw i64 %idx_part, %inner.iv.ext
%addr = getelementptr inbounds [16 x [16 x i32]], ptr %mem, i64 %idx
store i32 0, ptr %addr, align 4
br label %inner.latch_exiting
inner.latch_exiting: ; preds = %inner.if, %inner.header
%inner.iv_next = add nuw nsw i32 %inner.iv, 1
%inner.cond = icmp ult i32 %inner.iv_next, 16
br i1 %inner.cond, label %inner.header, label %outer.latch_exiting.loopexit, !llvm.loop !0
outer.latch_exiting.loopexit: ; preds = %inner.latch_exiting
br label %outer.latch_exiting
outer.latch_exiting: ; preds = %outer.latch_exiting.loopexit, %inner.latch_exiting.peel
%outer.iv_next = add nuw nsw i32 %outer.iv, 1
%outer.cond = icmp ult i32 %outer.iv_next, 16
br i1 %outer.cond, label %outer.header, label %end
end: ; preds = %outer.latch_exiting
ret void
}
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.peeled.count", i32 1}
Despite the first iteration being peeled, the if (rightfully) remains inside the inner loop. This can become problematic when the loop-unroll pass then evaluates the outer loop; it may have been small enough to be fully unrolled before, but it is now twice as long and its unrolling cost may now be deemed too high.
My question/concern is the following.
Is countToEliminateCompares
’s comment simply slightly misleading (it did “eliminate” the j > i
comparison in that it can now be optimized away by a pass down the pipeline, but not the condition itself) and there is an underlying rationale for this behavior (perhaps doing this generally plays well with further optimizations)? Or is this a bug in the function which should in fact return 0 because there is no number of peeled iterations that can completely eliminate the condition in this case?