Hello,
With the following test case:
__attribute__((noinline))
int test(int mask, int *count) {
  for (int i = 0, e = __builtin_popcount(mask); i < e; ++i) {
    ++(*count);
  }
  return 1;
}

int main() {
  int mask = 0, count = 0;
  test(mask, &count);
}
the instcombine pass performs the following transform on the entry block.
Before:
  %0 = tail call i32 @llvm.ctpop.i32(i32 %mask), !dbg !10, !range !11
  %cmp9 = icmp ult i32 0, %0, !dbg !10
  br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup, !dbg !12
After:
  %0 = tail call i32 @llvm.ctpop.i32(i32 %mask), !dbg !10, !range !11
  %cmp9.not = icmp eq i32 %mask, 0, !dbg !10
  br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.lr.ph, !dbg !12
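The fold itself is sound: ctpop(x) is nonzero exactly when x is nonzero, so "0 u< ctpop(%mask)" is equivalent to "%mask != 0" (inverted to the eq form with the branch targets swapped). A minimal standalone check of that invariant, using __builtin_popcount as a stand-in for llvm.ctpop.i32:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  // popcount(x) > 0  <=>  x != 0, the equivalence instcombine relies on.
  for (uint32_t x : {0u, 1u, 2u, 0xffu, 0x80000000u, 0xffffffffu})
    assert((__builtin_popcount(x) > 0) == (x != 0u));
}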
Following this transformation, the indvars pass inserts a redundant umax in for.body.lr.ph. The umax is redundant because the loop is only entered when %mask is nonzero, which already guarantees ctpop(%mask) >= 1:
define dso_local i32 @test(i32 %mask, i32* nocapture %count) local_unnamed_addr #0 !dbg !7 {
entry:
  %0 = tail call i32 @llvm.ctpop.i32(i32 %mask), !dbg !10, !range !11
  %cmp9.not = icmp eq i32 %mask, 0, !dbg !10
  br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.lr.ph, !dbg !12

for.body.lr.ph:                                   ; preds = %entry
  %count.promoted = load i32, i32* %count, align 4, !tbaa !13
  %1 = icmp ugt i32 %0, 1, !dbg !12
  %umax = select i1 %1, i32 %0, i32 1, !dbg !12
  br label %for.body, !dbg !12

for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
  %2 = add i32 %count.promoted, %umax, !dbg !12
  store i32 %2, i32* %count, align 4, !dbg !17, !tbaa !13
  br label %for.cond.cleanup, !dbg !12

for.cond.cleanup:                                 ; preds = %entry, %for.cond.for.cond.cleanup_crit_edge
  ret i32 1, !dbg !18

for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %_mskCopy.010 = phi i32 [ %mask, %for.body.lr.ph ], [ %and, %for.body ]
  %3 = tail call i32 @llvm.ctlz.i32(i32 %_mskCopy.010, i1 false), !dbg !10, !range !11
  %shl = shl nuw i32 1, %3, !dbg !10
  %neg = xor i32 %shl, -1, !dbg !10
  %and = and i32 %_mskCopy.010, %neg, !dbg !10
  br i1 false, label %for.body, label %for.cond.for.cond.cleanup_crit_edge, !dbg !12, !llvm.loop !19
}
The approach so far has been to modify ScalarEvolution::createSCEV in the following way:
  // Model ctpop(x) as umin(umax(ctpop(x), 1), x): for x == 0 this
  // evaluates to 0, and for x != 0 it collapses to ctpop(x), since
  // 1 <= ctpop(x) <= x holds for all nonzero x.
  case Intrinsic::ctpop: {
    Type *TypeI32 = IntegerType::getInt32Ty(II->getContext());
    const SCEV *One = getSCEV(ConstantInt::get(TypeI32, 1, false));
    const SCEV *U = getUnknown(V);                   // the ctpop call itself
    const SCEV *LHS = getUMaxExpr(U, One);           // umax(ctpop(x), 1)
    const SCEV *RHS = getSCEV(II->getArgOperand(0)); // x
    return getUMinExpr(LHS, RHS);
  }
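As a sanity check of the intended semantics, here is the same expression evaluated with plain unsigned arithmetic rather than SCEV (proposedCtpopSCEV is only an illustrative name, not an LLVM API):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Evaluate min(max(ctpop(x), 1), x), mirroring the SCEV expression above.
static uint32_t proposedCtpopSCEV(uint32_t x) {
  uint32_t pop = (uint32_t)__builtin_popcount(x);
  uint32_t umax = pop > 1 ? pop : 1; // umax(ctpop(x), 1)
  return umax < x ? umax : x;        // umin(umax, x)
}

int main() {
  // x == 0: umax(0, 1) = 1, umin(1, 0) = 0 == ctpop(0).
  assert(proposedCtpopSCEV(0) == 0);
  // x != 0: 1 <= ctpop(x) <= x, so the expression is exactly ctpop(x).
  for (uint32_t x : {1u, 2u, 0xffu, 0x80000000u, 0xffffffffu})
    assert(proposedCtpopSCEV(x) == (uint32_t)__builtin_popcount(x));
}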
With this modification, ScalarEvolution::isLoopEntryGuardedByCond still returns false for the case above, causing the redundant umax insertion. What is the recommended approach here? Is there a simple way to fix this in SCEV, or would an update to instcombine be an easier alternative?
Thanks,
Garrett