Hello LLVM developers,
I’ve got a lengthy question regarding MemorySSA / EarlyCSE behavior; I’m not sure whether this is a bug or whether I’m completely confused. Consider the following example:
struct foo {
  unsigned a;
  unsigned b;
};

unsigned g;

int fn1(struct foo *foo) {
  if (foo->a == 1)
    g++;
  if (foo->a == 2)
    return 1;
  return 0;
}

int fn2(struct foo *foo) {
  if (foo->b == 1) // second field, force getelementptr presence
    g++;
  if (foo->b == 2)
    return 1;
  return 0;
}

int fn3(int *foo) { // just an int pointer, should be similar to foo->a
  if (*foo == 1)
    g++;
  if (*foo == 2)
    return 1;
  return 0;
}
When compiled with the following flags:
clang -target bpf -S -O2 -emit-llvm -o t.ll t.c
I get the following IR:
; ModuleID = 't.c'
source_filename = "t.c"
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "bpf"

%struct.foo = type { i32, i32 }

@g = dso_local local_unnamed_addr global i32 0, align 4

; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
define dso_local i32 @fn1(ptr nocapture noundef readonly %foo) local_unnamed_addr #0 {
entry:
  %0 = load i32, ptr %foo, align 4, !tbaa !3
  %cmp = icmp eq i32 %0, 1
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %1 = load i32, ptr @g, align 4, !tbaa !8
  %inc = add i32 %1, 1
  store i32 %inc, ptr @g, align 4, !tbaa !8
  %.pre = load i32, ptr %foo, align 4, !tbaa !3
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  %2 = phi i32 [ %.pre, %if.then ], [ %0, %entry ]
  %cmp2 = icmp eq i32 %2, 2
  %. = zext i1 %cmp2 to i32
  ret i32 %.
}

; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
define dso_local i32 @fn2(ptr nocapture noundef readonly %foo) local_unnamed_addr #0 {
entry:
  %b = getelementptr inbounds %struct.foo, ptr %foo, i64 0, i32 1
  %0 = load i32, ptr %b, align 4, !tbaa !9
  %cmp = icmp eq i32 %0, 1
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %1 = load i32, ptr @g, align 4, !tbaa !8
  %inc = add i32 %1, 1
  store i32 %inc, ptr @g, align 4, !tbaa !8
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  %cmp2 = icmp eq i32 %0, 2
  %. = zext i1 %cmp2 to i32
  ret i32 %.
}

; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn
define dso_local i32 @fn3(ptr nocapture noundef readonly %foo) local_unnamed_addr #0 {
entry:
  %0 = load i32, ptr %foo, align 4, !tbaa !8
  %cmp = icmp eq i32 %0, 1
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %1 = load i32, ptr @g, align 4, !tbaa !8
  %inc = add i32 %1, 1
  store i32 %inc, ptr @g, align 4, !tbaa !8
  %.pre = load i32, ptr %foo, align 4, !tbaa !8
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  %2 = phi i32 [ %.pre, %if.then ], [ %0, %entry ]
  %cmp1 = icmp eq i32 %2, 2
  %. = zext i1 %cmp1 to i32
  ret i32 %.
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"frame-pointer", i32 2}
!2 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git d4833e1623a0c5fd2b534a3d769e5399ac07b495)"}
!3 = !{!4, !5, i64 0}
!4 = !{!"foo", !5, i64 0, !5, i64 4}
!5 = !{!"int", !6, i64 0}
!6 = !{!"omnipotent char", !7, i64 0}
!7 = !{!"Simple C/C++ TBAA"}
!8 = !{!5, !5, i64 0}
!9 = !{!4, !5, i64 4}
Note the different number of loads of parameter foo:

- fn1 – loads %foo 2 times
- fn2 – loads %b (field of foo) 1 time
- fn3 – loads %foo 2 times
It seems to me that 2 is the correct number of loads, since the parameter foo does not have a restrict qualifier and thus might alias g (am I wrong?). In any case, the difference between fn1 and fn2 is kind of unexpected.
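As a sanity check that the aliasing is real, here is a small driver of my own (not part of the original test case): passing &g to fn3 makes the parameter alias g, so the store performed by g++ must be visible to the second load.

int main(void) {
  g = 1;
  // g++ bumps g to 2, so the reload sees *foo == 2 and fn3 returns 1;
  // if the reload were removed, the stale value 1 would be compared
  // instead and fn3 would return 0.
  return fn3((int *)&g);
}

Conversely, with int * restrict foo I would expect the single-load version to be legal.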
While debugging this behavior I found out that the second load in fn2 is removed by the EarlyCSE pass, which relies on MemorySSA for aliasing information. Specifically, EarlyCSE::isSameMemGeneration relies on the aliasing information computed by the MemorySSA.cpp:instructionClobbersQuery function. I’ve modified both functions to print additional info as follows:
// EarlyCSE.cpp
bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
                                   unsigned LaterGeneration,
                                   Instruction *EarlierInst,
                                   Instruction *LaterInst) {
  ...
  // Since we know LaterDef dominates LaterInst and EarlierInst dominates
  // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
  // EarlierInst and LaterInst and neither can any other write that potentially
  // clobbers LaterInst.
  MemoryAccess *LaterDef;
  if (ClobberCounter < EarlyCSEMssaOptCap) {
    LaterDef = MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
    ClobberCounter++;
  } else
    LaterDef = LaterMA->getDefiningAccess();

  errs() << "isSameMemGeneration:\n";
  errs() << "  EarlierGeneration = " << EarlierGeneration << "\n";
  errs() << "  LaterGeneration = " << LaterGeneration << "\n";
  errs() << "  EarlierInst = " << *EarlierInst << "\n";
  errs() << "  LaterInst = " << *LaterInst << "\n";
  errs() << "  LaterDef = " << *LaterDef << "\n";
  errs() << "  EarlierMA = " << *EarlierMA << "\n";
  errs() << "  Dominates? = " << MSSA->dominates(LaterDef, EarlierMA) << "\n";
  return MSSA->dominates(LaterDef, EarlierMA);
}
// MemorySSA.cpp
template <typename AliasAnalysisType>
static bool
instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc,
                         const Instruction *UseInst, AliasAnalysisType &AA) {
  Instruction *DefInst = MD->getMemoryInst();
  assert(DefInst && "Defining instruction not actually an instruction");

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
    ...
  }

  errs() << "instructionClobbersQuery:\n";
  errs() << "  DefInst: " << *DefInst << "\n";
  errs() << "  UseInst: " << *UseInst << "\n";

  if (auto *CB = dyn_cast_or_null<CallBase>(UseInst)) {
    ModRefInfo I = AA.getModRefInfo(DefInst, CB);
    errs() << "  call case\n";
    errs() << "  isModOrRefSet: " << isModOrRefSet(I) << "\n";
    errs() << "  isModSet: " << isModSet(I) << "\n";
    return isModOrRefSet(I);
  }

  if (auto *DefLoad = dyn_cast<LoadInst>(DefInst))
    if (auto *UseLoad = dyn_cast_or_null<LoadInst>(UseInst)) {
      errs() << "  load case\n";
      errs() << "  areLoadsReorderable: "
             << areLoadsReorderable(UseLoad, DefLoad) << "\n";
      return !areLoadsReorderable(UseLoad, DefLoad);
    }

  errs() << "  UseLoc: "; UseLoc.print(errs());
  ModRefInfo I = AA.getModRefInfo(DefInst, UseLoc);
  errs() << "  common case\n";
  errs() << "  isModOrRefSet: " << isModOrRefSet(I) << "\n";
  errs() << "  isModSet: " << isModSet(I) << "\n";
  return isModSet(I);
}
The debug log for fn1 looks as follows:
instructionClobbersQuery:
  DefInst: store i32 %inc, ptr @g, align 4, !tbaa !8
  UseInst: %2 = load i32, ptr %foo, align 4, !tbaa !3
  UseLoc: ptr %foo LocationSize::precise(4)
  common case
  isModOrRefSet: 1
  isModSet: 1
isSameMemGeneration:
  EarlierGeneration = 1
  LaterGeneration = 2
  EarlierInst = %0 = load i32, ptr %foo, align 4, !tbaa !3
  LaterInst = %2 = load i32, ptr %foo, align 4, !tbaa !3
  LaterDef = 2 = MemoryPhi({entry,liveOnEntry},{if.then,1})
  EarlierMA = MemoryUse(liveOnEntry)
  Dominates? = 0
And thus the decision is not to remove the second load.
The debug log for fn2 looks as follows:
instructionClobbersQuery:
  DefInst: store i32 %inc, ptr @g, align 4, !tbaa !8
  UseInst: %2 = load i32, ptr %b, align 4, !tbaa !3
  UseLoc: %b = getelementptr inbounds %struct.foo, ptr %foo, i64 0, i32 1 LocationSize::precise(4)
  common case
  isModOrRefSet: 0
  isModSet: 0
isSameMemGeneration:
  EarlierGeneration = 1
  LaterGeneration = 2
  EarlierInst = %0 = load i32, ptr %b, align 4, !tbaa !3
  LaterInst = %2 = load i32, ptr %b, align 4, !tbaa !3
  LaterDef = 0 = MemoryDef(liveOnEntry)
  EarlierMA = MemoryUse(liveOnEntry)
  Dominates? = 1
And thus the decision is to remove the second load.
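For what it’s worth, the MemoryPhi/MemoryDef structure above can also be inspected without patching LLVM; assuming a recent opt with the new pass manager, something like:

opt -passes='print<memoryssa>' -disable-output t.ll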
Note that while UseInst is the same load in both cases, UseLoc is not: for fn1 the location is based on %foo itself, while for fn2 it is based on the getelementptr %b. The AA query reports ModSet for the %foo location but not for the getelementptr one. (The getelementptr is absent in fn1 because a is the first field of the struct.)
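For context on why UseLoc differs: the memory location of a load is built from its pointer operand, so in fn1 the AA query is made against %foo itself, while in fn2 it is made against the GEP %b. Roughly, paraphrasing MemoryLocation::get from llvm/lib/Analysis/MemoryLocation.cpp (details may vary between versions):

// Paraphrased sketch, not an exact copy of the LLVM sources.
MemoryLocation MemoryLocation::get(const LoadInst *LI) {
  const DataLayout &DL = LI->getModule()->getDataLayout();
  // The location's pointer is the load's own pointer operand (%foo in
  // fn1, the GEP %b in fn2), with a precise size from the loaded type.
  return MemoryLocation(
      LI->getPointerOperand(),
      LocationSize::precise(DL.getTypeStoreSize(LI->getType())),
      LI->getAAMetadata());
}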
Is this expected behavior?
Thanks,
Eduard
P.S. Sorry if this is a silly question; I’m debugging a slight change in code generation caused by a transformation of mine that replaces loads with intrinsics, and I tracked it down to this.