At MediaTek, we noticed a situation where counts from a cross-module inlined call do not get applied to a block, resulting in incorrect PGO information. This can prevent optimizations (inlining, block placement) if the block contains only the call IR instruction (and a terminator). This issue occurs with debug-info-based Sample PGO (I tested regular SPGO and CSSPGO), but it does not occur when using pseudo-probes.
We were wondering what people know about the limitations of matching cross-module inlining samples, or whether people have encountered similar issues.
Example with Minimal Reproduction on X86
Within the function `caller`, SPGO should tell us that `!!G` is likely and that the call to `callee()` is hot. During regular sample loading, the block containing `callee` has only two LLVM instructions: `call @callee` and `br`. Sample loading ignores jumps, so only the `call` can have samples assigned to it. The call never matches with the debug info, so the block and its incoming edge have weights of zero.
br i1 %tobool.not, label %if.end, label %if.then, !dbg !77, !prof !78
; !78 = !{!"branch_weights", i32 9853, i32 0}
if.then: ; preds = %for.body
tail call void (...) @callee() #5, !dbg !79
// file:cross1.c
#include <stdio.h>
extern void callee(); // defined in cross2.c; declared here without a prototype (empty parameter list)
volatile int N = 10000000; // loop bound read by caller(); volatile, so each access is a real load
volatile int G = 1; // condition tested in caller(); initialized nonzero and never written in this file
__attribute__((noinline)) // keep caller() as its own function instead of letting it be inlined into main()
void caller() { // runs the loop whose branch weights this reproduction examines
for (int i = 0; i < N; i++) { // N is volatile, so the bound is reloaded for every comparison
printf("a");
if (!!G) // G is initialized to 1, yet SPGO incorrectly labels this as unlikely
callee(); // only statement in the guarded block; SPGO incorrectly considers this cold
printf("b");
}
}
int main() { // program entry point: runs the profiled loop once
caller();
return 0; // always exits with status 0
}
// file:cross2.c
#include <stdio.h>
__attribute__((always_inline)) // request that bar() be inlined into its callers
void bar() { // leaf helper invoked from callee()
printf("."); // writes a single character to stdout
}
volatile unsigned short G2; // counter incremented by callee(); volatile, so the update is a real load and store
__attribute__((always_inline)) // request inlining; the only call site is in cross1.c, i.e. in another module
void callee() { // function called from the guarded block in caller()
G2++;
bar(); // bar() is marked always_inline as well
}
Regular SPGO text profile from perf record -b ...
caller:99622:0
 1.1: 0
 1.2: 1729
 1.1025: 1729
 2: 1729
 3: 1699
 5: 1734
 7: 0
 65528: 0
 4: callee:32411
  1: 1699
  2: bar:20518
   1: 1699
   65532: 1764
Build script in bash
# Tool names; point these at a specific LLVM build if needed.
CLANG=clang
PROFGEN=llvm-profgen
DIS=llvm-dis
OBJDUMP=llvm-objdump
# Optional: uncomment to build with pseudo-probes (the issue does not reproduce with probes).
# PROBE=-fpseudo-probe-for-profiling
# Step 1: build the unprofiled LTO binary with debug info suitable for sample profiling.
# $PROBE is deliberately left unquoted so that it expands to nothing when unset.
$CLANG -O3 -c cross1.c -o cross1.o -flto -g -fdebug-info-for-profiling $PROBE
$CLANG -O3 -c cross2.c -o cross2.o -flto -g -fdebug-info-for-profiling $PROBE
$CLANG cross1.o cross2.o -flto -O3 -o cross -g -fdebug-info-for-profiling $PROBE -fuse-ld=lld
# Step 2 (default SPGO): run the binary under perf with branch-stack sampling (-b).
perf record -b -- ./cross > /dev/null
# Step 2 alternative (CSSPGO): use this perf command instead of the one above.
# perf record -g --call-graph fp -e br_inst_retired.near_taken:uppp -c 16009 -b -- ./cross > /dev/null
# Step 3: convert perf.data into a text-format sample profile (cross.prof).
$PROFGEN --perfdata ./perf.data --binary ./cross --output ./cross.prof --format text
# Optional: uncomment to dump sample-profile debug output and the IR around the sample-profile pass.
# DEBUG='-mllvm -debug-only=sample-profile,sample-profile-impl -mllvm -print-after=sample-profile -mllvm -print-before=sample-profile -mllvm -print-module-scope'
# Step 4: recompile each module using the sample profile. $DEBUG is empty unless set above.
# NOTE(review): unlike step 1, these compiles do not pass -fdebug-info-for-profiling, and -c
# appears twice on each line - confirm both are intended.
$CLANG -O3 -g -c cross1.c -o cross1pgo.o -flto -c $PROBE -fbasic-block-sections=labels -mllvm --pgo-analysis-map='bb-freq,br-prob' -fprofile-sample-use=cross.prof $DEBUG
# Disassemble the bitcode object to cross1pgo.o.ll and inspect the incorrectly biased branch_weights there.
$DIS cross1pgo.o
$CLANG -O3 -g -c cross2.c -o cross2pgo.o -flto -c $PROBE -fbasic-block-sections=labels -mllvm --pgo-analysis-map='bb-freq,br-prob' -fprofile-sample-use=cross.prof $DEBUG
# Step 5: LTO-link the profiled objects, forwarding the basic-block-section and PGO analysis map options to the linker.
$CLANG cross1pgo.o cross2pgo.o -flto -O3 -o crosspgo $PROBE -fuse-ld=lld -fprofile-sample-use=cross.prof -fbasic-block-sections=labels -Wl,-mllvm,--basic-block-sections=labels -Wl,-mllvm,--pgo-analysis-map='br-prob'
# Step 6: disassemble the final binary with source interleaved and the PGO analysis map shown.
$OBJDUMP --symbolize-operands --pretty-pgo-analysis-map -S crosspgo