SPGO incorrectly loading weights since pre-inlined call does not match LTO inlined function

At MediaTek, we noticed a situation where counts from a cross-module inlined call do not get applied to a block, resulting in incorrect PGO information. This can prevent optimizations (inlining, block placement) if the block contains only the call IR instruction (and a terminator). The issue occurs with debug-info-based Sample PGO (I tested regular SPGO and CSSPGO), but it does not occur when using Probes.

We were wondering what people know about the limitations of matching cross-module inlining samples, and whether others have encountered similar issues.

Example with Minimal Reproduction on X86

Within the function caller, SPGO should tell us that !!G is likely and that callee() is hot. During regular sample loading, the block containing callee has only two LLVM instructions: call @callee and br. Sample loading ignores jumps, so only the call can have samples assigned to it. The call never matches with the debug info, so the block and its incoming edge have weights of zero.

  br i1 %tobool.not, label %if.end, label %if.then, !dbg !77, !prof !78
  ; !78 = !{!"branch_weights", i32 9853, i32 0}

if.then:                                          ; preds = %for.body
  tail call void (...) @callee() #5, !dbg !79
// file:cross1.c
#include <stdio.h>

/* Defined in cross2.c, i.e. in a different translation unit than caller(). */
extern void callee();

/*
 * Loop bound and branch condition. Both are volatile, so every read is a real
 * load and the compiler cannot fold the loop or the `if` away.
 */
volatile int N = 10000000;
volatile int G = 1;
/*
 * Hot loop under test. G is initialised to 1 and never written in the shown
 * sources, so the `if` is taken and callee() runs on every iteration.
 * noinline keeps caller() as a separate function (not inlined into main).
 *
 * NOTE(review): the sample profile appears to key samples by line offset
 * from the start of this function, so avoid inserting or removing lines
 * inside the body when reusing the profile.
 */
__attribute__((noinline))
void caller() {
  for (int i = 0; i < N; i++) {
    printf("a");
    if (!!G) // SPGO incorrectly labels this as unlikely
      callee(); // SPGO incorrectly considers this cold
    printf("b");
  }
}

/* Program entry point: runs the hot loop in caller() once and returns 0. */
int main() {
  caller();
  return 0;
}
// file:cross2.c
#include <stdio.h>

/*
 * Leaf helper that prints a single "." per call. always_inline requests that
 * it be inlined into its callers; it is called from callee() in this file.
 */
__attribute__((always_inline))
void bar() {
  printf(".");
}

/* Counter bumped on every callee() call; volatile, so the increment is kept. */
volatile unsigned short G2;
/*
 * Function called from caller() in cross1.c. It increments G2 and then calls
 * bar(). always_inline requests inlining into callers; since the only caller
 * shown lives in another translation unit, that can only happen once both
 * modules are visible together (the LTO link step in the build script).
 */
__attribute__((always_inline))
void callee() {
  G2++;
  bar();
}

Regular SPGO text profile from perf record -b ...

caller:99622:0
 1.1: 0
 1.2: 1729
 1.1025: 1729
 2: 1729
 3: 1699
 5: 1734
 7: 0
 65528: 0
 4: callee:32411
  1: 1699
  2: bar:20518 
   1: 1699
   65532: 1764

Build script in bash

# Tools used by the reproduction.
CLANG=clang
PROFGEN=llvm-profgen
DIS=llvm-dis
OBJDUMP=llvm-objdump
# Uncomment to build with pseudo probes. PROBE is otherwise unset and expands
# to nothing in the commands below.
# PROBE=-fpseudo-probe-for-profiling
 
# Step 1: build the un-profiled LTO binary (./cross) with debug info suitable
# for sample profiling.
$CLANG -O3 -c cross1.c -o cross1.o -flto -g -fdebug-info-for-profiling $PROBE
$CLANG -O3 -c cross2.c -o cross2.o -flto -g -fdebug-info-for-profiling $PROBE
$CLANG cross1.o cross2.o -flto -O3 -o cross -g -fdebug-info-for-profiling  $PROBE -fuse-ld=lld
# Step 2: collect branch samples from a run of ./cross.
# default SPGO
perf record -b -- ./cross > /dev/null
# CSSPGO (alternative to the perf command above)
# perf record -g --call-graph fp -e br_inst_retired.near_taken:uppp -c 16009 -b -- ./cross > /dev/null
# Step 3: convert perf.data into a text-format sample profile (cross.prof).
$PROFGEN --perfdata ./perf.data --binary ./cross --output ./cross.prof --format text

# Uncomment to dump sample-profile debug output and the IR before/after the
# sample-profile pass. DEBUG is otherwise unset and expands to nothing.
# DEBUG='-mllvm -debug-only=sample-profile,sample-profile-impl -mllvm -print-after=sample-profile -mllvm -print-before=sample-profile -mllvm -print-module-scope'

# Step 4: recompile each module with the profile applied (pre-link compile).
$CLANG -O3 -g -c cross1.c -o cross1pgo.o -flto -c $PROBE -fbasic-block-sections=labels -mllvm --pgo-analysis-map='bb-freq,br-prob' -fprofile-sample-use=cross.prof $DEBUG
# Disassemble the bitcode object; inspect cross1pgo.o.ll to see the
# incorrectly biased branch_weights on the branch guarding callee().
$DIS cross1pgo.o

$CLANG -O3 -g -c cross2.c -o cross2pgo.o -flto -c $PROBE -fbasic-block-sections=labels -mllvm --pgo-analysis-map='bb-freq,br-prob' -fprofile-sample-use=cross.prof $DEBUG
# Step 5: LTO-link the profiled objects, emitting the PGO analysis map
# (branch probabilities) into the final binary.
$CLANG cross1pgo.o cross2pgo.o -flto -O3 -o crosspgo $PROBE -fuse-ld=lld -fprofile-sample-use=cross.prof -fbasic-block-sections=labels -Wl,-mllvm,--basic-block-sections=labels -Wl,-mllvm,--pgo-analysis-map='br-prob'

# Step 6: disassemble the final binary with source interleaved and the PGO
# analysis map pretty-printed.
$OBJDUMP --symbolize-operands --pretty-pgo-analysis-map -S crosspgo