Dead stacksave/stackrestore pair not eliminated

Hey all,

I've encountered a potentially missed optimization opportunity.
It is distilled from a real-world, albeit somewhat unfortunate, code pattern.
The gist of it is that a VLA is used in a loop body, where the length is
actually constant. All (constant-index) accesses to the VLA are mem2reg-d at the
IR level at O1 and above, but because of an intervening-but-unrelated load the
stacksave/stackrestore pair is not eliminated. In the following example, if the
load from a[0] is hoisted outside the loop, the stacksave/stackrestore pair is
successfully eliminated. It can be seen that no allocas are actually present
between the stacksave/stackrestore, and so, the stack save point is dead, if
I'm reasoning about this correctly. It looks like this also becomes an
optimization barrier for vectorizing the loop after unrolling it (change a[0]
to a[i] and -O1 to -O3: the loop is only unrolled, not vectorized. Further
change int local[dim] to int local[1] and it is vectorized as well).

I was curious to see why compiling this for X86 leaves no trace of the IR
stacksave/stackrestore pair, and running it through llc I saw that they survive
all the way down to the peephole optimizer, where the redundant $rsp copy is
eliminated (see excerpt below).
Compiler Explorer link: Compiler Explorer

What's the best way to go about debugging this? What part of the optimization
pipeline should I expect to be responsible for this issue?

// foo.c
// clang -O1 foo.c -S -emit-llvm -o foo.ll
// Minimal reproducer for the missed optimization: accumulates
// local[0] * a[0] over 100 iterations, where local[0] is always m.
// The exact shape of the loop body matters; do not "simplify" it.
int foo(int m, int *a) {
    // A const-qualified variable, not an integer constant expression in C,
    // so arrays sized by it are VLAs even though the value is always 1.
    const int dim = 1;
    int x = 0;

    for (int i = 0; i < 100; i++) {
        // VLA declared inside the loop body: this is what makes clang emit
        // the llvm.stacksave/llvm.stackrestore pair seen in foo.ll.
        int local[dim];
        local[0] = m;
        // The load from a[0] stays inside the loop; per the report, hoisting
        // it out of the loop lets the stacksave/stackrestore pair be removed.
        x += local[0] * a[0];
    }

    return x;
}

// foo.ll
// llc foo.ll -o /dev/null -debug 2>llc_debug.txt
define dso_local i32 @foo(i32 %0, i32* nocapture readonly %1)
local_unnamed_addr #0 !dbg !7 {
  call void @llvm.dbg.value(metadata i32 %0, metadata !14, metadata
!DIExpression()), !dbg !27
  call void @llvm.dbg.value(metadata i32* %1, metadata !15, metadata
!DIExpression()), !dbg !27
  call void @llvm.dbg.value(metadata i32 1, metadata !16, metadata
!DIExpression()), !dbg !27
  call void @llvm.dbg.value(metadata i32 0, metadata !18, metadata
!DIExpression()), !dbg !27
  call void @llvm.dbg.value(metadata i32 0, metadata !19, metadata
!DIExpression()), !dbg !28
  br label %4, !dbg !29

3: ; preds = %4
  ret i32 %10, !dbg !30

4: ; preds = %2, %4
  %5 = phi i32 [ 0, %2 ], [ %10, %4 ]
  %6 = phi i32 [ 0, %2 ], [ %11, %4 ]
  call void @llvm.dbg.value(metadata i32 %5, metadata !18, metadata
!DIExpression()), !dbg !27
  call void @llvm.dbg.value(metadata i32 %6, metadata !19, metadata
!DIExpression()), !dbg !28
  %7 = call i8* @llvm.stacksave(), !dbg !31
  call void @llvm.dbg.value(metadata i32 %0, metadata !21, metadata
!DIExpression()), !dbg !32
  %8 = load i32, i32* %1, align 4, !dbg !33, !tbaa !34
  %9 = mul nsw i32 %8, %0, !dbg !38
  %10 = add nsw i32 %9, %5, !dbg !39
  call void @llvm.dbg.value(metadata i32 %10, metadata !18, metadata
!DIExpression()), !dbg !27
  call void @llvm.stackrestore(i8* %7), !dbg !40
  %11 = add nuw nsw i32 %6, 1, !dbg !41
  call void @llvm.dbg.value(metadata i32 %11, metadata !19, metadata
!DIExpression()), !dbg !28
  %12 = icmp eq i32 %11, 100, !dbg !42
  br i1 %12, label %3, label %4, !dbg !29, !llvm.loop !43
}

// llc_debug.txt
// ...
******** Machine Sinking ********
********** PEEPHOLE OPTIMIZER **********
********** Function: local_arrays
Optimize recurrence chain from %3:gr32 = PHI %13:gr32, %bb.0, %5:gr32, %bb.2
    Inst: %5:gr32 = nsw ADD32rr %15:gr32(tied-def 0), %3:gr32,
implicit-def dead $eflags
        Commuted: %5:gr32 = nsw ADD32rr %3:gr32(tied-def 0), %15:gr32,
implicit-def dead $eflags
Optimize recurrence chain from %4:gr32 = PHI %13:gr32, %bb.0, %6:gr32, %bb.2
    Inst: %6:gr32 = nsw ADD32rr %16:gr32(tied-def 0), %4:gr32,
implicit-def dead $eflags
        Commuted: %6:gr32 = nsw ADD32rr %4:gr32(tied-def 0), %16:gr32,
implicit-def dead $eflags
NAPhysCopy: erasing $rsp = COPY %14:gr64
Deleting redundant copy: $rsp = COPY %14:gr64

Attempting to optimize compare: %17:gr64 = SUB64ri8 %7:gr64(tied-def
0), 100, implicit-def $eflags
DeadMachineInstructionElim: DELETING: %14:gr64 = COPY $rsp
// ...

~Itay