Optimizing Load-multiple and Store-multiple on LLVM

After reading “5.6 The Load-Multiple and Store-Multiple Instructions” chapter from "Design of the RISC-V Instruction Set Architecture " https://people.eecs.berkeley.edu/~krste/papers/EECS-2016-1.pdf I decided to verify how GCC and LLVM had actually implemented the load-multiple and save-multiple instructions in order to save and restore up to 13 registers with a single command (ra + s0-s11).

Here is their current implementation:

The two implementations are almost identical (the only difference is that GCC, in __riscv_save_XX, performs an extra “slli t1, t1, 4” which can be easily avoided by pre-shifting some constants).
Both have a pair of functions dedicated to the simplest case (__riscv_save_0-1 and __riscv_restore_0-1) and a pair of generic functions for all the other cases (__riscv_save_2-11 and __riscv_restore_2-11).

The implementation of the generic functions doesn’t seem optimal to me: __riscv_save_2–11 uses a chain of jumps that could be avoided by organizing the function differently; similarly, __riscv_restore_2–11 uses a chain of “addi sp, sp, 16” which could also be avoided.

Saving ra/s0/…/s11 in reverse order on the stack could further simplify and speed up the implementation, but I fear that this change might violate the RISC-V ABI (at least when using the frame pointer: see https://lists.riscv.org/g/tech-psabi/attachment/154/0/Qualcomm%20RISC-V%20Push&Pop&FP%20Proposal.pdf ).

Below you’ll find the original LLVM code (only in the 64-bit version for simplicity), my first proposal that uses a single “jump” and a single “add …” even when saving up to 13 registers, and my second proposal, which is even simpler (but reverses the order in which the registers are saved on the stack).

Let me know what you think (but above all, let me know whether my proposals make sense or if, for some reason, they are not feasible/beneficial).

LLVM original save/restore 64bit:

__riscv_save_12:
addi   sp, sp, -112
mv     t1, zero
sd     s11, 8(sp)
j      .Lriscv_save_11_10

__riscv_save_11:
__riscv_save_10:
addi   sp, sp, -112
li     t1, 16
.Lriscv_save_11_10:
sd     s10, 16(sp)
sd     s9,  24(sp)
j      .Lriscv_save_9_8

__riscv_save_9:
__riscv_save_8:
addi   sp, sp, -112
li     t1, 32
.Lriscv_save_9_8:
sd     s8,  32(sp)
sd     s7,  40(sp)
j      .Lriscv_save_7_6

__riscv_save_7:
__riscv_save_6:
addi   sp, sp, -112
li     t1, 48
.Lriscv_save_7_6:
sd     s6,  48(sp)
sd     s5,  56(sp)
j      .Lriscv_save_5_4

__riscv_save_5:
__riscv_save_4:
addi   sp, sp, -112
li     t1, 64
.Lriscv_save_5_4:
sd     s4, 64(sp)
sd     s3, 72(sp)
j      .Lriscv_save_3_2

__riscv_save_3:
__riscv_save_2:
addi   sp, sp, -112
li     t1, 80
.Lriscv_save_3_2:
sd     s2, 80(sp)
sd     s1, 88(sp)
sd     s0, 96(sp)
sd     ra, 104(sp)
add    sp, sp, t1
jr     t0

__riscv_save_1:
__riscv_save_0:
addi   sp, sp, -16
sd     s0, 0(sp)
sd     ra, 8(sp)
jr     t0

__riscv_restore_12:
ld      s11, 8(sp)
addi    sp, sp, 16
__riscv_restore_11:
__riscv_restore_10:
ld      s10, 0(sp)
ld      s9,  8(sp)
addi    sp, sp, 16
__riscv_restore_9:
__riscv_restore_8:
ld      s8,  0(sp)
ld      s7,  8(sp)
addi    sp, sp, 16
__riscv_restore_7:
__riscv_restore_6:
ld      s6,  0(sp)
ld      s5,  8(sp)
addi    sp, sp, 16
__riscv_restore_5:
__riscv_restore_4:
ld      s4,  0(sp)
ld      s3,  8(sp)
addi    sp, sp, 16
__riscv_restore_3:
__riscv_restore_2:
ld      s2,  0(sp)
ld      s1,  8(sp)
addi    sp, sp, 16
__riscv_restore_1:
__riscv_restore_0:
ld      s0,  0(sp)
ld      ra,  8(sp)
addi    sp, sp, 16
ret

My save/restore 64bit proposal (without “jump” chain):

__riscv_save_11:
__riscv_save_10:
addi   sp, sp, -112
li     t1, 16
j      .Lriscv_save_11_10

__riscv_save_9:
__riscv_save_8:
addi   sp, sp, -112
li     t1, 32
j      .Lriscv_save_9_8

__riscv_save_7:
__riscv_save_6:
addi   sp, sp, -112
li     t1, 48
j      .Lriscv_save_7_6

__riscv_save_5:
__riscv_save_4:
addi   sp, sp, -112
li     t1, 64
j      .Lriscv_save_5_4

# This specific case executes 1 more instruction than LLVM so probably deserves an ad-hoc function like __riscv_save_0-1:
__riscv_save_3:
__riscv_save_2:
addi   sp, sp, -112
li     t1, 80
j      .Lriscv_save_3_2

__riscv_save_12:
addi   sp, sp, -112
li     t1, 0
sd     s11, 8(sp)
.Lriscv_save_11_10:
sd     s10, 16(sp)
sd     s9,  24(sp)
.Lriscv_save_9_8:
sd     s8,  32(sp)
sd     s7,  40(sp)
.Lriscv_save_7_6:
sd     s6,  48(sp)
sd     s5,  56(sp)
.Lriscv_save_5_4:
sd     s4, 64(sp)
sd     s3, 72(sp)
.Lriscv_save_3_2:
sd     s2, 80(sp)
sd     s1, 88(sp)
sd     s0, 96(sp)
sd     ra, 104(sp)
add    sp, sp, t1
jr     t0

__riscv_save_1:
__riscv_save_0:
addi   sp, sp, -16
sd     s0, 0(sp)
sd     ra, 8(sp)
jr     t0

__riscv_restore_11:
__riscv_restore_10:
addi   sp, sp, -16
j      .Lriscv_save_11_10

__riscv_restore_9:
__riscv_restore_8:
addi   sp, sp, -32
j      .Lriscv_save_9_8

__riscv_restore_7:
__riscv_restore_6:
addi   sp, sp, -48
j      .Lriscv_save_7_6

__riscv_restore_5:
__riscv_restore_4:
addi   sp, sp, -64
j      .Lriscv_save_5_4

__riscv_restore_12:
ld     s11, 8(sp)
.Lriscv_restore_11_10:
ld     s10, 16(sp)
ld     s9,  24(sp)
.Lriscv_restore_9_8:
ld     s8,  32(sp)
ld     s7,  40(sp)
.Lriscv_restore_7_6:
ld     s6,  48(sp)
ld     s5,  56(sp)
.Lriscv_restore_5_4:
ld     s4, 64(sp)
ld     s3, 72(sp)
ld     s2, 80(sp)
ld     s1, 88(sp)
ld     s0, 96(sp)
ld     ra, 104(sp)
addi   sp, sp, 112
ret

# For __riscv_restore_2-3 I use LLVM approach because in this specific case my approach would execute 1 more instruction than LLVM
__riscv_restore_3:
__riscv_restore_2:
ld      s2,  0(sp)
ld      s1,  8(sp)
addi    sp, sp, 16
__riscv_restore_1:
__riscv_restore_0:
ld      s0,  0(sp)
ld      ra,  8(sp)
addi    sp, sp, 16
ret

My alternative save/restore 64bit proposal (simpler but with reverse register order):

__riscv_save_11:
__riscv_save_10:
addi   sp, sp, -96
j      .Lriscv_save_11_10

__riscv_save_9:
__riscv_save_8:
addi   sp, sp, -80
j      .Lriscv_save_9_8

__riscv_save_7:
__riscv_save_6:
addi   sp, sp, -64
j      .Lriscv_save_7_6

__riscv_save_5:
__riscv_save_4:
addi   sp, sp, -48
j      .Lriscv_save_5_4

__riscv_save_3:
__riscv_save_2:
addi   sp, sp, -32
j      .Lriscv_save_3_2

__riscv_save_12:
addi   sp, sp, -112
sd     s11, 96(sp)
.Lriscv_save_11_10:
sd     s10, 88(sp)
sd     s9, 80(sp)
.Lriscv_save_9_8:
sd     s8, 72(sp)
sd     s7, 64(sp)
.Lriscv_save_7_6:
sd     s6, 56(sp)
sd     s5, 48(sp)
.Lriscv_save_5_4:
sd     s4, 40(sp)
sd     s3, 32(sp)
.Lriscv_save_3_2:
sd     s2, 24(sp)
sd     s1, 16(sp)
sd     s0, 8(sp)
sd     ra, 0(sp)
jr     t0

__riscv_save_1:
__riscv_save_0:
addi   sp, sp, -16
sd     s0, 8(sp)
sd     ra, 0(sp)
jr     t0

__riscv_restore_11:
__riscv_restore_10:
li     t1, 96
j .Lriscv_restore_11_10

__riscv_restore_9:
__riscv_restore_8:
li     t1, 80
j .Lriscv_restore_9_8

__riscv_restore_7:
__riscv_restore_6:
li     t1, 64
j .Lriscv_restore_7_6

__riscv_restore_5:
__riscv_restore_4:
li     t1, 48
j .Lriscv_restore_5_4

__riscv_restore_12:
li     t1, 112
sd     s11, 96(sp)
.Lriscv_restore_11_10
sd     s10, 88(sp)
sd     s9, 80(sp)
.Lriscv_restore_9_8:
sd     s8, 72(sp)
sd     s7, 64(sp)
.Lriscv_restore_7_6:
sd     s6, 56(sp)
sd     s5, 48(sp)
.Lriscv_restore_5_4:
sd     s4, 40(sp)
sd     s3, 32(sp)
sd     s2, 24(sp)
sd     s1, 16(sp)
sd     s0, 8(sp)
sd     ra, 0(sp)
add    sp, sp, t1
ret

# Here I use an ad-hoc function because in this specific case my approach would execute 1 more instruction than LLVM
__riscv_restore_3:
__riscv_restore_2:
ld      s2,  24(sp)
ld      s1,  16(sp)
ld      s0,  8(sp)
ld      ra,  0(sp)
addi    sp, sp, 32
ret

__riscv_restore_1:
__riscv_restore_0:
ld      s0,  8(sp)
ld      ra,  0(sp)
addi    sp, sp, 16
ret