Always inline access to registers

It dramatically reduces the build size and improves performance, especially when using opt-level = "z".

Take the following function, writing repeatedly to the SPI device:

/// Writes `data` to the SPI1 data register `count` times, busy-waiting on the
/// status register's `txe` flag before every write.
///
/// # Safety
///
/// NOTE(review): the caller contract is not spelled out in this snippet.
/// Presumably the caller must have exclusive access to an already-configured
/// SPI1 peripheral — confirm before relying on it.
pub unsafe fn spi_write_repeated(&mut self, data: u16, count: usize) {
    for _ in 0..count {
        // Handle to the SPI1 register block from the PAC; re-obtained on every iteration.
        let regs = embassy_stm32::pac::SPI1;
        // Spin until `txe` reads true (presumably "transmit buffer empty" — confirm
        // against the reference manual).
        while !regs.sr().read().txe() {}
        // Store the value into the data register.
        regs.dr().write(|w| w.set_dr(data));
    }
}

With opt-level = "z", it compiles to the following (I've added some labels for better readability):

0802bc44 <spi_write_repeated>:
 802bc44: f0 b5         push    {r4, r5, r6, r7, lr}
 802bc46: 03 af         add r7, sp, #12
 802bc48: 2d e9 f0 07   push.w  {r4, r5, r6, r7, r8, r9, r10}
 802bc4c: 0f 4d         ldr r5, [pc, #60]           @ 0x802bc8c <$d.52>
 802bc4e: 0d f1 0c 08   add.w   r8, sp, #12
 802bc52: 27 f8 22 1c   strh    r1, [r7, #-34]
 802bc56: 02 ae         add r6, sp, #8
 802bc58: a7 f1 22 09   sub.w   r9, r7, #34
 802bc5c: 92 46         mov r10, r2
 802bc5e: 00 24         movs    r4, #0
 label_loop:
   802bc60: 54 45           cmp r4, r10
   802bc62: 10 d0           beq label_out
   label_spin:
     802bc64: 28 46         mov r0, r5
     802bc66: ef f7 17 fb   bl  0x801b298 <stm32_metapac::common::Reg<T,A>::from_ptr::h29abb4b4782ce51e> @ imm = #-68050
     802bc6a: 02 90         str r0, [sp, #8]
     802bc6c: 30 46         mov r0, r6
     802bc6e: ef f7 17 fb   bl  0x801b2a0 <stm32_metapac::common::Reg<T,A>::read::h91fe28e3236dba52> @ imm = #-68050
     802bc72: 80 07         lsls    r0, r0, #30
     802bc74: f6 d5         bpl label_spin
   802bc76: 28 1d           adds    r0, r5, #4
   802bc78: 03 90           str r0, [sp, #12]
   802bc7a: 40 46           mov r0, r8
   802bc7c: 49 46           mov r1, r9
   802bc7e: fb f7 21 fa     bl  0x80270c4 <stm32_metapac::common::Reg<T,A>::write::hebd52954a8b97d57> @ imm = #-19390
 802bc82: 01 34         adds    r4, #1
 802bc84: ec e7         b   label_loop
label_out:
 802bc86: bd e8 0f 07   pop.w   {r0, r1, r2, r3, r8, r9, r10}
 802bc8a: f0 bd         pop {r4, r5, r6, r7, pc}

0802bc8c <$d.52>:
 802bc8c:   08 30 01 40 .word   0x40013008

With the inline annotations this PR brings:

0802af14 <spi_write_repeated>:
 802af14: d0 b5         push    {r4, r6, r7, lr}
 802af16: 02 af         add r7, sp, #8
 802af18: 05 4b         ldr r3, [pc, #20]           @ 0x802af30 <$d.52>
 802af1a: 88 b2         uxth    r0, r1
 802af1c: 00 21         movs    r1, #0
 label_loop:
   802af1e: 91 42           cmp r1, r2
   802af20: 05 d0           beq label_out
   label_spin:
     802af22: 1c 68         ldr r4, [r3]
     802af24: a4 07         lsls    r4, r4, #30
     802af26: fc d5         bpl label_spin
   802af28: 58 60           str r0, [r3, #4]
   802af2a: 01 31           adds    r1, #1
   802af2c: f7 e7           b   label_loop
 label_out:
 802af2e: d0 bd         pop {r4, r6, r7, pc}

0802af30 <$d.52>:
 802af30:   08 30 01 40 .word   0x40013008

The code is much faster, and smaller.

I don't see any reason not to always inline register accesses. What do you think?

embassy-rs / chiptool

Always inline access to registers #4