Off-by-one error when accessing stack through `RBP`

Minimal example:

#include <stdio.h>

typedef struct {
    char *data;
    char *other;
} Params;

void test(Params *params) {
    printf("data = %p\n", params->data);
    printf("other = %p\n", params->other);
}

int main() {
    char *input = "Hello, World!";
    Params params;
    params.data = input;
    params.other = NULL;
    test(&params);
    return 0;
}

Output

Running the original and raised binary produces the following output:

$ ./original
data = 0x2004db
other = (nil)
$ ./raised
data = 0x2004f7
other = 0x201640

Machine code

Compiled with clang-13 -O0

test:                                   # @test
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16
        mov     qword ptr [rbp - 8], rdi
        mov     rax, qword ptr [rbp - 8]
        mov     rsi, qword ptr [rax]
        movabs  rdi, offset .L.str
        mov     al, 0
        call    printf
        mov     rax, qword ptr [rbp - 8]
        mov     rsi, qword ptr [rax + 8]
        movabs  rdi, offset .L.str.1
        mov     al, 0
        call    printf
        add     rsp, 16
        pop     rbp
        ret
main:                                   # @main
        push    rbp
        mov     rbp, rsp # updating rbp, which is not handled by mctoll
        sub     rsp, 32
        mov     dword ptr [rbp - 4], 0
        movabs  rax, offset .L.str.2
        mov     qword ptr [rbp - 16], rax
        mov     rax, qword ptr [rbp - 16]
        mov     qword ptr [rbp - 32], rax
        mov     qword ptr [rbp - 24], 0
        lea     rdi, [rbp - 32]
        call    test
        xor     eax, eax
        add     rsp, 32
        pop     rbp
        ret
.L.str:
        .asciz  "data = %p\n"

.L.str.1:
        .asciz  "other = %p\n"

.L.str.2:
        .asciz  "Hello, World!"

Raised bitcode

; ModuleID = 'other/array_acc'
source_filename = "other/array_acc"

@rodata_11 = private unnamed_addr constant [41 x i8] c"\01\00\02\00data = %p\0A\00other = %p\0A\00Hello, World!\00", align 4, !ROData_SecInfo !0

declare dso_local i32 @printf(i8*, ...)

define dso_local i32 @test(i64 %arg1) {
entry:
  %stktop_8 = alloca i8, i32 24, align 1
  %RSPAdj_N.16 = bitcast i8* %stktop_8 to i64*
  %0 = getelementptr i8, i8* %stktop_8, i64 16
  %RBP_N.8 = bitcast i8* %0 to i64*
  %1 = getelementptr i8, i8* %stktop_8, i64 0
  %RSP_P.0 = bitcast i8* %1 to i64*
  store i64 3735928559, i64* %RSP_P.0, align 8
  %RBP = ptrtoint i64* %RSP_P.0 to i64
  store i64 %arg1, i64* %RBP_N.8, align 1
  %memload = load i64, i64* %RBP_N.8, align 1
  %2 = inttoptr i64 %memload to i64*
  %memload1 = load i64, i64* %2, align 1
  %EAX = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @rodata_11, i32 0, i32 4), i64 %memload1)
  %memload2 = load i64, i64* %RBP_N.8, align 1
  %memref-disp = add i64 %memload2, 8
  %3 = inttoptr i64 %memref-disp to i64*
  %memload3 = load i64, i64* %3, align 1
  %EAX4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @rodata_11, i32 0, i32 15), i64 %memload3)
  ret i32 %EAX4
}

define dso_local i32 @main() {
entry:
  %stktop_8 = alloca i8, i32 40, align 1
  %RSPAdj_N.32 = bitcast i8* %stktop_8 to i64*
  %0 = getelementptr i8, i8* %stktop_8, i64 16
  %RBP_N.24 = bitcast i8* %0 to i64*
  %1 = getelementptr i8, i8* %stktop_8, i64 24
  %RBP_N.16 = bitcast i8* %1 to i64*
  %2 = getelementptr i8, i8* %stktop_8, i64 36
  %RBP_N.4 = bitcast i8* %2 to i32*
  %3 = getelementptr i8, i8* %stktop_8, i64 0
  %RSP_P.0 = bitcast i8* %3 to i64*
  store i64 3735928559, i64* %RSP_P.0, align 8
  %RBP = ptrtoint i64* %RSP_P.0 to i64
  store i32 0, i32* %RBP_N.4, align 1
  ; %4 = input
  %4 = ptrtoint i8* getelementptr inbounds ([41 x i8], [41 x i8]* @rodata_11, i32 0, i32 27) to i64, !ROData_Index !1
  store i64 %4, i64* %RBP_N.16, align 1
  %memload = load i64, i64* %RBP_N.16, align 1
  store i64 %memload, i64* %RSPAdj_N.32, align 1
  ; store 0 in RBP_N.24, which is calculated to be %stktop_8[16]
  ; this is done because we have a stack of size 40
  ; the problem is that the value is accessed relative to RBP with 
  ;   mov qword ptr [rbp - 24], 0
  ; Detailed explanation with stack layout drawn attached
  %5 = zext i32 0 to i64
  store i64 %5, i64* %RBP_N.24, align 1
  %RDI = ptrtoint i64* %RSPAdj_N.32 to i64
  %EAX = call i32 @test(i64 %RDI)
  ret i32 0
}

!0 = !{i64 2098368}
!1 = !{i8* getelementptr inbounds ([41 x i8], [41 x i8]* @rodata_11, i32 0, i32 27)}

Stack layout of main

Stack just before calling test:

The total stack size used by the function is 40 bytes, 8 bytes by push rbp, while a further 32 bytes are allocated with sub rsp, 32. Mctoll incorrectly assumes that RBP points to the top of the stack (RSP when entering the function), which is off by 8 bytes, as the new value of RBP is set after push rbp.

I've only observed this bug in programs compiled with -O0, as (as far as I've seen) the stack is accessed relative to RSP with higher optimizations: https://godbolt.org/z/3vn6sMPPd. While I have been trying to fix this issue, I've put it on hold because of this.

microsoft / llvm-mctoll