Optimize calls and recursive calls?

cosmos72 commented 3 years ago

I am evaluating mir, and I noticed some redundancy in generated code for function calls.

The example below is intended to be equivalent to this C code:

uint64_t fibonacci(uint64_t n) {
  if (n > 2) {
    n = fibonacci(n-1) + fibonacci(n-2)
  } else {
    n = 1
  }
  return n;
}

Here is the actual mir code I tried:

#include "mir.h"
#include "mir-gen.h"

int main(void) {
    const int thread = 0;

    MIR_context_t ctx = MIR_init();
    MIR_gen_init(ctx, thread + 1);
    MIR_gen_set_optimize_level(ctx, thread, 3 /*-O3*/);

    MIR_module_t m = MIR_new_module(ctx, "m");

    MIR_type_t ret_types[] = {MIR_T_I64};
    MIR_item_t func = MIR_new_func(ctx, "fibonacci", 1, ret_types, 1, MIR_T_I64, "n");
    MIR_reg_t N = MIR_reg(ctx, "n", func->u.func);
    MIR_reg_t N_1 = MIR_new_func_reg(ctx, func->u.func, MIR_T_I64, "n_1");
    MIR_reg_t N_2 = MIR_new_func_reg(ctx, func->u.func, MIR_T_I64, "n_2");
    MIR_label_t small = MIR_new_label(ctx), out = MIR_new_label(ctx);

    MIR_item_t fwd = MIR_new_forward(ctx, "fibonacci");
    MIR_item_t proto = MIR_new_proto(ctx, "fibonacci_proto_", 1, ret_types, 1, MIR_T_I64, "n");

    MIR_append_insn(ctx, func,
                    MIR_new_insn(ctx, MIR_UBLE, MIR_new_label_op(ctx, small),
                                 MIR_new_reg_op(ctx, N), MIR_new_uint_op(ctx, 2)));
    MIR_append_insn(ctx, func,
                    MIR_new_insn(ctx, MIR_SUB, MIR_new_reg_op(ctx, N_1), MIR_new_reg_op(ctx, N),
                                 MIR_new_uint_op(ctx, 1)));
    MIR_append_insn(ctx, func,
                    MIR_new_insn(ctx, MIR_SUB, MIR_new_reg_op(ctx, N_2), MIR_new_reg_op(ctx, N),
                                 MIR_new_uint_op(ctx, 2)));

    MIR_append_insn(ctx, func,
                    MIR_new_call_insn(ctx, 4, MIR_new_ref_op(ctx, proto), MIR_new_ref_op(ctx, fwd),
                                      MIR_new_reg_op(ctx, N_1), MIR_new_reg_op(ctx, N_1)));
    MIR_append_insn(ctx, func,
                    MIR_new_call_insn(ctx, 4, MIR_new_ref_op(ctx, proto), MIR_new_ref_op(ctx, fwd),
                                      MIR_new_reg_op(ctx, N_2), MIR_new_reg_op(ctx, N_2)));
    MIR_append_insn(ctx, func,
                    MIR_new_insn(ctx, MIR_ADD, MIR_new_reg_op(ctx, N), MIR_new_reg_op(ctx, N_1),
                                 MIR_new_reg_op(ctx, N_2)));
    MIR_append_insn(ctx, func, MIR_new_insn(ctx, MIR_JMP, MIR_new_label_op(ctx, out)));
    MIR_append_insn(ctx, func, small);
    MIR_append_insn(ctx, func,
                    MIR_new_insn(ctx, MIR_MOV, MIR_new_reg_op(ctx, N), MIR_new_uint_op(ctx, 1)));
    MIR_append_insn(ctx, func, out);
    MIR_append_insn(ctx, func, MIR_new_ret_insn(ctx, 1, MIR_new_reg_op(ctx, N)));
    MIR_finish_func(ctx);

    MIR_finish_module(ctx);
    MIR_load_module(ctx, m);

    MIR_link(ctx, MIR_set_gen_interface, nullptr);

    void *gen = MIR_gen(ctx, thread, func);
    uint64_t (*fibonacci)(uint64_t arg1) = (uint64_t(*)(uint64_t))gen;

    printf("%lu\n", (unsigned long)fibonacci(45));

    MIR_gen_finish(ctx);

    FILE *f = fopen("generated.c_api.mir", "w");
    MIR_output(ctx, f);
    fclose(f);
    MIR_finish(ctx);
    return 0;
}

and on Linux/x86-64 the disassembly of generated code is:

0x00007ffff7ffb020:  movabs $0x7ffff7ffb030,%r11
0x00007ffff7ffb02a:  jmp    *%r11
...
0x00007ffff7ffb030:  mov    %rbp,-0x8(%rsp)
0x00007ffff7ffb035:  lea    -0x8(%rsp),%rbp
0x00007ffff7ffb03a:  sub    $0x18,%rsp
0x00007ffff7ffb03e:  mov    %rbx,-0x10(%rbp)
0x00007ffff7ffb042:  mov    %r12,-0x8(%rbp)
0x00007ffff7ffb046:  cmp    $0x2,%rdi
0x00007ffff7ffb04a:  jbe    0x7ffff7ffb08b
0x00007ffff7ffb050:  mov    %rdi,%rax
0x00007ffff7ffb053:  sub    $0x1,%rax
0x00007ffff7ffb057:  mov    %rax,%rbx
0x00007ffff7ffb05a:  sub    $0x2,%rdi
0x00007ffff7ffb05e:  mov    %rdi,%r12
0x00007ffff7ffb061:  movabs $0x7ffff7ffb020,%rax
0x00007ffff7ffb06b:  mov    %rbx,%rdi
0x00007ffff7ffb06e:  call   *%rax
0x00007ffff7ffb070:  mov    %rax,%rbx
0x00007ffff7ffb073:  movabs $0x7ffff7ffb020,%rax
0x00007ffff7ffb07d:  mov    %r12,%rdi
0x00007ffff7ffb080:  call   *%rax
0x00007ffff7ffb082:  lea    (%rbx,%rax,1),%rdi
0x00007ffff7ffb086:  jmp    0x7ffff7ffb092
0x00007ffff7ffb08b:  mov    $0x1,%rdi
0x00007ffff7ffb092:  mov    %rdi,%rax
0x00007ffff7ffb095:  mov    -0x10(%rbp),%rbx
0x00007ffff7ffb099:  mov    -0x8(%rbp),%r12
0x00007ffff7ffb09d:  lea    0x8(%rbp),%rsp
0x00007ffff7ffb0a1:  mov    -0x8(%rsp),%rbp
0x00007ffff7ffb0a6:  ret

My question is: the movabs followed by indirect jmp/call through a register are quite inefficient. Is there a reason for them, or they could be optimized to direct jmp/call instructions ?

dibyendumajumdar commented 3 years ago

Hi I believe the indirect call is to allow MIR to support replacing the compiled code

cosmos72 commented 3 years ago

You mean to allow replacing a generated function with a different one, and updating all calls to it? That certainly is a useful feature when needed. But is there also a way to disable it when not needed?

dibyendumajumdar commented 3 years ago

This was a previous comment by Vladimir:

Also all calls are done through small thunks requiring 2 x86-64 insns which might slow down the code too. Usage of the thunks can be used for some applications to change called function code w/o changing a caller code. But may be it is not necessary for some applications and I could make an option to use a direct call. I will think about it.

vnmakarov commented 3 years ago

Massimiliano, thank you for your feedback. Dibyendu correctly wrote my explanation for the current implementation of calls.

I will implement direct calls as it helps c2m performance and people started to express more interest in c2m. I only can not say when it will be done because there are a lot of other important things for MIR project should be done. Also probably this feature will be not in the 1st release of MIR.

As for recursive call optimization (I guess it is a tail call elimination), I am thinking about how to tackle this problem. It is important for MIR to be used for functional language implementation. Implementing this optimization is not an easy task. But I guess I will make MIR to express tail calls at least.

cosmos72 commented 2 years ago

I am actually starting to use MIR as (one of) the backend for a small JIT project. Would you be willing to accept a PR for an additional optimizazion flag that would replace the current indirect call mechanism with direct calls? I am asking because I may actually try to implement it...

vnmakarov commented 2 years ago

If you need just to call one function directly, this featuire will be in the next release. Here is the patch for x86-64 (function returning address for direct call):

diff --git a/mir-x86_64.c b/mir-x86_64.c
index cc6a049..c492324 100644
--- a/mir-x86_64.c
+++ b/mir-x86_64.c
@@ -173,6 +173,20 @@ void _MIR_redirect_thunk (MIR_context_t ctx, void *thunk, void *to) {
   _MIR_update_code (ctx, thunk, 1, 2, to);
 }

+void *_MIR_get_jmpi_thunk (MIR_context_t ctx, void **res_loc, void *res, void *cont) {
+  void *thunk;
+  static const uint8_t pattern[] = {
+    0x49, 0xba, 0,    0, 0, 0, 0, 0, 0, 0, /* 0x0: movabsq 0, r10 */
+    0x49, 0xbb, 0,    0, 0, 0, 0, 0, 0, 0, /* 0xa: movabsq 0, r11 */
+    0x4d, 0x89, 0x1a,                      /* 0x14: movq %r11, (%r10) */
+    0x49, 0xbb, 0,    0, 0, 0, 0, 0, 0, 0, /* 0x17: movabsq 0, r11 */
+    0x41, 0xff, 0xe3,                      /* 0x21: jmpq   *%r11 */
+  };
+  thunk = _MIR_publish_code (ctx, pattern, sizeof (pattern));
+  _MIR_update_code (ctx, thunk, 3, 2, res_loc, 12, res, 25, cont);
+  return thunk;
+}
+
 static const uint8_t save_pat[] = {
 #ifndef _WIN32
   0x48, 0x81, 0xec, 0x80, 0,    0,    0, /*sub    $0x88,%rsp              */

If you want to implement that all functions after resolving call each other directly, it is harder to implement. It will require to store all addresses where calls are done and patch these call sites in the resolution pass. The patching will be really machine-dependent. I am not sure I am going to implement this. In any case the pull requests are welcome for this. I'll consider inclusion them into repository and the future releases.

vnmakarov commented 2 years ago

Sorry, I gave a wrong patch. The actual function to get address for direct call is _MIR_get_thunk_addr and can be found on bbv branch.

cosmos72 commented 2 years ago

Hi @vnmakarov, the function _MIR_get_thunk_addr in branch bbv contains:

void *_MIR_get_thunk_addr (MIR_context_t ctx, void *thunk) {
  void *addr;
  memcpy ((char *) &addr, (char *) thunk + 2, sizeof (addr));
  return addr;
}

which retrieves the address from prefix (thunk) at the beginning of every jit-compiled function (what follows is on x86_64):

   0x00007ffff7fc5020:  movabs $0x7ffff7fc5030,%r11 // actual address will obviously vary - it's usually IP + 16
   0x00007ffff7fc502a:  jmp    *%r11
   // ..

This is a catch 22 as its needs as argument the address of the jit-compiled function, which I have to retrieve first.

So I prepared a very rough patch to directly call functions on x86_64 - you can find it at https://github.com/cosmos72/mir/commit/a57bff254c584a93885aa5c4a8eafb97f018912e - it works in most cases, but crashes ./run-test -gd ./mir-tests/test9.mir because it calls the wrong address:

// ...
Disassembly of section .text:

00007feeed782170 <code>:
    7feeed782170:       48 89 6c 24 f8          mov    %rbp,-0x8(%rsp)
    7feeed782175:       48 8d 6c 24 f8          lea    -0x8(%rsp),%rbp
    7feeed78217a:       48 83 ec 18             sub    $0x18,%rsp
    7feeed78217e:       48 89 5d f0             mov    %rbx,-0x10(%rbp)
    7feeed782182:       48 b9 80 f1 3f 2e 9e    movabs $0x559e2e3ff180,%rcx
    7feeed782189:       55 00 00
    7feeed78218c:       f2 44 0f 10 01          movsd  (%rcx),%xmm8
    7feeed782191:       f2 44 0f 11 45 f8       movsd  %xmm8,-0x8(%rbp)
    7feeed782197:       33 ff                   xor    %edi,%edi
    7feeed782199:       48 c7 c6 2a 00 00 00    mov    $0x2a,%rsi
    7feeed7821a0:       33 c0                   xor    %eax,%eax
    7feeed7821a2:       e8 b9 ff ff ff          call   7feeed782160 <code-0x10>
    7feeed7821a7:       48 c7 c7 01 00 00 00    mov    $0x1,%rdi
    7feeed7821ae:       f2 0f 10 45 f8          movsd  -0x8(%rbp),%xmm0
    7feeed7821b3:       48 c7 c0 01 00 00 00    mov    $0x1,%rax
    7feeed7821ba:       e8 a1 ff ff ff          call   7feeed782160 <code-0x10>
    7feeed7821bf:       33 ff                   xor    %edi,%edi
    7feeed7821c1:       48 c7 c6 2b 00 00 00    mov    $0x2b,%rsi
    7feeed7821c8:       33 c0                   xor    %eax,%eax
    7feeed7821ca:       e8 91 ff ff ff          call   7feeed782160 <code-0x10>
    7feeed7821cf:       48 b8 10 f6 3f 2e 9e    movabs $0x559e2e3ff610,%rax
    7feeed7821d6:       55 00 00
    7feeed7821d9:       f2 0f 10 00             movsd  (%rax),%xmm0
    7feeed7821dd:       48 c7 c7 01 00 00 00    mov    $0x1,%rdi
    7feeed7821e4:       48 c7 c0 01 00 00 00    mov    $0x1,%rax
    7feeed7821eb:       e8 70 ff ff ff          call   7feeed782160 <code-0x10>
    7feeed7821f0:       48 bb 83 6d f8 2c 9e    movabs $0x559e2cf86d83,%rbx
    7feeed7821f7:       55 00 00
    7feeed7821fa:       33 ff                   xor    %edi,%edi
    7feeed7821fc:       ff d3                   call   *%rbx
    7feeed7821fe:       48 8b 5d f0             mov    -0x10(%rbp),%rbx
    7feeed782202:       48 8d 65 08             lea    0x8(%rbp),%rsp
    7feeed782206:       48 8b 6c 24 f8          mov    -0x8(%rsp),%rbp
    7feeed78220b:       c3                      ret
    7feeed78220c:       00 00                   add    %al,(%rax)
        ...
code size = 160:
  Code generation for main: 32 MIR insns (addr=7feeed782170, len=160) -- time 16.69 ms
+++++++++++++The code for main has been already generated
Segmentation fault

Any suggestion where call 7feeed782160 <code-0x10> comes from? The correct code would obviously be call 7feeed782170 <code>

vnmakarov commented 2 years ago

Any suggestion where call 7feeed782160 <code-0x10> comes from? The correct code would obviously be call 7feeed782170 <code>

It might be code (probably a thunk) generated before the current one.

I think that the optimization you are implementing is constrained especially for recursive functions as we need to generate func code first before calling it. I guess it would be better to store all places where functions are called and patch them when all functions are generated (in MIR_link).

Still such optimization is impossible for lazy func code generation. And we can not patch function references during execution as it means possibility of different addresses for one function (which makes comparison of function pointers not working).

vnmakarov / mir

Optimize calls and recursive calls? #153