Code generation for unboxed computations

(This issue may be a duplicate of #1783; I am not certain.)

The following code:

let[@inline never] get_one () = #1.

let f () =
  let acc = #0. in
  let one = get_one () in
  let acc = Stdlib__Float_u.add acc one in
  let one = get_one () in
  let acc = Stdlib__Float_u.add acc one in
  let one = get_one () in
  let acc = Stdlib__Float_u.add acc one in
  let one = get_one () in
  let acc = Stdlib__Float_u.add acc one in
  let one = get_one () in
  let acc = Stdlib__Float_u.add acc one in
  let one = get_one () in
  let acc = Stdlib__Float_u.add acc one in
  let one = get_one () in
  let acc = Stdlib__Float_u.add acc one in
  let one = get_one () in
  let acc = Stdlib__Float_u.add acc one in
  acc

results in the following assembly:

        movl    $1, %eax
        call    camlT0__get_one_0_2_code@PLT
.L113:
        movsd   %xmm0, (%rsp)
        movl    $1, %eax
        call    camlT0__get_one_0_2_code@PLT
.L114:
        movsd   %xmm0, 8(%rsp)
        movl    $1, %eax
        call    camlT0__get_one_0_2_code@PLT
.L115:
        movsd   %xmm0, 16(%rsp)
        movl    $1, %eax
        call    camlT0__get_one_0_2_code@PLT
.L116:
        movsd   %xmm0, 24(%rsp)
        movl    $1, %eax
        call    camlT0__get_one_0_2_code@PLT
.L117:
        movsd   %xmm0, 32(%rsp)
        movl    $1, %eax
        call    camlT0__get_one_0_2_code@PLT
.L118:
        movsd   %xmm0, 40(%rsp)
        movl    $1, %eax
        call    camlT0__get_one_0_2_code@PLT
.L119:
        movsd   %xmm0, 48(%rsp)
        movl    $1, %eax
        call    camlT0__get_one_0_2_code@PLT
.L120:
        movapd  %xmm0, %xmm1
        xorpd   %xmm0, %xmm0
        movsd   (%rsp), %xmm2
        addsd   %xmm2, %xmm0
        movsd   8(%rsp), %xmm2
        addsd   %xmm2, %xmm0
        movsd   16(%rsp), %xmm2
        addsd   %xmm2, %xmm0
        movsd   24(%rsp), %xmm2
        addsd   %xmm2, %xmm0
        movsd   32(%rsp), %xmm2
        addsd   %xmm2, %xmm0
        movsd   40(%rsp), %xmm2
        addsd   %xmm2, %xmm0
        movsd   48(%rsp), %xmm2
        addsd   %xmm2, %xmm0
        addsd   %xmm1, %xmm0
        addq    $56, %rsp

This consumes many stack slots because the calls are pushed up (hoisted above the additions), which keeps each result live from its call until the end of the computation, as shown by the Cmm expression:

(function{t0.ml:3,6-587} camlT0__f_1_3_code (param/385: int)
 (let
   (one/386 (app{t0.ml:5,12-22} G:"camlT0__get_one_0_2_code" 1 float)
    one/388 (app{t0.ml:7,12-22} G:"camlT0__get_one_0_2_code" 1 float)
    one/390 (app{t0.ml:9,12-22} G:"camlT0__get_one_0_2_code" 1 float)
    one/392 (app{t0.ml:11,12-22} G:"camlT0__get_one_0_2_code" 1 float)
    one/394 (app{t0.ml:13,12-22} G:"camlT0__get_one_0_2_code" 1 float)
    one/396 (app{t0.ml:15,12-22} G:"camlT0__get_one_0_2_code" 1 float)
    one/398 (app{t0.ml:17,12-22} G:"camlT0__get_one_0_2_code" 1 float)
    one/400 (app{t0.ml:19,12-22} G:"camlT0__get_one_0_2_code" 1 float))
   (+f
     (+f
       (+f
         (+f (+f (+f (+f (+f 0. one/386) one/388) one/390) one/392) one/394)
         one/396)
       one/398)
     one/400)))

The pattern is similar to the one discussed in #1783, even though it does not end up with an allocation.

Callee-saved registers or a tweaked allocation strategy for small leaf functions may help. Layout polymorphism could also help, by making acc a reference (this can be done "manually" by defining a record with a mutable field of the specific layout). As noted in #1783, it might also be possible to tweak to_cmm to avoid pushing the calls up.

ocaml-flambda / flambda-backend

Code generation for unboxed computations #2441