csabahruska opened this issue 5 years ago
Yes, for example, here is factorial (taken from bench/factorial-worker-wrapper-optimised.stg) from commit https://github.com/bollu/simplexhc-cpp/tree/a40f6ab299e0f4c9e0b195512422b7ee5855c950:
╭─bollu@strangeattractor ~/work/simplexhc-cpp/bench ‹master›
╰─$ cat factorial-worker-wrapper-optimised.stg
# RUN: %loadSimplexhc %s --jit | FileCheck %s
# CHECK: 120
binding factorial = \(a : PrimInt) -> PrimInt {
  case a () of
    0 -> 1;
    x -> case primSubtract (x 1) of
           xmin1 -> case factorial (xmin1) of
                      factxmin1 -> primMult (x factxmin1);;;
};
binding main = \() -> Boxed {
  case factorial (5) of
    x -> printInt (x);
};
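For reference, here is roughly what the program above means in plain Haskell (an illustrative rendering only; the STG version works on unboxed PrimInt, while this uses ordinary boxed Int):

-- illustrative Haskell analogue of factorial-worker-wrapper-optimised.stg
factorial :: Int -> Int
factorial a = case a of
  0 -> 1
  x -> x * factorial (x - 1)

main :: IO ()
main = print (factorial 5)  -- prints 120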
Execute:
╭─bollu@strangeattractor ~/work/simplexhc-cpp/bench ‹master›
╰─$ ../build/sxhc factorial-worker-wrapper-optimised.stg -O 3 --emit-llvm > fact.ll
and then view the IR:
define fastcc void @main(i8* nocapture readnone) #0 {
entry:
  tail call void @printOnlyInt(i64 120)
  %nReturnFrames.i.i = load i64, i64* @stackReturnTop, align 8, !invariant.group !0
  %haveReturnFrames.i.i = icmp ugt i64 %nReturnFrames.i.i, 1
  br i1 %haveReturnFrames.i.i, label %callnextfn.i.i, label %"case_factorial(atom-5)_alts.exit"

callnextfn.i.i:                                   ; preds = %entry
  %next_cont.i.i = tail call fastcc i8* @popReturn()
  %cont_slot.i.i.i = bitcast i8* %next_cont.i.i to void (i8*)**
  %cont.i.i.i = load void (i8*)*, void (i8*)** %cont_slot.i.i.i, align 8, !invariant.group !0
  tail call fastcc void %cont.i.i.i(i8* %next_cont.i.i)
  br label %"case_factorial(atom-5)_alts.exit"

"case_factorial(atom-5)_alts.exit":               ; preds = %entry, %callnextfn.i.i
  ret void
}
which has been reduced to a constant printInt 120 call, plus the final epilogue cleanup that decides not to continue any further.
This is already a tight loop; LLVM can obviously handle this case. But what I'd like to see is an example where LLVM removes heap-allocated thunks, like GRIN does in the example above. Can LLVM do the GRIN -> Optimized GRIN step? Have you seen any evidence of that?
Can you give me an example STG program (one that does not use a primitive like Fupto, which implicitly encodes the iterations) that you would like to see? For most loopy programs where LLVM can see the loop (by walking through pointer indirections as well, thanks to the invariant.group metadata), it was able to optimise it. I am not sure what kind of program you are interested in.
I suspect we can work together to create a case where GRIN can indeed beat LLVM --- is that of interest to you? It is of interest to me, since it will help me further my research on implementing passes that LLVM needs :)
Ok, I'll write a more specific example in STG. (soon)
I've translated the substract.stg example to GRIN.

substract.stg:
data Int = IntCons (PrimInt);
binding sub = \(int_i: () -> Int, int_j: () -> Int) -> Int {
  case int_i () of
    IntCons (iprim) -> case int_j () of
                         IntCons (jprim) -> case primSubtract (iprim jprim) of
                                              r -> IntCons (r);;;
};
binding three = \() -> Int { IntCons (3) };
binding four = \() -> Int { IntCons (4) };
binding three_sub_four = \() -> Int { sub (three, four) };
binding main = \() -> Boxed {
  case three_sub_four () of
    IntCons (x) -> printInt (x);
};
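For orientation, an approximate Haskell analogue of substract.stg (illustrative only; the names are hypothetical, and GHC's implicit laziness stands in for the explicit () -> Int thunk arguments):

-- hypothetical Haskell analogue of substract.stg
data MyInt = IntCons Int

-- force both lazy arguments and subtract the primitive payloads
sub :: MyInt -> MyInt -> MyInt
sub i j = case i of
  IntCons iprim -> case j of
    IntCons jprim -> IntCons (iprim - jprim)

three, four :: MyInt
three = IntCons 3
four  = IntCons 4

three_sub_four :: MyInt
three_sub_four = sub three four

main :: IO ()
main = case three_sub_four of
  IntCons x -> print x  -- prints -1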
substract.grin:
sub s1 s2 =
  (CInt si1) <- eval s1
  (CInt si2) <- eval s2
  si3 <- _prim_int_sub si1 si2
  pure (CInt si3)

grinMain =
  t1 <- store (CInt 3)
  t2 <- store (CInt 4)
  t3 <- store (Fsub t1 t2)
  (CInt r) <- eval t3
  _prim_int_print r

eval q =
  v <- fetch q
  case v of
    (CInt x1) -> pure v
    (Fsub x2 x3) ->
      x4 <- sub x2 x3
      update q x4
      pure x4
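For readers less familiar with GRIN: eval is the single generated function that forces a heap node, and update overwrites the thunk node with its value. Here is a rough executable model of substract.grin in Haskell, using IORefs as heap cells (a sketch of the semantics only, not of how GRIN is actually implemented):

import Data.IORef

-- Hypothetical model of the GRIN heap nodes used in this example.
data Node = CInt Int | Fsub (IORef Node) (IORef Node)

-- sub: force both arguments, then subtract the payloads.
sub :: IORef Node -> IORef Node -> IO Node
sub s1 s2 = do
  CInt si1 <- eval s1
  CInt si2 <- eval s2
  pure (CInt (si1 - si2))

-- eval: fetch the node; if it is still an Fsub thunk, run the
-- suspended call and update the heap cell with the result.
eval :: IORef Node -> IO Node
eval q = do
  v <- readIORef q
  case v of
    CInt _     -> pure v
    Fsub x2 x3 -> do
      x4 <- sub x2 x3
      writeIORef q x4   -- update q x4
      pure x4

grinMain :: IO ()
grinMain = do
  t1 <- newIORef (CInt 3)      -- store (CInt 3)
  t2 <- newIORef (CInt 4)      -- store (CInt 4)
  t3 <- newIORef (Fsub t1 t2)  -- store (Fsub t1 t2)
  CInt r <- eval t3
  print r                      -- _prim_int_print r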
The GRIN optimizer then simplifies substract.grin to:
grinMain =
  si1.0 <- pure 3
  si2.0 <- pure 4
  si3.0 <- _prim_int_sub $ si1.0 si2.0
  _prim_int_print $ si3.0
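In other words, judging from the output above, once eval is inlined and the now-dead stores, fetches and updates are removed, the program is equivalent to (illustrative Haskell):

main :: IO ()
main = print (3 - 4 :: Int)  -- prints -1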
and from the optimized GRIN, LLVM generates:
grinMain: # @grinMain
.cfi_startproc
# %bb.0: # %grinMain.entry
movq $-1, %rdi
jmp _prim_int_print # TAILCALL
while sxhc substract.stg -O 3 --emit-asm
does not compile down to constant printing; the generated code still contains heap operations:
.text
.file "Module"
.globl init_rawmem_constructor
.p2align 4, 0x90
.type init_rawmem_constructor,@function
init_rawmem_constructor:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
movabsq $10737418240, %rbx
movq %rbx, %rdi
callq malloc
testq %rax, %rax
je .LBB0_2
movq %rax, rawHeapMemory(%rip)
movq $0, rawHeapMemoryTop(%rip)
popq %rbx
.cfi_def_cfa_offset 8
retq
.LBB0_2:
.cfi_def_cfa_offset 16
movl $.L__unnamed_1, %edi
movl $.L__unnamed_2, %esi
movl $.L__unnamed_3, %ecx
movq %rbx, %rdx
xorl %eax, %eax
callq printf
xorl %edi, %edi
callq fflush
ud2
.Lfunc_end0:
.size init_rawmem_constructor, .Lfunc_end0-init_rawmem_constructor
.cfi_endproc
.globl bumpPointerAllocator
.p2align 4, 0x90
.type bumpPointerAllocator,@function
bumpPointerAllocator:
movq rawHeapMemoryTop(%rip), %rax
addq $3, %rdi
andq $-4, %rdi
addq %rax, %rdi
movq %rdi, rawHeapMemoryTop(%rip)
addq rawHeapMemory(%rip), %rax
retq
.Lfunc_end1:
.size bumpPointerAllocator, .Lfunc_end1-bumpPointerAllocator
.globl popInt
.p2align 4, 0x90
.type popInt,@function
popInt:
movq stackIntTop(%rip), %rcx
movq stackInt-8(,%rcx,8), %rax
decq %rcx
movq %rcx, stackIntTop(%rip)
retq
.Lfunc_end2:
.size popInt, .Lfunc_end2-popInt
.globl pushInt
.p2align 4, 0x90
.type pushInt,@function
pushInt:
movq stackIntTop(%rip), %rax
movq %rdi, stackInt(,%rax,8)
incq %rax
movq %rax, stackIntTop(%rip)
retq
.Lfunc_end3:
.size pushInt, .Lfunc_end3-pushInt
.globl popReturn
.p2align 4, 0x90
.type popReturn,@function
popReturn:
movq stackReturnTop(%rip), %rcx
movq stackReturn-8(,%rcx,8), %rax
decq %rcx
movq %rcx, stackReturnTop(%rip)
retq
.Lfunc_end4:
.size popReturn, .Lfunc_end4-popReturn
.globl pushReturn
.p2align 4, 0x90
.type pushReturn,@function
pushReturn:
movq stackReturnTop(%rip), %rax
movq %rdi, stackReturn(,%rax,8)
incq %rax
movq %rax, stackReturnTop(%rip)
retq
.Lfunc_end5:
.size pushReturn, .Lfunc_end5-pushReturn
.globl popBoxed
.p2align 4, 0x90
.type popBoxed,@function
popBoxed:
movq stackBoxedTop(%rip), %rcx
movq stackBoxed-8(,%rcx,8), %rax
decq %rcx
movq %rcx, stackBoxedTop(%rip)
retq
.Lfunc_end6:
.size popBoxed, .Lfunc_end6-popBoxed
.globl pushBoxed
.p2align 4, 0x90
.type pushBoxed,@function
pushBoxed:
movq stackBoxedTop(%rip), %rax
movq %rdi, stackBoxed(,%rax,8)
incq %rax
movq %rax, stackBoxedTop(%rip)
retq
.Lfunc_end7:
.size pushBoxed, .Lfunc_end7-pushBoxed
.p2align 4, 0x90
.type primMultiply,@function
primMultiply:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
callq popInt
movq %rax, %rbx
callq popInt
imulq %rbx, %rax
movq %rax, %rdi
callq pushInt
callq popReturn
movq %rax, %rdi
popq %rbx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.Lfunc_end8:
.size primMultiply, .Lfunc_end8-primMultiply
.cfi_endproc
.p2align 4, 0x90
.type primSubtract,@function
primSubtract:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
callq popInt
movq %rax, %rbx
callq popInt
subq %rax, %rbx
movq %rbx, %rdi
callq pushInt
callq popReturn
movq %rax, %rdi
popq %rbx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.Lfunc_end9:
.size primSubtract, .Lfunc_end9-primSubtract
.cfi_endproc
.p2align 4, 0x90
.type primAdd,@function
primAdd:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
callq popInt
movq %rax, %rbx
callq popInt
leaq (%rax,%rbx), %rdi
callq pushInt
callq popReturn
movq %rax, %rdi
popq %rbx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.Lfunc_end10:
.size primAdd, .Lfunc_end10-primAdd
.cfi_endproc
.p2align 4, 0x90
.type primMult,@function
primMult:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
callq popInt
movq %rax, %rbx
callq popInt
imulq %rbx, %rax
movq %rax, %rdi
callq pushInt
callq popReturn
movq %rax, %rdi
popq %rbx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.Lfunc_end11:
.size primMult, .Lfunc_end11-primMult
.cfi_endproc
.p2align 4, 0x90
.type printIntDynamic,@function
printIntDynamic:
.cfi_startproc
pushq %rax
.cfi_def_cfa_offset 16
callq popInt
movq %rax, %rdi
callq printOnlyInt
cmpq $1, stackReturnTop(%rip)
jbe .LBB12_1
callq popReturn
movq %rax, %rdi
popq %rcx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.LBB12_1:
.cfi_def_cfa_offset 16
popq %rax
.cfi_def_cfa_offset 8
retq
.Lfunc_end12:
.size printIntDynamic, .Lfunc_end12-printIntDynamic
.cfi_endproc
.p2align 4, 0x90
.type sub,@function
sub:
.cfi_startproc
pushq %r14
.cfi_def_cfa_offset 16
pushq %rbx
.cfi_def_cfa_offset 24
pushq %rax
.cfi_def_cfa_offset 32
.cfi_offset %rbx, -24
.cfi_offset %r14, -16
callq popBoxed
movq %rax, %rbx
callq popBoxed
movq %rax, %r14
movl $24, %edi
callq bumpPointerAllocator
movq $"case_alt_int_i()", (%rax)
movq %rbx, 8(%rax)
movq %r14, 16(%rax)
movq %rax, %rdi
callq pushReturn
movq %rbx, %rdi
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %r14
.cfi_def_cfa_offset 8
jmpq *(%rdi)
.Lfunc_end13:
.size sub, .Lfunc_end13-sub
.cfi_endproc
.p2align 4, 0x90
.type three,@function
three:
.cfi_startproc
pushq %rax
.cfi_def_cfa_offset 16
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq $3, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popReturn
movq %rax, %rdi
popq %rcx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.Lfunc_end14:
.size three, .Lfunc_end14-three
.cfi_endproc
.p2align 4, 0x90
.type four,@function
four:
.cfi_startproc
pushq %rax
.cfi_def_cfa_offset 16
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq $4, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popReturn
movq %rax, %rdi
popq %rcx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.Lfunc_end15:
.size four, .Lfunc_end15-four
.cfi_endproc
.p2align 4, 0x90
.type three_sub_four,@function
three_sub_four:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq $3, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popBoxed
movq 8(%rax), %rbx
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq $4, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popBoxed
subq 8(%rax), %rbx
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq %rbx, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popReturn
movq %rax, %rdi
popq %rbx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.Lfunc_end16:
.size three_sub_four, .Lfunc_end16-three_sub_four
.cfi_endproc
.globl main
.p2align 4, 0x90
.type main,@function
main:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq $3, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popBoxed
movq 8(%rax), %rbx
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq $4, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popBoxed
subq 8(%rax), %rbx
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq %rbx, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popBoxed
movq 8(%rax), %rdi
callq printOnlyInt
cmpq $2, stackReturnTop(%rip)
jb .LBB17_1
callq popReturn
movq %rax, %rdi
popq %rbx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.LBB17_1:
.cfi_def_cfa_offset 16
popq %rbx
.cfi_def_cfa_offset 8
retq
.Lfunc_end17:
.size main, .Lfunc_end17-main
.cfi_endproc
.globl mainStatic
.p2align 4, 0x90
.type mainStatic,@function
mainStatic:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq $3, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popBoxed
movq 8(%rax), %rbx
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq $4, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popBoxed
subq 8(%rax), %rbx
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq %rbx, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popBoxed
movq 8(%rax), %rdi
callq printOnlyInt
cmpq $2, stackReturnTop(%rip)
jb .LBB18_1
callq popReturn
movq %rax, %rdi
popq %rbx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.LBB18_1:
.cfi_def_cfa_offset 16
popq %rbx
.cfi_def_cfa_offset 8
retq
.Lfunc_end18:
.size mainStatic, .Lfunc_end18-mainStatic
.cfi_endproc
.p2align 4, 0x90
.type "case_alt_int_i()",@function
"case_alt_int_i()":
.cfi_startproc
pushq %r14
.cfi_def_cfa_offset 16
pushq %rbx
.cfi_def_cfa_offset 24
pushq %rax
.cfi_def_cfa_offset 32
.cfi_offset %rbx, -24
.cfi_offset %r14, -16
movq 16(%rdi), %rbx
callq popBoxed
movq 8(%rax), %r14
movl $24, %edi
callq bumpPointerAllocator
movq $"case_alt_int_j()", (%rax)
movq %rbx, 8(%rax)
movq %r14, 16(%rax)
movq %rax, %rdi
callq pushReturn
movq %rbx, %rdi
addq $8, %rsp
.cfi_def_cfa_offset 24
popq %rbx
.cfi_def_cfa_offset 16
popq %r14
.cfi_def_cfa_offset 8
jmpq *(%rdi)
.Lfunc_end19:
.size "case_alt_int_i()", .Lfunc_end19-"case_alt_int_i()"
.cfi_endproc
.p2align 4, 0x90
.type "case_alt_int_j()",@function
"case_alt_int_j()":
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
movq 16(%rdi), %rbx
callq popBoxed
subq 8(%rax), %rbx
movl $16, %edi
callq bumpPointerAllocator
movq $0, (%rax)
movq %rbx, 8(%rax)
movq %rax, %rdi
callq pushBoxed
callq popReturn
movq %rax, %rdi
popq %rbx
.cfi_def_cfa_offset 8
jmpq *(%rax)
.Lfunc_end20:
.size "case_alt_int_j()", .Lfunc_end20-"case_alt_int_j()"
.cfi_endproc
.type nodebug,@object
.section .rodata,"a",@progbits
.globl nodebug
nodebug:
.byte 1
.size nodebug, 1
.type rawHeapMemory,@object
.bss
.globl rawHeapMemory
.p2align 3
rawHeapMemory:
.quad 0
.size rawHeapMemory, 8
.type rawHeapMemoryTop,@object
.globl rawHeapMemoryTop
.p2align 3
rawHeapMemoryTop:
.quad 0
.size rawHeapMemoryTop, 8
.section .ctors.65535,"aGw",@progbits,rawHeapMemory,comdat
.p2align 3
.quad init_rawmem_constructor
.type .L__unnamed_2,@object
.section .rodata.str1.1,"aMS",@progbits,1
.L__unnamed_2:
.asciz "malloc returned null for heap size: "
.size .L__unnamed_2, 37
.type .L__unnamed_3,@object
.L__unnamed_3:
.asciz "\n"
.size .L__unnamed_3, 2
.type .L__unnamed_1,@object
.L__unnamed_1:
.asciz "%s%ld%s"
.size .L__unnamed_1, 8
.type stackInt,@object
.bss
.globl stackInt
.p2align 4
stackInt:
.zero 8000000
.size stackInt, 8000000
.type stackIntTop,@object
.globl stackIntTop
.p2align 3
stackIntTop:
.quad 0
.size stackIntTop, 8
.type stackReturn,@object
.globl stackReturn
.p2align 4
stackReturn:
.zero 8000000
.size stackReturn, 8000000
.type stackReturnTop,@object
.globl stackReturnTop
.p2align 3
stackReturnTop:
.quad 0
.size stackReturnTop, 8
.type stackBoxed,@object
.globl stackBoxed
.p2align 4
stackBoxed:
.zero 8000000
.size stackBoxed, 8000000
.type stackBoxedTop,@object
.globl stackBoxedTop
.p2align 3
stackBoxedTop:
.quad 0
.size stackBoxedTop, 8
.type closure_primMultiply,@object
.section .rodata,"a",@progbits
.globl closure_primMultiply
.p2align 3
closure_primMultiply:
.quad primMultiply
.size closure_primMultiply, 8
.type closure_primSubtract,@object
.globl closure_primSubtract
.p2align 3
closure_primSubtract:
.quad primSubtract
.size closure_primSubtract, 8
.type closure_primAdd,@object
.globl closure_primAdd
.p2align 3
closure_primAdd:
.quad primAdd
.size closure_primAdd, 8
.type closure_primMult,@object
.globl closure_primMult
.p2align 3
closure_primMult:
.quad primMult
.size closure_primMult, 8
.type closure_printInt,@object
.globl closure_printInt
.p2align 3
closure_printInt:
.quad printIntDynamic
.size closure_printInt, 8
.type sub_closure,@object
.globl sub_closure
.p2align 3
sub_closure:
.quad sub
.size sub_closure, 8
.type three_closure,@object
.globl three_closure
.p2align 3
three_closure:
.quad three
.size three_closure, 8
.type four_closure,@object
.globl four_closure
.p2align 3
four_closure:
.quad four
.size four_closure, 8
.type three_sub_four_closure,@object
.globl three_sub_four_closure
.p2align 3
three_sub_four_closure:
.quad three_sub_four
.size three_sub_four_closure, 8
.type main_closure,@object
.globl main_closure
.p2align 3
main_closure:
.quad main
.size main_closure, 8
.section ".note.GNU-stack","",@progbits
Do you have any example where LLVM with pointer analysis can remove laziness as much as GRIN can?
i.e.