hirrolot / datatype99

Algebraic data types for C99
MIT License
1.38k stars 23 forks source link

Grammar railroad diagram #13

Closed mingodad closed 3 years ago

mingodad commented 3 years ago

With the transformed EBNF shown bellow from the README, we can copy and paste it on https://www.bottlecaps.de/rr/ui in the tab Edit Grammar then switch to the tab View Diagram to see the railroad diagram.

In doing so I noticed that the <arm> nonterminal is not defined.

datatype      ::= "datatype" "(" ( derive_clause "," )? datatype_name ( "," variant )+ ")" 
record        ::= "record" "(" ( derive_clause "," )? record_name ( "," field )* ")" 
datatype_name ::= ident 
record_name   ::= ident 

variant       ::= "(" variant_name ( "," type )* ")" 
field         ::= "(" type "," field_name ")" 
variant_name  ::= ident 
field_name    ::= ident 

derive_clause ::= "derive" "(" deriver_name ( "," deriver_name )* ")" 
deriver_name  ::= ident 

match         ::= "match" "(" lvalue ")" ( arm )+ 
matches       ::= "MATCHES" "(" expr "," ident ")" 
if_let        ::= "ifLet" "(" lvalue "," variant_name "," ident ( "," ident )* ")" stmt 
of            ::= "of" "(" variant_name ( "," ident )* ")" stmt 
otherwise     ::= "otherwise" stmt 
hirrolot commented 3 years ago

Nice catch, thank you. I've fixed it in the above commit.

mingodad commented 3 years ago

Also it seems that the <otherwise> nonterminal is not referenced on <match>, bellow is what I suspect is your intention:

datatype      ::= "datatype" "(" ( derive_clause "," )? datatype_name ( "," variant )+ ")"
record        ::= "record" "(" ( derive_clause "," )? record_name ( "," field )* ")"
datatype_name ::= ident
record_name   ::= ident

variant       ::= "(" variant_name ( "," type )* ")"
field         ::= "(" type "," field_name ")"
variant_name  ::= ident
field_name    ::= ident

derive_clause ::= "derive" "(" deriver_name ( "," deriver_name )* ")"
deriver_name  ::= ident

match         ::= "match" "(" lvalue ")" of+ otherwise?
matches       ::= "MATCHES" "(" expr "," ident ")"
if_let        ::= "ifLet" "(" lvalue "," variant_name "," ident ( "," ident )* ")" stmt
of            ::= "of" "(" variant_name ( "," ident )* ")" stmt
otherwise     ::= "otherwise" stmt
mingodad commented 3 years ago

And here I'm also including the _ for the of: Obs: edited to fix misplacement.

datatype      ::= "datatype" "(" ( derive_clause "," )? datatype_name ( "," variant )+ ")"
record        ::= "record" "(" ( derive_clause "," )? record_name ( "," field )* ")"
datatype_name ::= ident
record_name   ::= ident

variant       ::= "(" variant_name ( "," type )* ")"
field         ::= "(" type "," field_name ")"
variant_name  ::= ident
field_name    ::= ident

derive_clause ::= "derive" "(" deriver_name ( "," deriver_name )* ")"
deriver_name  ::= ident

match         ::= "match" "(" lvalue ")" of+ otherwise?
matches       ::= "MATCHES" "(" expr "," ident ")"
if_let        ::= "ifLet" "(" lvalue "," variant_name "," ident ( "," ident )* ")" stmt
of            ::= "of" "(" variant_name ( "," (ident | "_") )* ")" stmt
otherwise     ::= "otherwise" stmt
mingodad commented 3 years ago

It seems that the _ also apply to if_let, doesn't it ?

hirrolot commented 3 years ago

Yes, it does. _ can be used as a binding anywhere. I'll fix it now.

hirrolot commented 3 years ago

And here I'm also including the _ for the of:

This grammar is ambiguous: <ident> may be _ as well. So we can either define our special <ident> that is not equal to _ or just use an informal description. I think the latter is more convenient.

mingodad commented 3 years ago

Here is another one with a match_ident nonterminal that seems to accomplish the intended usage:

datatype      ::= "datatype" "(" ( derive_clause "," )? datatype_name ( "," variant )+ ")"
record        ::= "record" "(" ( derive_clause "," )? record_name ( "," field )* ")"
datatype_name ::= ident
record_name   ::= ident

variant       ::= "(" variant_name ( "," type )* ")"
field         ::= "(" type "," field_name ")"
variant_name  ::= ident
field_name    ::= ident

derive_clause ::= "derive" "(" deriver_name ( "," deriver_name )* ")"
deriver_name  ::= ident

match         ::= "match" "(" lvalue ")" of+ otherwise?
matches       ::= "MATCHES" "(" expr "," ident ")"
if_let        ::= "ifLet" "(" lvalue "," variant_name "," ( "," match_ident )+ ")" stmt
of            ::= "of" "(" variant_name ( "," match_ident)+ ")" stmt
otherwise     ::= "otherwise" stmt
match_ident ::= ident | "_"
hirrolot commented 3 years ago

Here, match_ident cannot be parsed unambiguously because again it might result in both an identifier and _. I.e., if we see _, we cannot understand whether it is "_" or ident.

mingodad commented 3 years ago

Here is a simple LL1 parser (https://github.com/mingodad/CocoR-CPP, https://github.com/mingodad/CocoR-Java, https://github.com/mingodad/CocoR-CSharp, ...) for the grammar you are describing :

#include "Scanner.nut"

COMPILER Datatype99

CHARACTERS
    letter    = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_".
    oct        = '0'..'7'.
    digit     = '0'..'9'.
    nzdigit    = '1'..'9'.
    cr        = '\r'.
    lf        = '\n'.
    tab       = '\t'.
    stringCh  = ANY - '"' - '\\' - cr - lf.
    charCh    = ANY - '\'' - '\\' - cr - lf.
    longStringCh = ANY - "]]".
    printable = '\u0020' .. '\u007e'.
    hex        = digit + 'a'..'f' + 'A'..'F'.

TOKENS
    ident     = letter { letter | digit }.

    TOK_NUMBER = ( '.' digit {digit} [('e'|'E')  ['+'|'-'] digit {digit}]
             | digit {digit} '.' {digit} [('e'|'E')  ['+'|'-'] digit {digit}]
             | digit {digit} [('e'|'E')  ['+'|'-'] digit {digit}]
             )
             .
/*
    TK_FLT = ( '.' digit {digit}
             | digit {digit} ['.' {digit}]
             )  [('e'|'E')  ['+'|'-'] digit {digit}]
             .
    TK_INT   = ( /*nz*/ digit {digit}
             //| '0' {oct}
             | ("0x"|"0X") hex {hex}
             )
             .
*/

    TOK_STRING    = '"' {stringCh |  '\\' (printable | ['\r'] '\n')} '"'
            | "'" { charCh | '\\' (printable | ['\r'] '\n')} "'"
            | "[[" {longStringCh} "]]"
            .

    badString = '"' {stringCh | '\\' printable} (cr | lf)
            | "'" {charCh | '\\' printable } (cr | lf)
            .

COMMENTS FROM "/*" TO "*/" NESTED
COMMENTS FROM "//" TO lf

IGNORE cr + lf + tab

PRODUCTIONS

Datatype99 =
    {declarations} EOF
    .

declarations =
    datatype
    | record
    | matches
    | match
    | if_let
    .

datatype =
    "datatype" "(" [ derive_clause "," ] datatype_name comma_variant {comma_variant} ")" ";"
    .

comma_variant =
    "," variant
    .

record =
    "record" "(" [ derive_clause "," ] record_name { "," field } ")" ";"
    .

variant =
    "(" variant_name { "," type } ")"
    .

field =
    "(" type "," field_name ")"
    .

derive_clause =
    "derive" "(" deriver_name { "," deriver_name } ")"
    .

match =
    "match" "(" lvalue ")" "{" {of} [otherwise] "}"
    .

matches =
    "MATCHES" "(" expr "," ident ")"
    .

if_let =
    "ifLet" "(" lvalue "," variant_name comma_match_ident {comma_match_ident} ")" (stmt | ";")
    .

of =
    "of" "(" variant_name {comma_match_ident} ")" stmt
    .

comma_match_ident =
    "," match_ident
    .

otherwise =
    "otherwise" stmt
    .

match_ident =
    ident | "_"
    .

type =
    ident {ident} {"*"}
    .

datatype_name =
    ident
    .

record_name =
    ident
    .

variant_name =
    ident
    .

field_name =
    ident
    .

deriver_name =
    ident
    .

lvalue =
    {"*"} ident
    .

expr =
    {"*"} ident
    .

stmt =
    ident (
        {ident}
        | "(" {ANY} ")"
        | "=" (TOK_NUMBER | ident)
        ) ';'
    | "return" ANY {ANY} ";"
    | match
    | if_let
    | "{" {stmt} "}"
    .

END Datatype99.

And here is the generated EBNF that can be viewed at https://www.bottlecaps.de/rr/ui :

//
// EBNF generated by CocoR parser generator to be viewed with https://www.bottlecaps.de/rr/ui
//

//
// productions
//

Datatype99 ::= ( declarations )* EOF 
declarations ::= ( datatype | record | matches | match | if_let ) 
datatype ::= "datatype" "(" ( derive_clause "," )? datatype_name comma_variant ( comma_variant )* ")" ";" 
record ::= "record" "(" ( derive_clause "," )? record_name ( "," field )* ")" ";" 
matches ::= "MATCHES" "(" expr "," ident ")" 
match ::= "match" "(" lvalue ")" "{" ( of )* ( otherwise )? "}" 
if_let ::= "ifLet" "(" lvalue "," variant_name comma_match_ident ( comma_match_ident )* ")" ( stmt | ";" ) 
derive_clause ::= "derive" "(" deriver_name ( "," deriver_name )* ")" 
datatype_name ::= ident 
comma_variant ::= "," variant 
variant ::= "(" variant_name ( "," type )* ")" 
record_name ::= ident 
field ::= "(" type "," field_name ")" 
variant_name ::= ident 
type ::= ident ( ident )* ( "*" )* 
field_name ::= ident 
deriver_name ::= ident 
lvalue ::= ( "*" )* ident 
of ::= "of" "(" variant_name ( comma_match_ident )* ")" stmt 
otherwise ::= "otherwise" stmt 
expr ::= ( "*" )* ident 
comma_match_ident ::= "," match_ident 
stmt ::= ( ident ( ( ident )* | "(" ( ANY )* ")" | "=" ( TOK_NUMBER | ident ) ) ";" | "return" ANY ( ANY )* ";" | match | if_let | "{" ( stmt )* "}" ) 
match_ident ::= ( ident | "_" ) 

//
// tokens
//

And here is the test file:

// clang-format off
datatype(
    derive(Menu),
    UserCommand,
    (SendMessage, MessageContent, UserId),
    (SubscribeToChannel, ChannelId),
    (DeleteAccount)
);
// clang-format on

// clang-format off
datatype(
    derive(Metadata),
    Num,
    (Char, char),
    (Int, int),
    (Double, double)
);
// clang-format on

// clang-format off
datatype(
    derive(Print),
    MyType,
    (Foo, const char *),
    (Bar, int, int)
);
// clang-format on

// clang-format off
record(
    derive(Metadata),
    Point,
    (int, x),
    (int, y)
);
// clang-format on

        ifLet(expr, C, str, x) {
            assert(str == &expr.data.C._0);
            assert(x == &expr.data.C._1);
            goto end_if_let;
        }

    ifLet(expr, B, _) FAIL;

        ifLet(b, B, _);
        ifLet(b, B, _);
     ifLet(expr, B, _) {}

datatype(
    Foo,
    (MkFoo, MyArray)
);

// clang-format off
datatype(
    Expr,
    (Const, double),
    (Add, Expr *, Expr *),
    (Sub, Expr *, Expr *),
    (Mul, Expr *, Expr *),
    (Div, Expr *, Expr *)
);
// clang-format on

    match(*expr) {
        of(Const, number) return *number;
        of(Add, lhs, rhs) return eval(*lhs) + eval(*rhs);
        of(Sub, lhs, rhs) return eval(*lhs) - eval(*rhs);
        of(Mul, lhs, rhs) return eval(*lhs) * eval(*rhs);
        of(Div, lhs, rhs) return eval(*lhs) / eval(*rhs);
    }

// clang-format off
datatype(
    BinaryTree,
    (Leaf, int),
    (Node, BinaryTree *, int, BinaryTree *)
);
// clang-format on

    match(*expr) {
        of(Const, number) return *number;
        of(Add, lhs, rhs) return eval(*lhs) + eval(*rhs);
        of(Sub, lhs, rhs) return eval(*lhs) - eval(*rhs);
        of(Mul, lhs, rhs) return eval(*lhs) * eval(*rhs);
        of(Div, lhs, rhs) return eval(*lhs) / eval(*rhs);
    }

    match(*tree) {
        of(Leaf, x) {
            free(tree);
            return;
        }
        of(Node, lhs, x, rhs) {
            destroy_tree(*lhs);
            destroy_tree(*rhs);
            free(tree);
        }
    }

    match(*tree) {
        of(Leaf, x) return *x;
        of(Node, lhs, x, rhs) return sum(*lhs) + *x + sum(*rhs);
    }

// clang-format off
datatype(
    Token,
    (Ident, const char *),
    (Int, int),
    (LParen),
    (RParen),
    (Plus)
);
// clang-format on

    match(token) {
        of(Ident, ident) printf("%s", *ident);
        of(Int, x) printf("%d", *x);
        of(LParen) printf("(");
        of(RParen) printf(")");
        of(Plus) printf(" + ");
    }

datatype(Trivial1, (Trivial1A));
datatype(Trivial2, (Trivial2A));
datatype(Trivial3, (Trivial3A), (Trivial3B), (Trivial3C));

// clang-format off
datatype(
    Complex,
    (A),
    (B, int),
    (C, const char *, long double),
    (D, char, unsigned, long long, int *)
);
// clang-format on

MATCHES(a, A)

    match(a) {
        otherwise {
            foo = 123;
        }
    }

    // Test a nested `match`.
    match(a) {
        of(A) {
            match(b) {
                of(B) {
                    foo = 34;
                }
                otherwise assert(false);
            }
        }
        otherwise assert(false);
    }

        match(expr) {
            of(A) {
                assert(false);
            }
            of(B, _) {
                assert(false);
            }
            of(C, _, x) {
                assert(124.1404 == *x);
            }
            of(D, c, _, _, ptr) {
            }
        }