soasme / PeppaPEG

PEG Parser in ANSI C
https://soasme.com/PeppaPEG
MIT License
55 stars 7 forks source link

Grammar to parse tree-siter grammars #147

Open mingodad opened 2 years ago

mingodad commented 2 years ago

People writing grammars for https://github.com/tree-sitter/tree-sitter need to use a ugly JavaScript spaghetti to write grammars and convert then to a json that the actual parser uses to do it's job.

I've been looking for a simple way to write the grammars in a style like yacc/bison/peg and then the tool would write the final json for the tree-sitter parser, I think that PeppaPEG would be a nice fit for this job and for that we need read the actual json grammars and convert to an augmented EBNF then people would write/extend on that EBNF and the tool would again write back the json.

A feature like this one would improve the visibility and user base of PeppaPEG.

Example here is a Lua grammar for tree-sitter https://github.com/Azganoth/tree-sitter-lua/blob/master/grammar.js and here the final json https://github.com/Azganoth/tree-sitter-lua/blob/master/src/grammar.json and here an initial EBNF grammar :

%name lua
%extras
    comment
    /[\s\n]/
    ;
%conflicts
    [ _prefix ]
    [ _expression _variable_declarator ]
    [ _expression function_call_statement ]
    [ function_name function_name_field ]
    ;
%externals
    comment
    string
    ;
%inline
    _statement
    ;
program :  _statement*  return_statement?  ;
return_statement :  "return"  (  _expression (  ","  _expression ) *  ) ?  _empty_statement?  ;
_statement : expression |  variable_declaration |  local_variable_declaration |  do_statement |  if_statement |  while_statement |  repeat_statement |  for_statement |  for_in_statement |  goto_statement |  break_statement |  label_statement |  _empty_statement | function | local_function | function_call ;
variable_declaration :  ( variable_declarator (  "," variable_declarator ) *  )  "="  (  _expression (  ","  _expression ) *  )  ;
local_variable_declaration :  "local" variable_declarator (  "="  (  _expression (  ","  _expression ) *  )  ) ?  ;
_variable_declarator :  identifier |  (  _prefix "["  _expression "]"  )  |  field_expression ;
field_expression :  _prefix "." property_identifier ;
_local_variable_declarator :  identifier (  ","  identifier ) *  ;
do_statement :  "do"  _statement*  return_statement?  "end"  ;
if_statement :  "if" condition_expression "then"  _statement*  return_statement?  elseif*  else?  "end"  ;
elseif :  "elseif" condition_expression "then"  _statement*  return_statement?  ;
else :  "else"  _statement*  return_statement?  ;
while_statement :  "while" condition_expression "do"  _statement*  return_statement?  "end"  ;
repeat_statement :  "repeat"  _statement*  return_statement?  "until" condition_expression ;
for_statement :  "for" loop_expression "do"  _statement*  return_statement?  "end"  ;
for_in_statement :  "for" loop_expression "do"  _statement*  return_statement?  "end"  ;
_loop_expression :  identifier "="  _expression ","  _expression (  ","  _expression ) ?  ;
_in_loop_expression :  (  identifier (  ","  identifier ) *  )  "in"  (  _expression (  ","  _expression ) *  )  ;
goto_statement :  "goto"  identifier ;
break_statement :  "break"  ;
label_statement :  "::"  identifier "::"  ;
_empty_statement :  ";"  ;
function_statement :  "function"  function_name _function_body ;
local_function_statement :  "local"  "function"  identifier _function_body ;
function_call_statement :   (  ( (  _prefix arguments )  |  (  _prefix ":" method arguments )  )  )  ;
arguments :  (  "("  (  _expression (  ","  _expression ) *  ) ?  ")"  )  |  table |  string ;
function_name :  ( identifier |  function_name_field )  (  ":" method ) ?  ;
function_name_field :  (  identifier )  (  "." property_identifier ) *  ;
parameters :  "("  (  ( self |  spread |  identifier )  (  ","  identifier ) *  (  ","  spread ) ?  ) ?  ")"  ;
_function_body :  parameters _statement*  return_statement?  "end"  ;
_expression :  spread |  _prefix |  next |  function_definition |  table |  binary_operation |  unary_operation |  string |  number |  nil |  true |  false |  identifier ;
spread :  "..."  ;
self :  "self"  ;
next :  "next"  ;
global_variable :  "_G"  |  "_VERSION"  ;
_prefix :  self |  global_variable |  _variable_declarator |   ( function_call )  |  (  "("  _expression ")"  )  ;
function_definition :  "function"  _function_body ;
table :  "{"  _field_sequence?  "}"  ;
field :  (  "["  _expression "]"  "="  _expression )  |  (  identifier "="  _expression )  |  _expression ;
_field_sequence :   (  (  field (  _field_sep field ) *  _field_sep?  )  )  ;
_field_sep :  ","  |  ";"  ;
binary_operation :   (  (  _expression "or"  _expression )  )  |   (  (  _expression "and"  _expression )  )  |   (  (  _expression "<"  _expression )  )  |   (  (  _expression "<="  _expression )  )  |   (  (  _expression "=="  _expression )  )  |   (  (  _expression "~="  _expression )  )  |   (  (  _expression ">="  _expression )  )  |   (  (  _expression ">"  _expression )  )  |   (  (  _expression "|"  _expression )  )  |   (  (  _expression "~"  _expression )  )  |   (  (  _expression "&"  _expression )  )  |   (  (  _expression "<<"  _expression )  )  |   (  (  _expression ">>"  _expression )  )  |   (  (  _expression "+"  _expression )  )  |   (  (  _expression "-"  _expression )  )  |   (  (  _expression "*"  _expression )  )  |   (  (  _expression "/"  _expression )  )  |   (  (  _expression "//"  _expression )  )  |   (  (  _expression "%"  _expression )  )  |   (  (  _expression ".."  _expression )  )  |   (  (  _expression "^"  _expression )  )  ;
unary_operation :   (  (  ( "not"  |  "#"  |  "-"  |  "~"  )  _expression )  )  ;
number :  (  ( ( (  ( "0"  |  (  "0" ?  /[1-9]/ /[0-9]+/?  )  )  "."  /[0-9]+/?  (  ( "e"  |  "E"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  |  (  "."  /[0-9]+/ (  ( "e"  |  "E"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  |  (  ( "0"  |  (  "0" ?  /[1-9]/ /[0-9]+/?  )  )  (  ( "e"  |  "E"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  )  |  (  ( "0x"  |  "0X"  )  /[a-fA-F0-9]+/ (  "."  /[a-fA-F0-9]+/ ) ?  (  ( "p"  |  "P"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  )  )  ;
nil :  "nil"  ;
true :  "true"  ;
false :  "false"  ;
identifier :  /[a-zA-Z_][a-zA-Z0-9_]*/ ;

Generated by this script (using quickjs https://bellard.org/quickjs/):

import * as std from "std";
import * as os from "os";

//print(typeof process);
//if(typeof std == "undefined") {
//}

const fname_list = [
    "tree-sitter-lua/src/grammar.json",
];

//const fname_base = "A_grammars/tree-sitter/";
const fname_base = "../tree-sitter/";

function parseJsonGrammar(fname)
{
    let fd = std.open(fname_base + fname, "r");
    let json = fd.readAsString();
    fd.close();

    let out_fname = fname.replace(/\/.+/, ".ebnf0");
    //print(out_fname); return;
    //fd = std.open(out_fname, "w");
    fd = std.open("/dev/stdout", "w"); //std.stdout;

    json = JSON.parse(json);
    //print(json);

    fd.printf("%%name %s\n", json.name);
    if(json.word) fd.printf("%%word %s\n", json.word);

    let manageTuples = function(dict) {
        for(var idx in  dict) {
            let value =  dict[idx];
            //print(value);
            switch(value.type) {
                case "SYMBOL":
                    fd.printf("\t%s\n", value.name);
                    break;
                case "PATTERN":
                    fd.printf("\t/%s/\n", value.value);
                    break;
                case "STRING":
                    fd.printf("\t'%s'\n", value.value);
                    break;
                case "TOKEN":
                    fd.printf("\t'%s'\n", value.content);
                    break;
                default:
                    throw("Unmanaged type " + value.type);
            }
        }
    };

    let manageArrays = function(ary) {
        for(var idx in  ary) {
            let value =  ary[idx];
            fd.printf("\t[");
            for(var idx2 in value) {
                fd.printf(" %s", value[idx2]);
            }
            fd.printf(" ]\n");
        }
    };

    if( json.extras && json.extras.length ) {
        fd.printf("%%extras\n");
        manageTuples(json.extras);
        fd.printf("\t;\n");
    }

    if( json.conflicts && json.conflicts.length ) {
        fd.printf("%%conflicts\n");
        manageArrays(json.conflicts);
        fd.printf("\t;\n");
    }

    if( json.externals && json.externals.length ) {
        fd.printf("%%externals\n");
        manageTuples(json.externals);
        fd.printf("\t;\n");
    }

    if( json.precedences  && json.precedences.length) {
        fd.printf("%%precedences\n");
        for(var idx in  json.precedences) {
            let value =  json.precedences[idx];
            fd.printf("\t[");
            for(var idx2 in value) {
                let pval = value[idx2];
                switch(pval.type) {
                    case "SYMBOL":
                        fd.printf(" %s", pval.name);
                        break;
                    case "STRING":
                        fd.printf(" '%s'", pval.value);
                        break;
                    default:
                        throw("Unmanaged type " + pval.type);
                }
            }
            fd.printf(" ]\n");
        }
        fd.printf("\t;\n");
    }

    if( json.inline  && json.inline.length ) {
        fd.printf("%%inline\n");
        for(var idx in  json.inline) {
            let value =  json.inline[idx];
            fd.printf("\t%s\n", value);
        }
        fd.printf("\t;\n");
    }

    if( json.supertypes  && json.supertypes.length ) {
        fd.printf("%%supertypes\n");
        for(var idx in  json.supertypes) {
            let value =  json.supertypes[idx];
            fd.printf("\t%s\n", value);
        }
        fd.printf("\t;\n");
    }

    let manageAlias = function (rules, alias_list) {
        for(var idx in  rules) {
            let rhs = rules[idx2];
            for(var idx2 in  rhs) {

            }
        }
    };
    let alias_list = {};

    let str_list = [];

    let manageRule = function (name, rule, depth) {
        //print(name, rule.type); //, typeof rule);
        switch(rule.type)
        {
            case "ALIAS":
                //fd.printf(" ( ");
                //manageRule(rule.type, rule.content, depth+1);
                //fd.printf(" )@%s ", rule.value);
                fd.printf("%s", rule.value);
                //alias_list[rule.value] = ;
            break;
            case "BLANK":
                print(rule.type);
            break;
            case "CHOICE": {
                let members = rule.members;
                let mcount = members.length;
                let isOptional = mcount == 2 && members[1].type == "BLANK";
                if(isOptional) {
                    //if(mcount > 2 && depth) fd.printf(" (");
                    manageRule(rule.type, members[0], depth+1);
                    //if(mcount > 2 && depth) fd.printf(" )");
                    fd.printf("? ");
                }
                else
                {
                    //fd.printf("=== %d : %d : %d ===", mcount, depth, mcount > 1 && depth);
                    if(mcount > 1 && depth) fd.printf(" (");
                    for(var idx in members) {
                        if(idx > 0) fd.printf(" | ");
                        manageRule(rule.type, members[idx], depth+1);
                    }
                    if(mcount > 1 && depth) fd.printf(" ) ");
                }
            }
            break;
            case "FIELD":
                //print(rule.type, rule.name);
                fd.printf(" ( ");
                manageRule(rule.type, rule.content, depth+1);
                fd.printf(" ) ");
            break;
            case "IMMEDIATE_TOKEN":
                fd.printf(" ( ");
                manageRule(rule.type, rule.content, depth+1);
                fd.printf(" ) ");
            break;
            case "PATTERN": {
                fd.printf(" /%s/", rule.value);
            }
            break;
            case "PREC":
            case "PREC_DYNAMIC":
            case "PREC_LEFT":
            case "PREC_RIGHT":
                fd.printf("  ( ");
                manageRule(rule.type, rule.content, depth+1);
                fd.printf(" ) ");
            break;
            case "REPEAT":
                //if(depth) fd.printf(" ( ");
                manageRule(rule.type, rule.content, depth+1);
                //if(depth) fd.printf(" )");
                fd.printf("* ");
            break;
            case "REPEAT1":
                //if(depth) fd.printf(" (");
                manageRule(rule.type, rule.content, depth+1);
                //if(depth) fd.printf(" )");
                fd.printf("+ ");
            break;
            case "SEQ": {
                let members = rule.members;
                let mcount = members.length;
                //fd.printf("=== %d : %d ===", mcount, depth);
                if(mcount > 1 && depth) fd.printf(" ( ");
                for(var idx in members) {
                    manageRule(rule.type, members[idx], depth+1);
                }
                if(mcount > 1 && depth) fd.printf(" ) ");
            }
            break;

            case "STRING": {
                let value = rule.value;
                //print(rule.type, value);
                switch(value) {
                    case "\0": fd.printf(" '\\0' "); break;
                    case "\b": fd.printf(" '\\b' "); break;
                    case "\f": fd.printf(" '\\f' "); break;
                    case "\n": fd.printf(" '\\n' "); break;
                    case "\r": fd.printf(" '\\r' "); break;
                    case "\t": fd.printf(" '\\t' "); break;
                    case "\v": fd.printf(" '\\v' "); break;
                    case "\\": fd.printf(" '\\\\' "); break;
                    case "'": fd.printf(" \"'\" "); break;
                    case "\"": fd.printf(" '\"' "); break;
                    default:
                        //value = value.replace(/\\/g, "\\\\");
                        value = value.replace(/\t/g, "\\t"); //order matter
                        if(value.indexOf("'") >= 0) fd.printf(" \"%s\" ", value);
                        else fd.printf(" \"%s\" ", value);
                }
            }
            break;

            case "SYMBOL":
                //print(rule.type, rule.name);
                fd.printf(" %s", rule.name);
            break;

            case "TOKEN":
                fd.printf(" ( ");
                manageRule(rule.type, rule.content, depth+1);
                fd.printf(" ) ");
            break;

            default:
                throw("Unknown rule type: " + rule.type);
        }
    }

    let rules = json.rules;
    for(var idx in rules) {
        let rule = rules[idx];
        //print(rule);
        if(rule.type == "xTOKEN") {
            let str = idx + " : " + + " .";
        }
        else {
            fd.printf("%s : ", idx);
            manageRule(idx, rule, 0);
        }
        fd.printf(" ;\n");
    }
    fd.close();
}

for(let idx in fname_list)
{
    let fname = fname_list[idx];
    print(fname);
    parseJsonGrammar(fname);
}