rochus-keller / EbnfStudio

EbnfStudio can be used to edit and analyze EBNF grammars.

GNU General Public License v2.0

62 stars 10 forks source link

Fix few warnings #2

Open mingodad opened 3 years ago

mingodad commented 3 years ago

When compiling on Ubuntu 18.04 I fixed a few compiler warnings; see below:

------------------------------- EbnfAnalyzer.cpp -------------------------------
index 5bbb7a5..fde2d13 100644
@@ -556,7 +556,7 @@ void EbnfAnalyzer::findAmbiguousAlternatives(Ast::Node* node, FirstFollowSet* se

             // TODO: each alternative might have a different predicate type LL or LA
             // currently just assume everything is ok if an LA predicate is present
-            if( predA && !predA->getLa().isEmpty() || predB && !predB->getLa().isEmpty() )
+            if( (predA && !predA->getLa().isEmpty()) || (predB && !predB->getLa().isEmpty()) )
                 continue;

             if( ll > 0 )

-------------------------------- EbnfErrors.cpp --------------------------------
index 02c27c6..23e15ea 100644
@@ -20,7 +20,7 @@
 #include "EbnfErrors.h"
 #include <QtDebug>

-EbnfErrors::EbnfErrors(QObject *parent) : QObject(parent),d_reportToConsole(false),d_errCounter(0)
+EbnfErrors::EbnfErrors(QObject *parent) : QObject(parent),d_errCounter(0),d_reportToConsole(false)
 {
     d_eventLatency.setSingleShot(true);
     connect(&d_eventLatency, SIGNAL(timeout()), this, SIGNAL(sigChanged()));

-------------------------------- EbnfLexer.cpp --------------------------------
index ccad2e1..67541a4 100644
@@ -24,7 +24,7 @@
 #include <QtDebug>

 EbnfLexer::EbnfLexer(QObject *parent) : QObject(parent),
-    d_lastToken(EbnfToken::Invalid),d_lineNr(0),d_colNr(0),d_in(0)
+    d_in(0), d_lineNr(0),d_colNr(0),d_lastToken(EbnfToken::Invalid)
 {

 }

-------------------------------- EbnfSyntax.cpp --------------------------------
index 121b9e3..3a408f0 100644
@@ -89,7 +89,7 @@ Ast::NodeSet EbnfSyntax::collectNodes(const Ast::NodeRefSet& pattern, const Ast:
     return res;
 }

-EbnfSyntax::EbnfSyntax(EbnfErrors* errs):d_finished(false),d_errs(errs)
+EbnfSyntax::EbnfSyntax(EbnfErrors* errs):d_errs(errs), d_finished(false)
 {

 }

--------------------------------- EbnfSyntax.h ---------------------------------
index 3d88de3..93f5e07 100644
@@ -84,9 +84,9 @@ namespace Ast
         Definition* d_def; // resolved nonterminal
         Node* d_parent; // TODO: ev. unnötig; man kann damit bottom up über Sequence hinweg schauen
         Node(Type t, Definition* d, const EbnfToken& tok = EbnfToken()):Symbol(tok),d_type(t),
-            d_quant(One),d_owner(d),d_def(0),d_parent(0),d_leftRecursive(false){}
+            d_quant(One),d_leftRecursive(false),d_owner(d),d_def(0),d_parent(0){}
         Node(Type t, Node* parent, const EbnfToken& tok = EbnfToken()):Symbol(tok),d_type(t),
-            d_quant(One),d_owner(parent->d_owner),d_def(0),d_parent(parent),d_leftRecursive(false){ parent->d_subs.append(this); }
+            d_quant(One),d_leftRecursive(false),d_owner(parent->d_owner),d_def(0),d_parent(parent){ parent->d_subs.append(this); }
         ~Node();
         bool doIgnore() const;
         bool isNullable() const;

--------------------------------- EbnfToken.h ---------------------------------
index 3ed23f9..de61938 100644
@@ -31,6 +31,7 @@ struct EbnfToken
         Sym(const Sym& rhs ):d_str(rhs.d_str){}
         Sym():d_str(0){}

+        Sym& operator=(const Sym &rhs) = default;
         operator QByteArray() const { return toBa(); }
         QByteArray toBa() const;
         QString toStr() const;
@@ -67,7 +68,7 @@ struct EbnfToken
     quint32 d_lineNr;
     Sym d_val; // utf-8
     EbnfToken(TokenType t = Invalid, quint32 line = 0,quint16 col = 0, quint16 len = 0, const QByteArray& val = QByteArray() ):
-        d_type(t),d_lineNr(line),d_colNr(col),d_len(len),d_op(Normal){ d_val = getSym(val);}
+        d_type(t),d_op(Normal),d_len(len),d_colNr(col),d_lineNr(line){ d_val = getSym(val);}
     QString toString(bool labeled = true) const;
     bool isValid() const { return d_type != Eof && d_type != Invalid; }
     bool isErr() const { return d_type == Invalid; }

--------------------------------- GenUtils.cpp ---------------------------------
index 12af2e4..b92eb99 100644
@@ -160,7 +160,7 @@ static bool lessThan( const QString& lhs, const QString& rhs )
 {
     const bool lhsAlnum = GenUtils::containsAlnum(lhs);
     const bool rhsAlnum = GenUtils::containsAlnum(rhs);
-    if( lhsAlnum && rhsAlnum || !lhsAlnum && !rhsAlnum )
+    if( (lhsAlnum && rhsAlnum) || (!lhsAlnum && !rhsAlnum) )
         return lhs < rhs;
     else
         return !lhsAlnum && rhsAlnum;

--------------------------------- LaParser.cpp ---------------------------------
index cf349c6..48757a0 100644
@@ -282,7 +282,7 @@ LaLexer::Tok LaLexer::nextTokenImp()
     return Tok(Tok::Eof);
 }

-int LaLexer::skipWhiteSpace()
+void LaLexer::skipWhiteSpace()
 {
     while( d_pos < d_str.size() && ::isspace(d_str[d_pos]) )
         d_pos++;

---------------------------------- LaParser.h ----------------------------------
index 3f1df44..c4b97bd 100644
@@ -40,7 +40,7 @@ public:
     Tok peekToken(quint8 lookAhead = 1);
 protected:
     Tok nextTokenImp();
-    int skipWhiteSpace();
+    void skipWhiteSpace();
     Tok ident();
     Tok literal();
     Tok index();

------------------------------- SyntaxTreeMdl.h -------------------------------
index aa84594..9ddce10 100644
@@ -49,7 +49,7 @@ private:
         const Ast::Symbol* d_sym;
         QList<Slot*> d_children;
         Slot* d_parent;
-        Slot(Slot* p = 0):d_parent(p){ if( p ) p->d_children.append(this); }
+        Slot(Slot* p = 0):d_sym(NULL), d_parent(p){ if( p ) p->d_children.append(this); }
         ~Slot() { foreach( Slot* s, d_children ) delete s; }
     };
     void fill(Slot* super, const Ast::Node* sym);

mingodad commented 3 years ago

Also, this prevents segfaults caused by syntax errors when attempting to generate anything from the Generate menu:

-------------------------------- MainWindow.cpp --------------------------------
index ee8156f..5976cc1 100644
@@ -396,10 +396,16 @@ void MainWindow::onExpandSelected()

 void MainWindow::onGenSynTree()
 {
+    const QString title = tr("Generate Syntax Tree");
     ENABLED_IF( !d_edit->getPath().isEmpty() );

     loadTokMap();
-    SynTreeGen::generateTree( d_edit->getPath(), d_edit->getSyntax() );
+    EbnfSyntax* syn = d_edit->getSyntax();
+    if(!syn) {
+        QMessageBox::critical(this,title,tr("Cannot generate syntax tree, fix the issues first !") );
+        return;
+    }
+    SynTreeGen::generateTree( d_edit->getPath(), syn );
 //    QSet<QByteArray> res = EbnfAnalyzer::collectAllTerminalStrings(d_edit->getSyntax());
 //    for( QSet<QByteArray>::const_iterator i = res.begin(); i != res.end(); ++i )
     //        qDebug() << (*i) << SynTreeGen::symToString((*i));
@@ -407,28 +413,45 @@ void MainWindow::onGenSynTree()

 void MainWindow::onGenTt()
 {
+    const QString title = tr("Generate Token Types");
     ENABLED_IF( !d_edit->getPath().isEmpty() );

     loadTokMap();
-    SynTreeGen::generateTt( d_edit->getPath(), d_edit->getSyntax(), true, true );
+    EbnfSyntax* syn = d_edit->getSyntax();
+    if(!syn) {
+        QMessageBox::critical(this,title,tr("Cannot generate token types, fix the issues first !") );
+        return;
+    }
+    SynTreeGen::generateTt( d_edit->getPath(), syn, true, true );
 }

 void MainWindow::onGenHtml()
 {
+    const QString title = tr("Generate Html");
     ENABLED_IF( !d_edit->getPath().isEmpty() );

+    EbnfSyntax* syn = d_edit->getSyntax();
+    if(!syn) {
+        QMessageBox::critical(this,title,tr("Cannot generate html, fix the issues first !") );
+        return;
+    }
     HtmlSyntax gen;
-    gen.generateHtml( d_edit->getPath(), d_edit->getSyntax() );
+    gen.generateHtml( d_edit->getPath(), syn );
 }

 void MainWindow::onGenCoco()
 {
+    const QString title = tr("Generate Coco/R");
     ENABLED_IF( !d_edit->getPath().isEmpty() );

     loadTokMap();
     CocoGen gen;
-    QFileInfo info(d_edit->getPath());
     EbnfSyntax* syn = d_edit->getSyntax();
+    if(!syn) {
+        QMessageBox::critical(this,title,tr("Cannot generate Coco/R, fix the issues first !") );
+        return;
+    }
+    QFileInfo info(d_edit->getPath());
     gen.generate( info.absoluteDir().absoluteFilePath( info.completeBaseName() + ".atg"), syn, d_tbl, true );
     SynTreeGen::generateTt( d_edit->getPath(), syn, true, false );
     SynTreeGen::generateTree( d_edit->getPath(), syn, true );
@@ -436,16 +459,28 @@ void MainWindow::onGenCoco()

 void MainWindow::onGenAntlr()
 {
+    const QString title = tr("Generate Antlr");
     ENABLED_IF( !d_edit->getPath().isEmpty() );
+    EbnfSyntax* syn = d_edit->getSyntax();
+    if(!syn) {
+        QMessageBox::critical(this,title,tr("Cannot generate Antlr, fix the issues first !") );
+        return;
+    }
     QFileInfo info(d_edit->getPath());
-    AntlrGen::generate( info.absoluteDir().absoluteFilePath( info.completeBaseName() + ".g"), d_edit->getSyntax() );
+    AntlrGen::generate( info.absoluteDir().absoluteFilePath( info.completeBaseName() + ".g"), syn );
 }

 void MainWindow::onGenLlgen()
 {
+    const QString title = tr("Generate Llgen");
     ENABLED_IF( !d_edit->getPath().isEmpty() );
+    EbnfSyntax* syn = d_edit->getSyntax();
+    if(!syn) {
+        QMessageBox::critical(this,title,tr("Cannot generate Llgen, fix the issues first !") );
+        return;
+    }
     QFileInfo info(d_edit->getPath());
-    LlgenGen::generate( info.absoluteDir().absoluteFilePath( info.completeBaseName() + ".g"), d_edit->getSyntax(), d_tbl );
+    LlgenGen::generate( info.absoluteDir().absoluteFilePath( info.completeBaseName() + ".g"), syn, d_tbl );
 }

 void MainWindow::onOutputFirstSet()

-------------------------------- SynTreeGen.cpp --------------------------------
index 65d0ea0..b587c8d 100644
@@ -29,6 +29,8 @@ bool SynTreeGen::generateTree(const QString& ebnfPath, EbnfSyntax* syn, bool inc
 {
     Q_ASSERT( syn != 0 );

+    if(!syn)
+        return false;
     const QByteArray nameSpace = syn->getPragmaFirst("%namespace");
     const QByteArray nameSpace2 = nameSpace.isEmpty() ? nameSpace : ( nameSpace + "::" );
     QByteArray module = syn->getPragmaFirst("%module");

rochus-keller commented 3 years ago

Thanks. Looks like rather cosmetic warnings by the compiler which I usually ignore or suppress (e.g. -Wno-reorder). The issues causing a segfault should be fixed of course.

mingodad commented 3 years ago

I removed all suppressing flags to see what's there and then fixed several of them.

mingodad commented 3 years ago

Looking at the EBNF syntax used by this project I noticed that you have some extensions that aren't documented, like this from LjsTools/syntax/LjAsm.ebnf:

desig ::= 
    [ \LL:2\ fname '.' ] // function name to uniquely identify source of upvalue
    vname 
...
From Oberon/syntax/Oberon.ebnf:

ArrayType ::= ( ARRAY

ifdef OBNX

| CARRAY

endif

ifdef BBOX

[ SysFlag ]

endif

ifdef OBN2

[ LengthList ]

else

LengthList

endif

 OF type

ifdef OBNX

 | '[' [ LengthList ] ']' type

endif


RET_ ::= RET [ \LA: 1:ident & 2:!':' \ desig [ posint ] ] // number of returns, leave out if 1

The \LL:k\ prefix is briefly mentioned in the README; could you expand the description of the accepted EBNF syntax and, if possible, add a syntax.ebnf file describing it?

Cheers !

mingodad commented 3 years ago

Is there any reason not to allow nonterminal identifiers to start with '_'?

In my clone I'm allowing it with the change shown below:

-------------------------------- EbnfLexer.cpp --------------------------------
index ccad2e1..9765991 100644
@@ -63,7 +63,7 @@ EbnfToken EbnfLexer::nextTokenImp()
         if( d_colNr == 0 && ch == '#' )
         {
             return ppsym();
-        }else if( ch.isLetterOrNumber() || ch == '$' || ch == '%' )
+        }else if( ch.isLetterOrNumber() || ch == '$' || ch == '%' || ch == '_')
         {
             // Identifier oder Reserved Word
             EbnfToken t = ident();

rochus-keller commented 3 years ago

that you have some extensions that isn't documented

Well, I wouldn't consider the project to be a complete project ready for everyone's use; it's work in progress and I add features as I need them; there are also still bugs in the analyzer which I will debug and fix some day; and of course I also should write some documentation; currently there is no other way than to look in the source code; the syntax of the LA prefix is in LaParser.h.

EDIT: actually the analyzer/generator are not yet as intelligent as they could be; a lot of LA prefixes could be generated automatically from simple LL:k prefixes; some day I will improve it, but until then some of my grammars use rather lengthy LA prefixes as a work around.

There is any reason to not allow nonterminals identifiers to start with '_' ?

I have to check, don't remember. Maybe there could be some ambiguity in generated code. If you don't find any issues with underscore prefixed names, then there is likely no reason to not do so. Please note that I'm only using the Coco/R generator; all other generators are remains from earlier attempts with parser generators and don't even support all syntax features.

mingodad commented 3 years ago

Thank you for reply !

I'm looking to use this project to also generate tree-sitter grammars (see https://github.com/tree-sitter/tree-sitter/issues/1013 and https://github.com/eatkins/tree-sitter-ebnf-generator), and also to export grammars for viewing with https://www.bottlecaps.de/rr/ui .

Would be nice if we could generate a Lua parser with CocoR and with your knowledge of Lua we could have direct evaluation of the grammar on the EBNFStudio.

rochus-keller commented 3 years ago

Welcome. Note that I added an edit to my answer.

this project to also generate tree-sitter grammars

Not sure whether this works without a full redesign, because EbnfStudio is designed for LL(k). Tree sitter does LR which requires completely different analyzers/generators.

rochus-keller commented 3 years ago

Would be nice if we could generate a Lua parser with CocoR

That's actually already implemented. Have a look at https://github.com/rochus-keller/LjTools/blob/master/LuaLexer.cpp and https://github.com/rochus-keller/LjTools/blob/master/LuaParser.cpp.

mingodad commented 3 years ago

I'm looking at it right now and noticed that LjBcViewer.pro references ../LuaJIT/src/LuaJit.pri, which doesn't exist in the standard LuaJIT. Do you have any customization that you forgot to mention?

include( ../LuaJIT/src/LuaJit.pri ){
    LIBS += -ldl
} else {
    LIBS += -lluajit
}

rochus-keller commented 3 years ago

../LuaJIT/src/LuaJit.pri

You can ignore that. The code means: "if you find LuaJit.pri, then use it and link with dl, otherwise ignore it and link with the full luajit so".

Here is the pri file in case you also want to add LuaJIT in source form to the project: https://github.com/rochus-keller/LjTools/issues/1#issuecomment-552196264

mingodad commented 3 years ago

After playing a bit with CocoR I've got a simple/naive converter from the CocoR syntax to the EBNF accepted by https://www.bottlecaps.de/rr/ui and applied it to a slightly edited Lua.atg from LjTools (you can copy and paste the result into the Edit Grammar tab and then switch to the View Diagram tab). I found that you didn't handle operator precedence in that grammar (probably because you don't need to validate the code).

//"--" lf cr  '+' lf  '+' tab
Lua ::= chunk
chunk ::=  ( stat  ( T_Semi  )?  )*  ( laststat  ( T_Semi  )?  )?
block ::= chunk
stat ::= assigOrCall_  | dostat_  | whilestat_  | repeatstat_  | ifstat_  | forstat_  | gfuncdecl_  | localdecl_
dostat_ ::= T_do block T_end
whilestat_ ::= T_while exp T_do block T_end
repeatstat_ ::= T_repeat block T_until exp
ifstat_ ::= T_if exp T_then block  ( T_elseif exp T_then block  )*  ( T_else block  )? T_end
forstat_ ::= T_for T_Name  ( T_Eq exp T_Comma exp  ( T_Comma exp  )?  |  ( T_Comma T_Name  )* T_in explist  ) T_do block T_end
gfuncdecl_ ::= T_function funcname funcbody
localdecl_ ::= T_local  ( lfuncdecl_  | lvardecl_  )
lfuncdecl_ ::= T_function T_Name funcbody
lvardecl_ ::= namelist  ( T_Eq explist  )?
assigOrCall_ ::= prefixexp  ( assignment_  )?
call_ ::=  ( T_Colon T_Name  )? args
assignment_ ::=  ( T_Comma prefixexp  )* T_Eq explist
laststat ::= T_return  ( explist  )?  | T_break
funcname ::= T_Name  ( desig_  )*  ( T_Colon T_Name  )?
namelist ::= T_Name  ( T_Comma T_Name  )*
explist ::= exp  ( T_Comma exp  )*
exp ::= T_nil exp_nlr_  | T_false exp_nlr_  | T_true exp_nlr_  | T_Number exp_nlr_  | T_String exp_nlr_  | T_3Dot exp_nlr_  | lambdecl_ exp_nlr_  | prefixexp exp_nlr_  | tableconstructor exp_nlr_  | unop exp exp_nlr_
exp_nlr_ ::=  ( binop exp exp_nlr_  )?
prefixexp ::=  ( T_Name  | T_Lpar exp T_Rpar  )  ( index_  | desig_  | call_  )*
index_ ::= T_Lbrack exp T_Rbrack
desig_ ::= T_Dot T_Name
args ::= T_Lpar  ( explist  )? T_Rpar  | tableconstructor  | T_String
lambdecl_ ::= T_function funcbody
funcbody ::= T_Lpar  ( parlist  )? T_Rpar block T_end
parlist ::= namelist  ( T_Comma T_3Dot  )?  | T_3Dot
tableconstructor ::= T_Lbrace  ( fieldlist  )? T_Rbrace
fieldlist ::= field  ( fieldsep field  )*  ( fieldsep  )?
field ::= index_ T_Eq exp  | T_Name T_Eq exp  | exp
fieldsep ::= T_Comma  | T_Semi
binop ::= T_Plus  | T_Minus  | T_Star  | T_Slash  | T_Hat  | T_Percent  | T_2Dot  | T_Lt  | T_Leq  | T_Gt  | T_Geq  | T_2Eq  | T_TildeEq  | T_and  | T_or
unop ::= T_Minus  | T_not  | T_Hash

letter ::= "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_"
digit ::= "0123456789"
cr ::= '\r'
lf ::= '\n'
tab ::= '\t'
stringCh ::= ANY  '-' '"'  '-' '\'  '-' cr  '-' lf
charCh ::= ANY  '-' "'"  '-' '\'  '-' cr  '-' lf
printable ::= '\u0020'  .. '\u007e'
hex ::= "0123456789abcdef"
T_Name  ::= letter  ( letter  | digit  )*
T_Number  ::= digit  ( digit  )*  ( '.' digit  ( digit  )*  )?
T_String  ::= '"'  ( stringCh  | '\' printable  )* '"'  | "'"  ( charCh  | '\' printable  )* "'"
badString  ::= '"'  ( stringCh  | '\' printable  )*  ( cr  | lf  )  | "'"  ( charCh  | '\' printable  )*  ( cr  | lf  )
T_Hash  ::= '#'
T_Percent  ::= '%'
T_Lpar  ::= '('
T_Rpar  ::= ')'
T_Star  ::= '*'
T_Plus  ::= '+'
T_Comma  ::= ','
T_Minus  ::= '-'
T_2Minus  ::= "--"
T_2MinusLbrack  ::= "--["
T_Dot  ::= '.'
T_2Dot  ::= ".."
T_3Dot  ::= "..."
T_Slash  ::= '/'
T_Colon  ::= ':'
T_Semi  ::= ';'
T_Lt  ::= '<'
T_Leq  ::= "<="
T_Eq  ::= '='
T_2Eq  ::= "=="
T_Gt  ::= '>'
T_Geq  ::= ">="
T_Lbrack  ::= '['
T_Rbrack  ::= ']'
T_Rbrack2Minus  ::= "]--"
T_Hat  ::= '^'
T_Lbrace  ::= '{'
T_Rbrace  ::= '}'
T_TildeEq  ::= "~="
T_and  ::= "and"
T_break  ::= "break"
T_do  ::= "do"
T_else  ::= "else"
T_elseif  ::= "elseif"
T_end  ::= "end"
T_false  ::= "false"
T_for  ::= "for"
T_function  ::= "function"
T_if  ::= "if"
T_in  ::= "in"
T_local  ::= "local"
T_nil  ::= "nil"
T_not  ::= "not"
T_or  ::= "or"
T_repeat  ::= "repeat"
T_return  ::= "return"
T_then  ::= "then"
T_true  ::= "true"
T_until  ::= "until"
T_while  ::= "while"

rochus-keller commented 3 years ago

I used the original grammar from https://www.lua.org/manual/5.1/manual.html#8 which declares all binary operations in the same production and terms/factors/primaries are all combined in the 'exp' production. That didn't bother me because I don't use it as a compiler frontend. If your goal is to have a tree-sitter parser then it shouldn't bother you either. Otherwise the additional productions could easily be added to the grammar, or alternatively the precedence rules can be handled directly by the parser. There is no grammar suited for every purpose. If you need it to understand the language and draw a syntax diagram you don't have to bother whether it's LL(1). Grammars optimized for a specific parser generator are rarely beautiful.

mingodad commented 3 years ago

Thank you for reply ! I'm not looking at beauty with the railroad diagram, I'm looking for help to visualize/understand the grammar. And I'm also trying to create a validating Lua grammar in CocoR, that's why I did the comment about precedence.

rochus-keller commented 3 years ago

The idea of EbnfStudio is to have a "pure" grammar (i.e. not polluted with implementation specific parser code). That's why all my prefixes are implementation independent. The output of the parser is a non-abstract syntax tree; only this tree is subject to validation and AST construction. If you don't care for a pure grammar or need more powerful features directly in the parser (to save an additional phase and tree) you better directly work with Coco/R (or the more powerful ANTLR); EbnfStudio doesn't help you in this case.

mingodad commented 3 years ago

I just found this variation of CocoR http://cocos-parsergen.sourceforge.net/index.html it seems to achieve a similar goal to your "pure" grammar.

rochus-keller commented 3 years ago

Looks interesting, thanks. The philosophy is a bit different, and I couldn't make use of it because I need C++. Tight integration with an IDE and automatic generation of a generic syntax tree are yet two other features I would miss.

mingodad commented 3 years ago

I've got a conversion from CocoR CSharp to Lua using https://github.com/yanghuan/CSharp.lua and the result is here https://github.com/yanghuan/CSharp.lua/files/6521486/Coco-lua.zip , it needs going through it and fix/simplify several things but overall the big picture is there in Lua.

rochus-keller commented 3 years ago

Cool.

mingodad commented 3 years ago

I made a fix for left recursion detection here https://github.com/SSW-CocoR/CocoR-CPP/pull/2 ; you'll probably be interested in it.

mingodad commented 3 years ago

I added AST generation based on your implementation (see https://github.com/SSW-CocoR/CocoR-CPP/issues/1#issuecomment-854027226) to this repository https://github.com/mingodad/CocoR-CPP ; it would be nice if someone could test it and give feedback.

rochus-keller commented 3 years ago

Do I get this right: you re-engineered the Coco/R code generator so it not only generates a parser but also a syntax tree generator (likely a concrete syntax tree, not an abstract one)? How does this get along with the semantic actions if specified with the grammar?

mingodad commented 3 years ago

Yes, you are right: it's a concrete syntax tree, but we can filter/remove/join nodes with some custom code. It is activated by defining the macro PARSER_WITH_AST; all the info is then stored in an array Parser::ast_root, and it's up to the user to use it or not, independently of the semantic actions (it mimics your SynTree); see the snippet below. We can also compile CocoR and the generated parsers with or without wchar_t by defining (or not) the macro WITHOUT_WCHAR.

I also relaxed the 2-character limit for long comment delimiters, so a quick-and-dirty version of Lua comments can be implemented like this:

/* The order is important code generation for comments start from the last backwards */
COMMENTS FROM "--" TO lf
COMMENTS FROM "--[[" TO "]]" NESTED
COMMENTS FROM "--[=[" TO "]=]" NESTED
COMMENTS FROM "--[==[" TO "]==]" NESTED
COMMENTS FROM "--[===[" TO "]===]" NESTED
COMMENTS FROM "--[====[" TO "]====]" NESTED

And also added a limited semantic action for token declarations like the pragmas and then we can also with a custom Scanner method parse Lua long strings like this:

TK_LONG_STRING = '[' '=' {'='} '[' . (. parseLongString(); .)
...
        case 25:
            case_25:
            {t->kind = 5 /* TK_LONG_STRING */; loopState = false;parseLongString();  break;}
        case 26:
...
// Custom scanner action for the TK_LONG_STRING token: the generated scanner
// calls it once the opening delimiter ('[' '=' {'='} '[') has been matched.
// It keeps consuming input through AddCh() until it sees the closing
// delimiter: ']' followed by exactly as many '=' as the opener had, then ']'.
// If EOF is reached first, the current token's kind is set to eofSym.
// NOTE(review): presumably AddCh() appends `ch` to the token text and reads
// the next character into `ch` — confirm against the Scanner implementation.
void Scanner::parseLongString() {
    int cdelim1, cdelim2;
    // First character of the already-matched token text selects the closer.
    cdelim1 = tval[0];
    switch(cdelim1) {
        case '[': cdelim2 = ']'; break;
        default:
            // Any other opening character is treated as fatal: report and terminate.
            wprintf(L"Unexpected long string delimiter %lc\n", cdelim1);
            exit(1);
    }
    int innerCount = 0;
    // Count the run of '=' that follows the first character of the opener.
    // NOTE(review): this reads tval[innerCount+1], i.e. up to index tlen when
    // every remaining character is '='; it relies on the opener's final '['
    // ending the run before that — confirm tval always holds the full opener.
    for(int imax = tlen;  innerCount < imax; ++innerCount) {
        if(tval[innerCount+1] != L'=') break;
    }
    // Starts at 1 and is only ever decremented below, so the first matching
    // closer ends the string (no nesting of long strings is tracked here).
    int nested = 1;
    //print("==", line, col, innerCount);
    for(;;) {
        if(ch == cdelim2) {
            // Possible start of the closing delimiter: take the ']' and then
            // count the '=' characters that follow it.
            AddCh();
            int eqCount = 0;
            while(ch == L'=') {
                if(++eqCount == innerCount) {
                    // Same number of '=' as in the opener: take this '=' and
                    // check whether the closing ']' comes right after it.
                    AddCh();
                    if(ch == cdelim2 && (--nested == 0)) {
                        AddCh();
                        //print("=0=" + tval.tostring() + "=1=");
                        return; //done
                    }
                }
                // Not a closer (yet): take the current character and keep going.
                // NOTE(review): after a failed closer check this also takes a
                // character that is not '=' (possibly EOF) — confirm AddCh()
                // tolerates being called at EOF.
                AddCh();
            }
            // Re-examine the current character from the top; it may be another ']'.
            continue;
        } else if (ch == EOF) {
            // Unterminated long string: mark the token as end-of-file and stop.
            t->kind = eofSym;
            break;
         }
        // Ordinary string content.
        AddCh();
    }
}

Also I'm experimenting with showing a kind of naive TreeView for LL1 errors/warnings:

LL1 warning in Statement:213:0: TK_STRUCT is start of several alternatives
    => ClassStatement:227:4:
    -> ClassStatement:319:0:
      = TK_CLASS:320:3:
    => CommaExpr:240:4:
    -> CommaExpr:419:0:
      -> Expression:422:0:
        -> LogicalOrExp:429:0:
          -> LogicalAndExp:432:0:
            -> BitwiseOrExp:435:0:
              -> BitwiseXorExp:438:0:
                -> BitwiseAndExp:441:0:
                  -> EqExp:444:0:
                    -> CompExp:447:0:
                      -> ShiftExp:450:0:
                        -> PlusExp:453:0:
                          -> MultExp:456:0:
                            -> PrefixedExpr:459:0:
                              -> Factor:468:0:
                                = TK_CLASS:486:17:

// Start production GMPL: one or more Statement, then an optional
// "end" followed by a semicolon token, then end of input.
// When PARSER_WITH_AST is defined, a root syntax-tree node labelled "GMPL"
// is created first and every consumed terminal is added to the tree.
void Parser::GMPL() {
#ifdef PARSER_WITH_AST
    // Build the root node and reset the node stack so it holds only the root.
    Token *rootTok = new Token();
    rootTok->kind = eNonTerminals::_GMPL;
    rootTok->line = 0;
    rootTok->val = coco_string_create(_SC("GMPL"));
    ast_root = new SynTree( rootTok );
    ast_stack.Clear();
    ast_stack.Add(ast_root);
#endif
    // At least one Statement; continue while the lookahead can start another.
    do {
        Statement();
    } while (StartOf(1 /* nt   */));

    if (la->kind == 50 /* "end" */) {
        Get();
#ifdef PARSER_WITH_AST
        AstAddTerminal();
#endif
        Expect(_T_SEMICOLON);
#ifdef PARSER_WITH_AST
        AstAddTerminal();
#endif
    }

    Expect(_EOF);
#ifdef PARSER_WITH_AST
    AstAddTerminal();
    AstPopNonTerminal();
#endif
}

// Production Statement: dispatches on the lookahead token kind to
// model_statement ("model"), data_statement ("data") or simple_statement
// (any token in start set 2); anything else is reported via SynErr(123).
// When PARSER_WITH_AST is defined, a "Statement" non-terminal node is opened
// first and popped again at the end if it was actually added.
void Parser::Statement() {
#ifdef PARSER_WITH_AST
    bool ntAdded = AstAddNonTerminal(eNonTerminals::_Statement, _SC("Statement"), la->line);
#endif
    switch (la->kind) {
        case 51 /* "model" */:
            model_statement();
            break;
        case 52 /* "data" */:
            data_statement();
            break;
        default:
            if (StartOf(2 /* nt   */)) {
                simple_statement();
            } else {
                SynErr(123);
            }
            break;
    }
#ifdef PARSER_WITH_AST
    if (ntAdded) {
        AstPopNonTerminal();
    }
#endif
}

Can you try it and give feedback ?

rochus-keller commented 3 years ago

Interesting approach; personally, I tend not to change existing libraries as far as possible and instead implement the desired functions through separate components. Handling Unicode was also possible with this approach without modifying Coco/R, by using a lexer that can handle Unicode together with a special token mapping; see e.g. https://github.com/rochus-keller/Simula/.

mingodad commented 3 years ago

I've made some more improvements to CocoR; one of them is dumping a pruned syntax tree based on a simple algorithm, and I'm thinking of applying it, as an option, to the syntax tree construction itself in order to save memory. What do you think, based on your experience?

C++ code at https://github.com/mingodad/CocoR-CPP/blob/master/src/Parser.cpp#L1321

Simple Lua script:

local function fib(n) -- recursive Fibonacci-style function; calls itself twice per non-base step
    if (n < 2) then -- base case: every n below 2 yields 1
    return 1
    else
    return fib(n-2) + fib(n-1) -- sum of the results for the two preceding arguments
    end
end

print(fib(32)) -- script entry point: prints the result for n = 32

Full syntax tree:

2   0   0   Lua
    2   1   1   statlist
        1   1   2   statement
            2   1   9   localstat
                =   1   1   36  local
                3   1   30  localfunc
                    =   1   7   25  function
                    1   1   19  str_checkname
                        =   1   16  1   fib
                    5   1   26  body
                        =   1   19  37  (
                        1   1   29  parlist
                            =   1   20  1   n
                        =   1   21  52  )
                        1   2   1   statlist
                            1   2   2   statement
                                5   2   3   ifstat
                                    =   2   5   31  if
                                    3   2   15  test_then_block
                                        1   2   17  expr
                                            1   2   44  subexpr
                                                1   2   46  simpleexp
                                                    1   2   33  suffixedexp
                                                        3   2   35  primaryexp
                                                            =   2   8   37  (
                                                            1   2   17  expr
                                                                3   2   44  subexpr
                                                                    1   2   46  simpleexp
                                                                        1   2   33  suffixedexp
                                                                            1   2   35  primaryexp
                                                                                1   2   27  singlevar
                                                                                    1   2   19  str_checkname
                                                                                        =   2   9   1   n
                                                                    1   2   47  getbinopr
                                                                        =   2   11  38  <
                                                                    1   2   44  subexpr
                                                                        1   2   46  simpleexp
                                                                            =   2   13  3   2
                                                            =   2   14  52  )
                                        =   2   16  56  then
                                        1   3   1   statlist
                                            1   3   2   statement
                                                2   3   11  retstat
                                                    =   3   2   51  return
                                                    1   3   24  explist
                                                        1   3   17  expr
                                                            1   3   44  subexpr
                                                                1   3   46  simpleexp
                                                                    =   3   9   3   1
                                    =   4   5   18  else
                                    1   5   16  block
                                        1   5   1   statlist
                                            1   5   2   statement
                                                2   5   11  retstat
                                                    =   5   2   51  return
                                                    1   5   24  explist
                                                        1   5   17  expr
                                                            3   5   44  subexpr
                                                                1   5   46  simpleexp
                                                                    2   5   33  suffixedexp
                                                                        1   5   35  primaryexp
                                                                            1   5   27  singlevar
                                                                                1   5   19  str_checkname
                                                                                    =   5   9   1   fib
                                                                        3   5   38  funcargs
                                                                            =   5   12  37  (
                                                                            1   5   24  explist
                                                                                1   5   17  expr
                                                                                    3   5   44  subexpr
                                                                                        1   5   46  simpleexp
                                                                                            1   5   33  suffixedexp
                                                                                                1   5   35  primaryexp
                                                                                                    1   5   27  singlevar
                                                                                                        1   5   19  str_checkname
                                                                                                            =   5   13  1   n
                                                                                        1   5   47  getbinopr
                                                                                            =   5   14  39  -
                                                                                        1   5   44  subexpr
                                                                                            1   5   46  simpleexp
                                                                                                =   5   15  3   2
                                                                            =   5   16  52  )
                                                                1   5   47  getbinopr
                                                                    =   5   18  46  +
                                                                1   5   44  subexpr
                                                                    1   5   46  simpleexp
                                                                        2   5   33  suffixedexp
                                                                            1   5   35  primaryexp
                                                                                1   5   27  singlevar
                                                                                    1   5   19  str_checkname
                                                                                        =   5   20  1   fib
                                                                            3   5   38  funcargs
                                                                                =   5   23  37  (
                                                                                1   5   24  explist
                                                                                    1   5   17  expr
                                                                                        3   5   44  subexpr
                                                                                            1   5   46  simpleexp
                                                                                                1   5   33  suffixedexp
                                                                                                    1   5   35  primaryexp
                                                                                                        1   5   27  singlevar
                                                                                                            1   5   19  str_checkname
                                                                                                                =   5   24  1   n
                                                                                            1   5   47  getbinopr
                                                                                                =   5   25  39  -
                                                                                            1   5   44  subexpr
                                                                                                1   5   46  simpleexp
                                                                                                    =   5   26  3   1
                                                                                =   5   27  52  )
                                    =   6   5   20  end
                        =   7   1   20  end
        1   9   2   statement
            1   9   14  exprstat
                2   9   33  suffixedexp
                    1   9   35  primaryexp
                        1   9   27  singlevar
                            1   9   19  str_checkname
                                =   9   1   1   print
                    3   9   38  funcargs
                        =   9   6   37  (
                        1   9   24  explist
                            1   9   17  expr
                                1   9   44  subexpr
                                    1   9   46  simpleexp
                                        2   9   33  suffixedexp
                                            1   9   35  primaryexp
                                                1   9   27  singlevar
                                                    1   9   19  str_checkname
                                                        =   9   7   1   fib
                                            3   9   38  funcargs
                                                =   9   10  37  (
                                                1   9   24  explist
                                                    1   9   17  expr
                                                        1   9   44  subexpr
                                                            1   9   46  simpleexp
                                                                =   9   11  3   32
                                                =   9   13  52  )
                        =   9   14  52  )
    =   10  1   0

Pruned syntax tree:

2   0   0   Lua
    2   1   1   statlist
        2   1   9   localstat
            =   1   1   36  local
            3   1   30  localfunc
                =   1   7   25  function
                1   1   19  str_checkname
                    =   1   16  1   fib
                5   1   26  body
                    =   1   19  37  (
                    1   1   29  parlist
                        =   1   20  1   n
                    =   1   21  52  )
                    5   2   3   ifstat
                        =   2   5   31  if
                        3   2   15  test_then_block
                            3   2   35  primaryexp
                                =   2   8   37  (
                                3   2   44  subexpr
                                    1   2   19  str_checkname
                                        =   2   9   1   n
                                    1   2   47  getbinopr
                                        =   2   11  38  <
                                    1   2   46  simpleexp
                                        =   2   13  3   2
                                =   2   14  52  )
                            =   2   16  56  then
                            2   3   11  retstat
                                =   3   2   51  return
                                1   3   46  simpleexp
                                    =   3   9   3   1
                        =   4   5   18  else
                        2   5   11  retstat
                            =   5   2   51  return
                            3   5   44  subexpr
                                2   5   33  suffixedexp
                                    1   5   19  str_checkname
                                        =   5   9   1   fib
                                    3   5   38  funcargs
                                        =   5   12  37  (
                                        3   5   44  subexpr
                                            1   5   19  str_checkname
                                                =   5   13  1   n
                                            1   5   47  getbinopr
                                                =   5   14  39  -
                                            1   5   46  simpleexp
                                                =   5   15  3   2
                                        =   5   16  52  )
                                1   5   47  getbinopr
                                    =   5   18  46  +
                                2   5   33  suffixedexp
                                    1   5   19  str_checkname
                                        =   5   20  1   fib
                                    3   5   38  funcargs
                                        =   5   23  37  (
                                        3   5   44  subexpr
                                            1   5   19  str_checkname
                                                =   5   24  1   n
                                            1   5   47  getbinopr
                                                =   5   25  39  -
                                            1   5   46  simpleexp
                                                =   5   26  3   1
                                        =   5   27  52  )
                        =   6   5   20  end
                    =   7   1   20  end
        2   9   33  suffixedexp
            1   9   19  str_checkname
                =   9   1   1   print
            3   9   38  funcargs
                =   9   6   37  (
                2   9   33  suffixedexp
                    1   9   19  str_checkname
                        =   9   7   1   fib
                    3   9   38  funcargs
                        =   9   10  37  (
                        1   9   46  simpleexp
                            =   9   11  3   32
                        =   9   13  52  )
                =   9   14  52  )
    =   10  1   0