yhirose / cpp-peglib

A single file C++ header-only PEG (Parsing Expression Grammars) library
MIT License
883 stars 112 forks source link

Understanding and customizing --trace #200

Closed mingodad closed 2 years ago

mingodad commented 2 years ago

I'm having trouble fully understanding the trace output. I'm trying to find out why this grammar, http://ingmarschlecht.de/gamsToLatex/, which I converted to the PEG syntax understood by peglib (see below), is not matching the input the way the original does. Using the --trace option gives a long list that's not easy to grasp (I would like something like https://github.com/fasterthanlime/pegviz). I then tried to make the trace output more helpful, as I did a bit here: https://github.com/yhirose/cpp-peglib/issues/194. I looked at the code to try to trim down its output by not showing the definitions marked with ignore (~), but looking through the code in peglib.h I couldn't find how to check whether peg::Ope &ope is marked as ignore so that I can skip it.

Can someone give any hint/help here?

  if (opt_trace || opt_trace2) {
    size_t prev_pos = 0;
    parser.enable_trace(
        [&](const peg::Ope &ope, const char *s, size_t /*n*/,
            const peg::SemanticValues & /*sv*/, const peg::Context &c,
            const std::any & /*dt*/) {
          if(ope.IsMarkedToIgnore) return;  ///!!!!<<< pseudo code that I want here
          auto pos = static_cast<size_t>(s - c.s);

Converted grammar:

 start <-
     gamsCode+

 gamsCode <-
     _ gamsCodePart _

 gamsCodePart <-
     executionStatement / definition / assignment / solveStatement / multiLineComment / dollarCommand / lonelySemicolon

 dollarCommand <-
     inlineComStatement / offInlineComStatement / eolComStatement / offEolComStatement / genericDollarCommand

 genericDollarCommand <-
     "$" [^\n]+

 eolComStatement <-
     "$EOLCOM"i _ [^\n\t ]+ _

 offEolComStatement <-
     "$offeolcom"i [^\n]*

 inlineComStatement <-
     "$inlinecom"i _ [^\n\t ]+ _ [^\n\t ]+

 offInlineComStatement <-
     "$offinlinecom"i [^\n]*

 multiLineComment <-
     "$onecho"i ( ! "$offecho"i . )+ "$offecho"i / "$ontext"i ( ! "$offtext"i . )+ "$offtext"i

 executionStatement <-
     optionStatement / solveStatement / abortStatement / controlStructureStatement / defaultStatement

 abortStatement <-
     _ "abort"i ( _ dollarCondition / whiteSpacePart ) _ defaultArguments? _ ";"

 controlStructureStatement <-
     _ ( "loop"i / "while"i / "if"i ) _ "(" _ set_or_setsInBrackets dollarCondition? _ "," gamsCode* ")" / forStatement / repeatStatement

 repeatStatement <-
     _ "repeat"i _ "(" gamsCode* "," _ set_or_setsInBrackets dollarCondition? ")"

 forStatement <-
     _ "for"i _ "(" _ set_or_setsInBrackets dollarCondition? _ "to"i _ set_or_setsInBrackets _ "," gamsCode* ")"

 defaultStatement <-
     ( "display"i / "execute"i ) whiteSpacePart defaultArguments ";"

 defaultArguments <-
     _ defaultArgument _ ( "," _ defaultArgument _ )* _

 defaultArgument <-
     [^,;]+ _

 solveStatement <-
     _ "solve"i _ literalString _ solveStatement_secondPart ";"

 solveStatement_secondPart <-
     modelTypeStatement _ objectiveStatement? / objectiveStatement _ modelTypeStatement

 modelTypeStatement <-
     ( "using"i / "use"i ) _ literalString _

 objectiveStatement <-
     ( "minimizing"i / "maximizing"i / "min"i / "max"i ) _ literalString _

 lonelySemicolon <-
     _ ";"

 assignment <-
     _ parameter _ "=" _ equationSide ";"

 dollarCondition <-
     "$" _ "(" equationSide dollarCondition? _ ")" _ / "$" functionalPart

 optionStatement <-
     "option"i [^;]+ ";"

 definition <-
     multipleItemsDefinition / aliasDefinition / parameterDefinition / tableDefinition / equationDefinition / assignment

 descriptionString <-
     noSetCharacter+

 noSetCharacter <-
     ! "set:"i [^/\n;|]

 descriptionStringWithPipe <-
     [^/\n;]+

 latexDefinition <-
     "|LaTeX:"i _ ( latexSetDefinition / descriptionString )

 latexSetDefinition <-
     _ "index:"i _ descriptionString _ "set:"i _ descriptionString

 multipleItemsDefinition <-
     multipleItemTypes whiteSpacePart itemDefinition+ _ ";"

 itemDefinition <-
     _ literalString noNewLineWhiteSpace? setsOrNothing? noNewLineWhiteSpace? descriptionString? latexDefinition? descriptionStringWithPipe? _ itemValueDefinition?

 itemValueDefinition <-
     "/" _ [^/]+ _ "/"

 multipleItemTypes <-
     parameterType "s"i?

 parameterDefinition <-
     parameterType whiteSpacePart itemDefinition _ ";"

 parameterType <-
     "set"i / "parameter"i / "variable"i / "binary variable"i / "positive variable"i / "scalar"i / "equation"i / "model"i

 tableDefinition <-
     "table"i _ literalString _ setsOrNothing? _ descriptionString latexDefinition? [^;]+ ";"

 aliasDefinition <-
     "alias"i _ aliasItem ( "," _ aliasItem )* _ ";"

 aliasItem <-
     _ "(" sets ")"

 equationDefinition <-
     literalString setsOrNothing? _ dollarCondition? _ ".." equationSide equationOperator equationSide ";" _

 equationSide <-
     _ logicRelExpression _

 logicRelExpression <-
     _ logicRelOperator _ logicRelExpression / _ numericalRelExpression _ logicRelOperator _ logicRelExpression / numericalRelExpression

 logicRelOperator <-
     "not"i / "and"i / "or"i / "xor"i

 numericalRelExpression <-
     ( _ additiveExpression _ numericalRelOperator _ numericalRelExpression ) / additiveExpression

 numericalRelOperator <-
     "<" / "<=" / "<>" / ">=" / ">" / "gt"i / "lt"i / "eq"i / ! equationOperator "="

 additiveExpression <-
     ( _ multiplicativeExpression? _ [+-] _ additiveExpression ) / multiplicativeExpression

 multiplicativeExpression <-
     ( _ exponentialExpression _ [*/] _ multiplicativeExpression ) / exponentialExpression

 exponentialExpression <-
     ( _ functionalPart _ "**" _ exponentialExpression ) / functionalPart

 functionalPart <-
     primary / function / parameter #/ someString

 primary <-
     float / integer / ( _ "(" _ logicRelExpression _ ")" _ )

 float <-
     ( [0-9]+ "." [0-9]+ )

 someString <-
     [^=]+

 parameter <-
     literalString suffix? _ setsOrNothing? dollarCondition?

 setsOrNothing <-
     "(" sets ")"

 suffix <-
     "." suffixLetters

 suffixLetters <-
     "lo"i / "l"i / "fx"i / "up"i / "m"i / "nd"i / "nz"i / "nr"i / "nw"i / literalString

 sets <-
     set ( "," sets )?

 set <-
     _ "'" [^\']+ "'" _ / _ '"' [^\"]+ '"' _ / _ logicRelExpression _

 function <-
     ( functionName _ "(" set_or_setsInBrackets dollarCondition? "," _ logicRelExpression _ ")" _ dollarCondition? _ )

 functionName <-
     "sum"i / "smax"i / "prod"i / "smin"i

 set_or_setsInBrackets <-
     _ set _ / _ "(" _ sets _ ")" _

 sqrt <-
     ( "sqrt" _ "(" additiveExpression ")" )

 equationOperator <-
     "=e="i / "=g="i / "=l="i

 integer <-
     [0-9]+

 _ <-
     whiteSpacePart*

 whiteSpacePart <-
     comment / [ \n\t\r] / endOfLineComment

 endOfLineComment <-
     [^\n]+

 noNewLineWhiteSpace <-
     [ \t\r]+

 literalString <-
     ( [A-Za-z_] [A-Za-z0-9_]* )

 comment <-
     ( [\n] "*" ) [^\n]*
mingodad commented 2 years ago

Also, including in the README a small description of the elements that appear in the trace would be nice, as was requested here: https://github.com/yhirose/cpp-peglib/pull/88#issuecomment-1115100476 .

mingodad commented 2 years ago

After writing this I went back to peg/leg https://github.com/mingodad/peg because it's simpler and more straightforward (although with fewer features), and with its trace I found the main problem with my converted grammar: a missing &{} predicate that was not converted by my home-made conversion tool.

So replacing:

whiteSpacePart <-
     comment / [ \n\t\r] / endOfLineComment

by:

whiteSpacePart <-
     comment / [ \n\t\r] / comment

With that change I got it to parse the example input, but the questions about how to customize/understand peglib's --trace option remain.

mingodad commented 2 years ago

Could peglib give an output like this one https://github.com/ChrisHixon/chpeg/issues/4#issuecomment-1141284958 ?

yhirose commented 2 years ago

@mingodad, I think it's possible. Could you explain each field more fully?

  id       total      %     success        fail  definition
               1   0.00           1              start
   1          15   0.00          14           1  gamsCode
   2          15   0.00          14           1  gamsCodePart
mingodad commented 2 years ago

Yes, of course!

For each definition/rule in the grammar used to parse an input we have:

id <- is an internal unique number that maps to one grammar definition/rule

total <- is the total number of times the definition/rule was invoked

% <- percentage relative to the total execution: after summing the number of times every definition/rule was invoked, apply the formula (total/sum_total_all_as_double)*100.0

success <- is the number of times the definition/rule was invoked and succeeded

fail <- is the number of times when the definition/rule was invoked and failed

definition <- is the definition/rule name

yhirose commented 2 years ago

peglint now supports the --profile option.

mingodad commented 2 years ago

Thank you! I propose extending it a bit as shown below, so that at the end we get something like:

peglint --profile lua.peglib lpegrex.lua 
lua.peglib:348:1: 'EXTRA_TOKENS' is not referenced.
  id       total      %     success        fail  definition
   0           1   0.00           1           0  chunk
   1           1   0.00           0           1  SHEBANG
   2        4322   4.16        4322           0  SKIP
   3       11449  11.03        4871        6578  SPACE
   4       10837  10.44         670       10167  LINEBREAK
...
  92          11   0.01          11           0  funcname
  93          11   0.01           0          11  ColonIndex
  94           1   0.00           1           0  idlist

          103830              33962       69868  Total counters
                              32.71       67.29  % success/fail

My proposal:

diff --git a/lint/peglint.cc b/lint/peglint.cc
index 4beb940..88edf4b 100644
--- a/lint/peglint.cc
+++ b/lint/peglint.cc
@@ -221,15 +221,28 @@ int main(int argc, const char **argv) {
               std::cout << "  id       total      %     success        fail  "
                            "definition"
                         << std::endl;
+              size_t total_total, total_success = 0, total_fail = 0;
+              char buff[BUFSIZ];
               for (auto &[name, success, fail] : stats) {
-                char buff[BUFSIZ];
                 auto total = success + fail;
+                total_success += success;
+                total_fail += fail;
                 auto ratio = total * 100.0 / stats_item_total;
                 sprintf(buff, "%4zu  %10lu  %5.2f  %10lu  %10lu  %s", id, total,
                         ratio, success, fail, name.c_str());
                 std::cout << buff << std::endl;
                 id++;
               }
+              std::cout << std::endl;
+              total_total = total_success + total_fail;
+              sprintf(buff, "%4s  %10lu  %5s  %10lu  %10lu  %s", "",
+                      total_total, "", total_success, total_fail,
+                      "Total counters");
+              std::cout << buff << std::endl;
+              sprintf(buff, "%4s  %10s  %5s  %10.2f  %10.2f  %s", "", "",
+                      "", total_success*100.0/total_total,
+                      total_fail*100.0/total_total, "% success/fail");
+              std::cout << buff << std::endl;
             }
           }
         },
mingodad commented 2 years ago

Also, if we could somehow have the last 2 lines shown by default on the online playground, it would help fine-tune a grammar. I'm using the total/percentage counters to visualize how any grammar change impacts backtracking/cycles.

mingodad commented 2 years ago

Here is the output shown above but now with --packrat:

peglint --packrat --profile lua.peglib lpegrex.lua 
lua.peglib:348:1: 'EXTRA_TOKENS' is not referenced.
  id       total      %     success        fail  definition
   0           1   0.00           1           0  chunk
   1           1   0.00           0           1  SHEBANG
   2        4090   4.28        4090           0  SKIP
   3       10754  11.25        4649        6105  SPACE
   4        8103   8.47         638        7465  LINEBREAK
...
  93          11   0.01           0          11  ColonIndex
  94           1   0.00           1           0  idlist

           95621              32026       63595  Total counters
                              33.49       66.51  % success/fail
yhirose commented 2 years ago

Is this result good or bad?

Here is the output shown above but now with --packrat:

peglint --packrat --profile lua.peglib lpegrex.lua 
lua.peglib:348:1: 'EXTRA_TOKENS' is not referenced.
  id       total      %     success        fail  definition
   0           1   0.00           1           0  chunk
   1           1   0.00           0           1  SHEBANG
   2        4090   4.28        4090           0  SKIP
   3       10754  11.25        4649        6105  SPACE
   4        8103   8.47         638        7465  LINEBREAK
...
  93          11   0.01           0          11  ColonIndex
  94           1   0.00           1           0  idlist

           95621              32026       63595  Total counters
                              33.49       66.51  % success/fail
mingodad commented 2 years ago

Neither; by comparing the output with and without --packrat we can see how much improvement it brings when parsing this particular input.

          103830              33962       69868  Total counters
                              32.71       67.29  % success/fail
...
           95621              32026       63595  Total counters
                              33.49       66.51  % success/fail
mingodad commented 2 years ago

See this comment https://github.com/ChrisHixon/chpeg/issues/4#issuecomment-1146185093 for a performance comparison between chpeg and peglib using this grammar for peglib:

# From bison 3.8.2 src/parse-gram.y and src/scan-gram.l
# ../../chpeg-dad/examples/chpeg_nocase bison.chpeg ../../bison-dad/src/parse-gram.y
input <-
    sp prologue_declaration* "%%" sp grammar epilogue? YYEOF

prologue_declaration <-
    grammar_declaration
    / PROLOGUE
    / ("%<flag>" / "%locations") sp
    / "%define" sp variable value?
    / "%header" sp string_opt?
    / "%error-verbose" sp
    / "%expect"[-_]"rr" sp INT_LITERAL
    / "%expect" sp INT_LITERAL
    / "%file"[-_]"prefix" sp eqopt STRING
    / "%glr"[-_]"parser" sp
    / "%initial"[-_]"action" sp braces
    / "%language" sp STRING
    / "%name"[-_]"prefix" sp ("=" sp)? STRING
    / "%nondeterministic-parser"
    / "%no"[-_]"lines" sp
    / "%output" sp STRING
    / ("%param" / "%parse"[-_]"param" / "%lex"[-_]"param") sp params
    / "%pure"[-_]"parser" sp
    / "%require" sp STRING
    / "%skeleton" sp STRING
    / "%token"[-_]"table" sp
    / "%verbose" sp
    / "%yacc" sp
    / error sp SEMICOLON
    / SEMICOLON

params <-
    braces+

grammar_declaration <-
    symbol_declaration
    / "%start" sp  symbol+
    / code_props_type braces generic_symlist_item*
    / "%default"[-_]"prec" sp
    / "%no"[-_]"default"[-_]"prec" sp
    / "%code" sp ID? braces
    / "%union" sp union_name? braces

code_props_type <-
    "%destructor" sp
    / "%printer" sp

union_name <-
    ID

symbol_declaration <-
    "%nterm" sp nterm_decls
    / "%token" sp token_decls
    / "%type" sp symbol_decls
    / precedence_declarator token_decls_for_prec

precedence_declarator <-
    "%left" sp
    / "%right" sp
    / "%nonassoc" sp
    / "%precedence" sp

string_opt <-
    STRING

generic_symlist_item <-
    symbol
    / tag

tag_opt <-
    tag

tag <-
    "<" ( "*" / (!">" .)*) ">" sp

nterm_decls <-
    token_decls

token_decls <-
    (tag? token_decl+)+

token_decl <-
    id int_opt? alias?

int_opt <-
    INT_LITERAL sp

alias <-
    string_as_id
    / TSTRING

token_decls_for_prec <-
    (tag? token_decl_for_prec+)+

token_decl_for_prec <-
    id int_opt?
    / string_as_id

symbol_decls <-
    (tag? symbol+)+

grammar <-
    rules_or_grammar_declaration*

rules_or_grammar_declaration <-
    rules
    / grammar_declaration SEMICOLON
    / error SEMICOLON

rules <-
    id_colon named_ref_opt? COLON rhses? SEMICOLON?

rhses <-
    rhs* (PIPE rhs*)*

rhs <-
    symbol named_ref_opt?
    / tag_opt? braces named_ref_opt?
    / "%"? braces
    / "%empty" sp
    / "%prec" sp symbol
    / "%dprec" sp INT_LITERAL
    / "%merge" sp tag
    / "%expect"[-_]"rr" sp INT_LITERAL
    / "%expect" sp INT_LITERAL

named_ref <-
    '[' sp ID ']' sp

named_ref_opt <-
    named_ref !':'

variable <-
    ID

value <-
    ID
    / STRING
    / braces

id <-
    ID
    / CHAR_LITERAL

id_colon <-
    ID &([:] / named_ref &[:])

symbol <-
    id !':'
    / string_as_id

string_as_id <-
    STRING

~epilogue <-
    "%%" .*

YYEOF <-
    !.

#Tokens

letter <-
    [.a-zA-Z_]

ID <-
    <letter (letter / [-0-9])*> sp

int <-
    [0-9]+ sp

xint <-
    '0'[xX][0-9a-fA-F]+ sp

INT_LITERAL <-
    int
    / xint

eol <-
    [\n][\r]?
    / [\r][\n]?

# UTF-8 Encoded Unicode Code Point, from Flex's documentation.
#mbchar  <-  [\x09\x0A\x0D\x20-\x7E] / [\xC2-\xDF][\x80-\xBF] / \xE0[\xA0-\xBF][\x80-\xBF] / [\xE1-\xEC\xEE\xEF]([\x80-\xBF]{2}) / \xED[\x80-\x9F][\x80-\xBF] / \xF0[\x\90-\xBF]([\x80-\xBF]{2}) / [\xF1-\xF3]([\x80-\xBF]{3}) / \xF4[\x80-\x8F]([\x80-\xBF]{2})

# Zero or more instances of backslash-newline.  Following GCC, allow
#   white space between the backslash and the newline.
splice <-
    ('\\'[ \f\t\v]* eol)*

comment <-
    [/] ([/] (!eol .)* eol? / [*] (!"*/" .)* "*/")

~sp <-
    (
    [ \t\n\r] #[[:space:]]*
    / comment
    )*

# An equal sign, with optional leading whitespaces. This is used in some
#   deprecated constructs.
eqopt <-
    (sp EQUAL)?

COLON <-  ":" sp
EQUAL <-  "=" sp
PIPE <- "|" sp
SEMICOLON <-  ";" sp

~PROLOGUE <- "%{" (!"%}" .)* "%}" sp

# Code in between braces.
~braces <-
    "{" sp <braces_body*> sp "}" sp

braces_body <-
    &[{"'] (braces / STRING)
    / ! '}' .

STRING <-
     ( ['] <( ! ( ['] / eol ) char )*> ['] ) sp
    / ( ["] <( ! ( ["] / eol ) char )*> ["] ) sp

TSTRING <-
    "_(" STRING ")" sp

CHAR_LITERAL <-
    STRING

char <-
     ( '\\' [-abefnrtv'"\[\]\\] )
    / ( '\\' 'x' [0-9A-Fa-f] [0-9A-Fa-f] )
    / ( '\\' 'x' [0-9A-Fa-f] )
    / ( '\\' [0-3] [0-7] [0-7] )
    / ( '\\' [0-7] [0-7]? )
    / ( ! '\\' . )

error <-
    "error" sp