peggyjs / peggy

Peggy: Parser generator for JavaScript
https://peggyjs.org/
MIT License
964 stars 65 forks source link

Or operator seems to work incorrectly #572

Closed DenizBasgoren closed 3 weeks ago

DenizBasgoren commented 3 weeks ago

Here's my parser:

{{
    const TOK_COMMENT = 'comment'
    const TOK_OP = 'operator'
    const TOK_ARG = 'parameter'
    const TOK_NUM = 'number'

    const ARG_IMMED = 'immed'
    const ARG_DIRECT = 'direct'
    const ARG_INDIRECT = 'indirect'

    const OP_ADD = '+'
    const OP_SUBTRACT = '-'
    const OP_COMPARE = '?'
    const OP_RANDOM = '..'

    let t = (range,type) => ({type, start: range().start, end: range().end})
    let er = (range,message) => ({message, start: range().start, end: range().end})
    // let l = console.log
    let l = () => {}

}}

start = line / wrongLine

num =
( '1000'
/ [1-9][0-9][0-9]
/ [1-9][0-9]
/ [0-9]
) { l('num');return {
        tree: parseInt(text()),
        tokens: [t(range,TOK_NUM)],
    }
}

wrongNum =
  num                 { l('wrongNum1');return {
                        tokens: [t(range,TOK_NUM)],
                        diags: []
                      }}
/ [0-9]+              { l('wrongNum2');return {
                        tokens: [t(range,TOK_NUM)],
                        diags: [er(range,'The number must be between 0 and 1000.')]
                      }}
/ '-' _ [0-9]+        { l('wrongNum3');return {
                        tokens: [t(range,TOK_NUM)],
                        diags: [er(range,'Negative numbers are not allowed.')]
                      }}
/ '+' _ [0-9]+        { l('wrongNum4');return {
                        tokens: [t(range,TOK_NUM)],
                        diags: [er(range,'Plus sign after the number is not allowed.')]
                      }}
/ [a-zA-Z0-9_]+        { l('wrongNum5');return {
                        tokens: [t(range,TOK_NUM)],
                        diags: [er(range,'The argument must be in one of these forms: 12, \[12\], \[\[12\]\].')]
                      }}
/ [a-zA-Z0-9_+\-*/]+  { l('wrongNum6');return {
                        tokens: [t(range,TOK_NUM)],
                        diags: [er(range,'The argument must be a number. Expressions are not allowed.')]
                      }}
/ ''                  { l('wrongNum7');return {
                        tokens: [],
                        diags: [er(range,'The argument cannot be left empty, it must be a number between 0 and 1000.')]
                      }}

arg =
  '[' _ '[' _ a:num _ ']' _ ']'     { l('arg1');return {
                                        tree: {type: ARG_INDIRECT, value:a.tree },
                                        tokens: [...a.tokens, t(range,TOK_ARG)]
                                    }}
/ '[' _ a:num _ ']'                 { l('arg2');return {
                                        tree: {type: ARG_DIRECT, value:a.tree },
                                        tokens: [...a.tokens, t(range,TOK_ARG)]
                                    }}
/ a:num                             { l('arg3');return {
                                        tree: {type: ARG_IMMED, value:a.tree },
                                        tokens: [...a.tokens, t(range,TOK_ARG)]
                                    }}

wrongArg =
  a:arg                               { l('wrongArg1');return {
                                            tokens: [...a.tokens],
                                            diags: []
                                          }}
/ t1 _ t1 _ a:num _ t2 _ t2           { l('wrongArg2');return {
                                        tokens: [...a.tokens, t(range,TOK_ARG)],
                                        diags: [er(range,'The argument must be in one of these forms: 12, \[12\], \[\[12\]\].')]
                                      }}
/ t1 _ a:num _ t2                     { l('wrongArg3');return {
                                        tokens: [...a.tokens, t(range,TOK_ARG)],
                                        diags: [er(range,'The argument must be in one of these forms: 12, \[12\], \[\[12\]\].')]
                                      }}
/ '[' _ '[' _ a:wrongNum _ ']' _ ']'  { l('wrongArg4');return {
                                        tokens: [...a.tokens, t(range,TOK_ARG)],
                                        diags: [...a.diags]
                                      }}
/ '[' _ a:wrongNum _ ']'              { l('wrongArg5');return {
                                        tokens: [...a.tokens, t(range,TOK_ARG)],
                                        diags: [...a.diags]
                                      }}
/ a:wrongNum                          { l('wrongArg6');return {
                                        tokens: [...a.tokens],
                                        diags: [...a.diags]
                                      }}

t1 = '(' / '{' / '<'
t2 = ')' / '}' / '>'

op =
  '+'   { l('op1');return {tree: OP_ADD, tokens: [t(range,TOK_OP)]}}
/ '-'   { l('op2');return {tree: OP_SUBTRACT, tokens: [t(range,TOK_OP)]}}
/ '?'   { l('op3');return {tree: OP_COMPARE, tokens: [t(range,TOK_OP)]}}
/ '..'  { l('op4');return {tree: OP_RANDOM, tokens: [t(range,TOK_OP)]}}

wrongOp =
  op                                  { l('wrongOp1');return {
                                            tokens: [t(range,TOK_OP)],
                                            diags: []
                                          }}
/  [\+\-\/\*\^\?\.\,\%\#\$\@\!\=\>\<]+ { l('wrongOp2');return {
                                        tokens: [t(range,TOK_OP)],
                                        diags: [er(range,'Only the following operators are allowed: +, -, ?, ..')]
                                      }}

equals =
  '='

wrongEquals =
  equals        { l('wrongEquals1');return {
                    tokens: [],
                    diags: []
                }}
/  a:wrongOp    { l('wrongEquals2');return {
                  tokens: [...a.tokens],
                  diags: [er(range, 'Equals sign (=) is expected here.')]
                }}
/ [a-zA-Z0-9_]+ { l('wrongEquals3');return {
                  tokens: [t(range,TOK_OP)],
                  diags: [er(range, 'Equals sign (=) is expected here.')]
                }}
/ ''            { l('wrongEquals4');return {
                  tokens: [],
                  diags: [er(range, 'Equals sign (=) is expected here.')]
                }}

inst = a:arg _ equals _ b:arg _ c:op _ d:arg       { l('inst');return {
                                                        tree: {dest:a.tree, arg1:b.tree, op:c.tree, arg2:d.tree},
                                                        tokens: [...a.tokens, ...b.tokens, ...c.tokens, ...d.tokens]
                                                   }}

wrongInst =
a:inst                                                                { l('wrongInst1');return {
                                                                        tokens: [...a.tokens],
                                                                        diags: []
                                                                      }}
/  a:wrongArg _ b:wrongEquals _ c:wrongArg _ d:wrongOp _ e:wrongArg   { l('wrongInst2');return {
                                                                        tokens: [...a.tokens, ...b.tokens, ...c.tokens, ...d.tokens, ...e.tokens],
                                                                        diags: [...a.diags, ...b.diags, ...c.diags, ...d.diags, ...e.diags]
                                                                      }}

comment =
  '//' .*   {l('comment');return {tokens: [t(range,TOK_COMMENT)]}}

wrongComment =
  comment                                         { l('wrongComment1');return {
                                                    tokens: [t(range,TOK_COMMENT)],
                                                    diags: []
                                                  }}
/  ('#' / '%' / '%' / '/' / '\\\\' / '\\' ) .*     { l('wrongComment2');return {
                                                    tokens: [],
                                                    diags: [er(range, 'Comments must start with //')]
                                                  }}
/ .+                                              { l('wrongComment3');return {
                                                    tokens: [],
                                                    diags: [er(range, 'Comments must start with //')]
                                                  }}

line =
  _ a:inst? _ b:comment?                {l('line');return {
                                            tree: a?.tree,
                                            tokens: [...(a?a.tokens:[]), ...(b?b.tokens:[]) ]
                                        }}

wrongLine = 
  _ a:wrongInst? _ b:wrongComment?      { l('wrongLine');return {
                                          tokens: [...(a?a.tokens:[]), ...(b?b.tokens:[]), ],
                                          diags: [...(a?a.diags:[]), ...(b?b.diags:[])]
                                        }}

_ = [\x20\x09]*

My input string is []=[]++[].

Please help me out with this one

hildjj commented 3 weeks ago

Can you cut this example down to be smaller/less complicated in a form that still shows off the issue, please?

DenizBasgoren commented 3 weeks ago

Alright, but just by judging from the results, if a fails and b matches, then a / b should match as well, but what I observe is the opposite...

hildjj commented 3 weeks ago

Oh, never mind. I see your issue.

line =
  _ a:inst? _ b:comment?

This rule matches the empty string.

DenizBasgoren commented 3 weeks ago

yes, an empty string is a correct match for a "line", but that's not the issue here. I provided a specific input above

hildjj commented 3 weeks ago

That rule always matches. The second half of the / will never fire because of that.

hildjj commented 3 weeks ago

Removing the / in start might or might not fix your issue, depending on what behavior you want.

hildjj commented 3 weeks ago

I'd probably go with something like:

start = blankLine / instLine / wrongLine
blankLine = _ b:comment?
instLine = _ a:inst _ b:comment?
hildjj commented 3 weeks ago

Sorry, that has the same problem. Like this then:

blankLine = _ comment? &EOL
EOL = '\n' / EOF
EOF = !.

You could probably keep your current approach and just add that &EOL to your existing line rule.

DenizBasgoren commented 3 weeks ago

Alright, that might be a solution, but the lack of logic here still bugs me. Please try the input []=[]++[] and see for yourself:

start = line here line matches 0 characters, so the overall match fails. that's expected.

start = wrongLine this matches the entire thing, and the overall match succeeds. that's expected as well. the entire string is consumed.

start = line / wrongLine now simply by logic, a / b should match if either of a, b matches. in this case, wrongLine matches, so this should match as well. But what I'm observing is, this fails. Can you explain please what might be the mistake here

Mingun commented 3 weeks ago

The reason because of implicit EOF rule that checks that there is not more input that you should mentally add to all your examples:

start = line EOF; // `line` matched "", `EOF` failed
start = wrongLine EOF; // `wrongLine` matched all possible input, `EOF` matched no (left) input
start = (line / wrongLine) EOF; // !!!! `line` matched "", `EOF` failed

As you can see, the combination of line and wrongLine is not just manual parse of one rule and then another. The start = line failed exactly because of hidden EOF rule which should guarantee that all input was analyzed. The line rule itself is matched as @hildjj already pointed out

hildjj commented 3 weeks ago

start = line here line matches 0 characters, so the overall match fails. that's expected.

Not quite. line matches 0 characters, so the overall match succeeds.

hildjj commented 3 weeks ago

More clarity: the line rule matches. The parser fails because of what mingun said; when there is input left over after the startRule finishes, parsing fails. This is equivalent to there being an implicit EOF rule at the end of your startRule.

DenizBasgoren commented 3 weeks ago

@Mingun 's line of reasoning seems consistent. I think this edge case deserves it's own section or at least a mention in the documentation (if it hasn't already). Thanks @Mingun @hildjj

My solution is to change

line =
  _ a:inst? _ b:comment? 
wrongLine = 
  _ a:wrongInst? _ b:wrongComment?

to

line =
  _ a:inst? _ b:comment? !. 
wrongLine = 
  _ a:wrongInst? _ b:wrongComment? !.