yhirose / cpp-peglib

A single file C++ header-only PEG (Parsing Expression Grammars) library
MIT License
900 stars 112 forks source link

Help with peglib grammar #191

Closed mingodad closed 2 years ago

mingodad commented 2 years ago

I'm trying to manually extract the peglib grammar (see below) and I've got a grammar that the online playground says is valid, but when I try to parse it with itself I get an error recognizing a charset class.

Is there a working peglib grammar somewhere (other than the one programmed in peglib.h)?

The error when trying to parse it with itself:

40:28 syntax error, unexpected '-', expecting <lit_ope>, <LiteralI>, <IdentRest>, <IdentStart>.

The manually extracted peglib grammar:

# Setup PEG syntax parser

Grammar <- Spacing  Definition+ EndOfFile

Definition <-
    Ignore  IdentCont Parameters LEFTARROW  Expression Instruction?
    / Ignore Identifier LEFTARROW Expression Instruction?

Expression <- Sequence (SLASH Sequence)*

Sequence <- (CUT / Prefix)*

Prefix <- (AND / NOT)? SuffixWithLabel

SuffixWithLabel <- Suffix (LABEL Identifier)?

Suffix <- Primary Loop?

Loop <- QUESTION / STAR / PLUS / Repetition

Primary <-
    Ignore IdentCont Arguments !LEFTARROW
    / Ignore Identifier !(Parameters? LEFTARROW)
    / OPEN Expression CLOSE
    / BeginTok Expression EndTok
    / BeginCapScope Expression EndCapScope
    / BeginCap Expression EndCap
    / BackRef
    / LiteralI
    / Dictionary
    / Literal
    / NegatedClass
    / Class
    / DOT

Identifier <- IdentCont Spacing

IdentCont <- IdentStart IdentRest*

IdentStart <- !"↑" !"⇑" ([a-zA-Z_%] / [\x0080-\xFFFF])

IdentRest <- IdentStart / [0-9]

Dictionary <- LiteralD (PIPE LiteralD)+

lit_ope <-
    ['] <(!['] Char)*>  ['] Spacing
    / ["] <(!["] Char)*> ["] Spacing

Literal <- lit_ope

LiteralD <- lit_ope

LiteralI <-
    ['] <(!['] Char)*> "'i" Spacing
    / ["] <(!["] Char)*> '"i' Spacing

# NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'.
Class <- '[' !'^' <(!']' Range)+> ']' Spacing

NegatedClass <- "[^" <(!']' Range)+> ']' Spacing

Range <- (Char '-' !']' Char) / Char

Char <-
    '\\' [nrt'"[\]\^]
    / '\\' [0-3] [0-7] [0-7]
    / '\\' [0-7] [0-7]?
    / "\\x" [0-9a-fA-F] [0-9a-fA-F]?
    / "\\u" (('0' [0-9a-fA-F]) / "10") [0-9a-fA-F]{4, 4} / [0-9a-fA-F]{4, 5}
    / !'\\' .

Repetition <- BeginBlacket RepetitionRange EndBlacket

RepetitionRange <-
    Number COMMA Number
    / Number COMMA
    / Number
    / COMMA Number

Number <- [0-9]+ Spacing

LEFTARROW <- ("<-" / "←") Spacing

~SLASH <- '/' Spacing
~PIPE <- '|' Spacing
AND <- '&' Spacing
NOT <- '!' Spacing
QUESTION <- '?' Spacing
STAR <- '*' Spacing
PLUS <- '+' Spacing
~OPEN <- '(' Spacing
~CLOSE <- ')' Spacing
DOT <- '.' Spacing

CUT <- "↑" Spacing
~LABEL <- ('^' / "⇑") Spacing

~Spacing <- (Space / Comment)*
Comment <- '#' (!EndOfLine .)* EndOfLine
Space <- ' ' / '\t' / EndOfLine
EndOfLine <- "\r\n" / '\n' / '\r'
EndOfFile <- !.

~BeginTok <- '<' Spacing
~EndTok <- '>' Spacing

~BeginCapScope <- '$' '(' Spacing
~EndCapScope <- ')' Spacing

BeginCap <- '$' <IdentCont> '<' Spacing
~EndCap <- '>' Spacing

BackRef <- '$' <IdentCont> Spacing

IGNORE <- '~'

Ignore <- IGNORE?
Parameters <- OPEN Identifier (COMMA Identifier)* CLOSE
Arguments <- OPEN Expression (COMMA Expression)* CLOSE
~COMMA <- ',' Spacing

# Instruction grammars
Instruction <-
    BeginBlacket (InstructionItem (InstructionItemSeparator InstructionItem)*)? EndBlacket

InstructionItem <- PrecedenceClimbing / ErrorMessage / NoAstOpt
~InstructionItemSeparator <- ';' Spacing

~SpacesZom <- Space*
~SpacesOom <- Space+
~BeginBlacket <- '{' Spacing
~EndBlacket <- '}' Spacing

# PrecedenceClimbing instruction
PrecedenceClimbing <- "precedence" SpacesOom PrecedenceInfo (SpacesOom PrecedenceInfo)* SpacesZom

PrecedenceInfo <- PrecedenceAssoc (~SpacesOom PrecedenceOpe)+

PrecedenceOpe <-
    ['] <(!(Space / [']) Char)*> [']
    / ["] <(!(Space / ["]) Char)*> ["]
    / <(!(PrecedenceAssoc / Space / '}') . )+>

PrecedenceAssoc <- [LR]

# Error message instruction
ErrorMessage <- "message" SpacesOom LiteralD SpacesZom

# No Ast node optimization instruction
NoAstOpt <- "no_ast_opt" SpacesZom
mingodad commented 2 years ago

If I move Class and NegatedClass to the top of Primary, then I can parse https://github.com/yhirose/culebra/blob/master/misc/culebra.peg (after adding a newline at the end of it), but when trying to parse the grammar with itself I then get:

68:4 syntax error, unexpected '\', expecting <Char>.
mingodad commented 2 years ago

In the end I want to generate EBNF from any grammar, including the peglib grammar itself, something like what I did here https://github.com/mingodad/lalr-parser-test for bison/byacc/lemon and here https://github.com/mingodad/peg .

Ideally I would iterate over the ordered list of rules and output them almost as they were written, only changing <- to ::= and / to |, and replacing some tokens that need a different representation.

Something like this one for the peglib grammar I've got so far.

Copy and paste the EBNF shown below at https://www.bottlecaps.de/rr/ui on the "Edit Grammar" tab, then click on the "View Diagram" tab.


//To be viewed at https://www.bottlecaps.de/rr/ui

Grammar ::=
     Spacing Definition+ EndOfFile

Spacing ::=
     ( Space | Comment )*

Definition ::=
     ( Ignore IdentCont Parameters LEFTARROW Expression Instruction? )
    | ( Ignore Identifier LEFTARROW Expression Instruction? )

EndOfFile ::=
     _NOT_  .

Ignore ::=
     IGNORE?

IdentCont ::=
     IdentStart IdentRest*

Parameters ::=
     OPEN Identifier ( COMMA Identifier )* CLOSE

LEFTARROW ::=
     ( "<-" | "←" ) Spacing

Expression ::=
     Sequence ( SLASH Sequence )*

Instruction ::=
     BeginBlacket ( InstructionItem ( InstructionItemSeparator InstructionItem )* )? EndBlacket

Identifier ::=
     IdentCont Spacing

Sequence ::=
     ( CUT | Prefix )*

SLASH ::=
     '/' Spacing

CUT ::=
     "↑" Spacing

Prefix ::=
     ( AND | NOT )? SuffixWithLabel

AND ::=
     '&' Spacing

NOT ::=
     '!' Spacing

SuffixWithLabel ::=
     Suffix ( LABEL Identifier )?

Suffix ::=
     Primary Loop?

LABEL ::=
     ( '^' | "⇑" ) Spacing

Primary ::=
     NegatedClass
    | Class
    | ( Ignore IdentCont Arguments _NOT_  LEFTARROW )
    | ( Ignore Identifier _NOT_  ( Parameters? LEFTARROW ) )
    | ( OPEN Expression CLOSE )
    | ( BeginTok Expression EndTok )
    | ( BeginCapScope Expression EndCapScope )
    | ( BeginCap Expression EndCap )
    | BackRef
    | LiteralI
    | Dictionary
    | Literal
    | DOT

Loop ::=
     QUESTION
    | STAR
    | PLUS
    | Repetition

QUESTION ::=
     '?' Spacing

STAR ::=
     '*' Spacing

PLUS ::=
     '+' Spacing

Repetition ::=
     BeginBlacket RepetitionRange EndBlacket

NegatedClass ::=
     "[^" ( _NOT_  ']' Range )+ ']' Spacing

Class ::=
     '[' _NOT_  '^' ( _NOT_  ']' Range )+ ']' Spacing

Arguments ::=
     OPEN Expression ( COMMA Expression )* CLOSE

OPEN ::=
     '(' Spacing

CLOSE ::=
     ')' Spacing

BeginTok ::=
     '<' Spacing

EndTok ::=
     '>' Spacing

BeginCapScope ::=
     '$' '(' Spacing

EndCapScope ::=
     ')' Spacing

BeginCap ::=
     '$' IdentCont '<' Spacing

EndCap ::=
     '>' Spacing

BackRef ::=
     '$' IdentCont Spacing

LiteralI ::=
     ( ['] ( _NOT_  ['] Char )* "'i" Spacing )
    | ( ["] ( _NOT_  ["] Char )* '"i' Spacing )

Dictionary ::=
     LiteralD ( PIPE LiteralD )+

Literal ::=
     lit_ope

DOT ::=
     '.' Spacing

IdentStart ::=
     _NOT_  ( "↑" | "⇑" ) ( [a-zA-Z_%] | [\x0080-\xFFFF] )

IdentRest ::=
     IdentStart
    | [0-9]

LiteralD ::=
     lit_ope

PIPE ::=
     '|' Spacing

lit_ope ::=
     ( ['] ( _NOT_  ['] Char )* ['] Spacing )
    | ( ["] ( _NOT_  ["] Char )* ["] Spacing )

Char ::=
     ( '\\' [nrt'"#x1b#x1d\\^] )
    | ( '\\' [0-3] [0-7] [0-7] )
    | ( '\\' [0-7] [0-7]? )
    | ( "\\x" [0-9a-fA-F] [0-9a-fA-F]? )
    | ( "\\u" ( ( '0' [0-9a-fA-F] ) | "10" ) [0-9a-fA-F] )
    | ( [0-9a-fA-F] )
    | ( _NOT_  '\\' . )

Range ::=
     ( Char '-' Char )
    | Char

BeginBlacket ::=
     '{' Spacing

RepetitionRange ::=
     ( Number COMMA Number )
    | ( Number COMMA )
    | Number
    | ( COMMA Number )

EndBlacket ::=
     '}' Spacing

Number ::=
     [0-9]+ Spacing

COMMA ::=
     ',' Spacing

Space ::=
     ' '
    | '\t'
    | EndOfLine

Comment ::=
     '#' ( _NOT_  EndOfLine . )* EndOfLine

EndOfLine ::=
     "\r\n"
    | '\n'
    | '\r'

IGNORE ::=
     '~'

InstructionItem ::=
     PrecedenceClimbing
    | ErrorMessage
    | NoAstOpt

InstructionItemSeparator ::=
     ';' Spacing

PrecedenceClimbing ::=
     "precedence" SpacesOom PrecedenceInfo ( SpacesOom PrecedenceInfo )* SpacesZom

ErrorMessage ::=
     "message" SpacesOom LiteralD SpacesZom

NoAstOpt ::=
     "no_ast_opt" SpacesZom

SpacesZom ::=
     Space*

SpacesOom ::=
     Space+

PrecedenceInfo ::=
     PrecedenceAssoc ( SpacesOom PrecedenceOpe )+

PrecedenceAssoc ::=
     [LR]

PrecedenceOpe ::=
     ( ['] ( _NOT_  ( Space | ['] ) Char )* ['] )
    | ( ["] ( _NOT_  ( Space | ["] ) Char )* ["] )
    | ( _NOT_  ( PrecedenceAssoc | Space | '}' ) . )+

//Added tokens for railroad generation
_NOT_ ::= '!'
_AND_ ::= '&'
yhirose commented 2 years ago

@mingodad, at least I easily found three problems in your translated peg grammar. Here are my corrections.

IdentStart <- !"↑" !"⇑" ([a-zA-Z_%] / [\u0080-\uFFFF])

Range <- (Char '-' Char) / Char

Char <-
    '\\' [nrt'"[\]\\^]
    / '\\' [0-3] [0-7] [0-7]
    / '\\' [0-7] [0-7]?
    / "\\x" [0-9a-fA-F] [0-9a-fA-F]?
    / "\\u" (('0' [0-9a-fA-F]) / "10") [0-9a-fA-F]{4, 4} / [0-9a-fA-F]{4, 5}
    / !'\\' .

There seem to be more mistakes in the grammar, and I feel it's pretty dangerous to translate the C++ parser combinators code to the PEG format by hand. You should carefully check that the translated one is really valid with a generated AST. Hope it helps.