yhirose / cpp-peglib

A single file C++ header-only PEG (Parsing Expression Grammars) library
MIT License
916 stars 113 forks source link

This valid character class is not accepted [^] #218

Closed mingodad closed 2 years ago

mingodad commented 2 years ago

See this discussion: https://github.com/ChrisHixon/chpeg/issues/1#issuecomment-1166246408. Below is the sample input, parsed using the possibly fixed cpp-peglib grammar:

Input:

start <- name1 / name2 
name1 <- [^][a-z]
name2 <- [^2][a-z]

AST:

+ Grammar
  + Definition
    + Ignore
    - Identifier (start)
    + LEFTARROW
    + Expression
      + Sequence/0[Primary]
        + Ignore
        - Identifier (name1)
      + Sequence/0[Primary]
        + Ignore
        - Identifier (name2)
  + Definition
    + Ignore
    - Identifier (name1)
    + LEFTARROW
    + Expression/0[Sequence]
      - Prefix/0[Class] (^)
      - Prefix/0[Class] (a-z)
  + Definition
    + Ignore
    - Identifier (name2)
    + LEFTARROW
    + Expression/0[Sequence]
      - Prefix/0[NegatedClass] (2)
      - Prefix/0[Class] (a-z)
  - EndOfFile ()

Proposed working grammar (manually extracted from peglib.h):

# Setup PEG syntax parser
Grammar <-  Spacing  Definition+  EndOfFile

Definition <-
    Ignore ( IdentCont Parameters / Identifier ) LEFTARROW Expression Instruction?

Expression <-  Sequence ( SLASH Sequence )*

Sequence <-  ( CUT / Prefix )*

Prefix <- ( AND / NOT )? SuffixWithLabel

SuffixWithLabel <- Suffix ( LABEL Identifier )?

Suffix <-  Primary Loop?

Loop <-  QUESTION /  STAR /  PLUS /  Repetition

Primary <-
    Ignore  IdentCont  Arguments !LEFTARROW
    / Ignore  Identifier !(Parameters?  LEFTARROW)
    / OPEN  Expression  CLOSE
    / BeginTok  Expression  EndTok
    / BeginCapScope  Expression  EndCapScope
    / BeginCap  Expression  EndCap
    / BackRef
    / LiteralI
    / Dictionary
    / Literal
    / NegatedClass
    / Class
    / DOT

Identifier <-  <IdentCont>  Spacing

IdentCont <- IdentStart  IdentRest*

IdentStart <-  !"↑"  !"⇑" ([a-zA-Z_%] / [\u0080-\uFFFF])

IdentRest <-  IdentStart /  [0-9]

Dictionary <-  LiteralD  (PIPE  LiteralD)+

lit_ope <-
    [']  <(![']  Char)*> [']  Spacing
    / ["]  <(!["]  Char)*> ["]  Spacing

Literal <-  lit_ope

LiteralD <-  lit_ope

LiteralI <-
    [']  <(![']  Char)*>  "'i" Spacing
    / ["]  <(!["]  Char)*>  '"i' Spacing

# NOTE: Bryan Ford's original paper uses 'zom' instead of 'oom'.
Class <-  '[' <(!']'  Range)+>  ']' Spacing
NegatedClass <-  "[^" <(!']'  Range)+>  ']' Spacing

Range <-  (Char  '-'  Char) /  Char

Char <-
    '\\'  [nrt'\"[\]\\^]
    / '\\'  [0-3]  [0-7]  [0-7]
    / '\\'  [0-7]  [0-7]?
    / "\\x"  [0-9a-fA-F]  [0-9a-fA-F]?
    / "\\u" (((('0' [0-9a-fA-F]) / "10") [0-9a-fA-F]{4,4}) / [0-9a-fA-F]{4,5})
    / !'\\'   .

Repetition <- BeginBlacket  RepetitionRange  EndBlacket

RepetitionRange <-
    Number ( COMMA Number? )?
    / COMMA  Number

Number <-  [0-9]+  Spacing

LEFTARROW <-  ("<-" / "←")  Spacing

~SLASH <-  '/'  Spacing
~PIPE <-  '|'  Spacing
AND <-  '&'  Spacing
NOT <-  '!'  Spacing
QUESTION <- '?'  Spacing
STAR <-  '*'  Spacing
PLUS <-  '+'  Spacing
~OPEN <-  '('  Spacing
~CLOSE <- ')'  Spacing
DOT <-  '.'  Spacing

CUT <-  "↑"  Spacing
~LABEL <-  ('^' /  "⇑")  Spacing

~Spacing <-  (Space /  Comment)*
Comment <- '#'  (!EndOfLine   . )*
Space <-  ' ' /  '\t' /  EndOfLine
EndOfLine <-  "\r\n" /  '\n' /  '\r'
EndOfFile <-  ! .

~BeginTok <-  '<'  Spacing
~EndTok <-  '>'  Spacing

~BeginCapScope <-  '$'  '('  Spacing
~EndCapScope <-  ')'  Spacing

BeginCap <-  '$'  <IdentCont>  '<'  Spacing
~EndCap <-  '>'  Spacing

BackRef <-  '$'  <IdentCont>  Spacing

IGNORE <-  '~'

Ignore <-  IGNORE?
Parameters <-  OPEN  Identifier (COMMA  Identifier)*  CLOSE
Arguments <-  OPEN  Expression (COMMA  Expression)*  CLOSE
~COMMA <-  ','  Spacing

# Instruction grammars
Instruction <-
    BeginBlacket (InstructionItem  (InstructionItemSeparator InstructionItem)*)? EndBlacket
InstructionItem <- PrecedenceClimbing /  ErrorMessage /  NoAstOpt
~InstructionItemSeparator <-  ';'  Spacing

~SpacesZom <-  Space*
~SpacesOom <-  Space+
~BeginBlacket <-  '{'  Spacing
~EndBlacket <-  '}'  Spacing

# PrecedenceClimbing instruction
PrecedenceClimbing <- "precedence"  (SpacesOom  PrecedenceInfo)+  SpacesZom
PrecedenceInfo <- PrecedenceAssoc (~SpacesOom  PrecedenceOpe)+
PrecedenceOpe <-
    ['] <(!(Space /  ['])  Char)*> [']
    / ["] <(!(Space /  ["])  Char)*> ["]
    / <(!(PrecedenceAssoc /  Space /  '}')  . )+>
PrecedenceAssoc <-  [LR]

# Error message instruction
ErrorMessage <- "message"  SpacesOom  LiteralD  SpacesZom

# No AST node optimization instruction
NoAstOpt <-  "no_ast_opt"  SpacesZom
yhirose commented 2 years ago

@mingodad, thanks for the report. This is actually intended behavior, chosen to match the behavior of most regex engines.

> cat bug.peg
start <- name1 / name2
name1 <- [^][a-z]
name2 <- [^2][a-z]

> peglint bug.peg
bug.peg:2:12: syntax error

Here is the output from PCRE2:

image

In order to implement the same behavior, I added !'^' to Class in Bryan Ford's original PEG syntax.

Original:

Class <- '[' (!']' Range)* ']' Spacing

cpp-peglib:

Class <- '[' !'^' (!']' Range)* ']' Spacing