yhirose / cpp-peglib

A single file C++ header-only PEG (Parsing Expression Grammars) library
MIT License
879 stars 112 forks source link

Why "IdentCont" isn't grouped ? #275

Closed mingodad closed 1 year ago

mingodad commented 1 year ago

Looking at the C++ grammar code, I noticed that IdentCont is not grouped (i.e. not wrapped in a token `< ... >`), and that grouping it seems to improve performance.

As it is, we get these numbers from the playground when the grammar parses itself:

duration: 0.0457s (45700µs)

  id       total      %     success        fail  definition
           30604              14790       15814  Total counters
                              48.33       51.67  % success/fail
...

But if we group IdentCont as `IdentCont <- <(IdentStart (IdentRest)*)>`, then we get:

duration: 0.0313s (31300µs)

  id       total      %     success        fail  definition
           19182               5611       13571  Total counters
                              29.25       70.75  % success/fail
...

Here is the cpp-peglib grammar mechanically extracted from peglib.h:

    #// Setup PEG syntax parser
Grammar <- (Spacing  (Definition)+ EndOfFile)
Definition <-
        ((Ignore IdentCont Parameters LEFTARROW 
Expression  (Instruction)?) /
            (Ignore Identifier LEFTARROW Expression 
                (Instruction)?))
Expression <- (Sequence  ((SLASH Sequence))*)
Sequence <- ((CUT /Prefix))*
Prefix <- (((AND /NOT))? SuffixWithLabel)
SuffixWithLabel <-
        (Suffix  ((LABEL Identifier))?)
Suffix <- (Primary  (Loop)?)
Loop <- (QUESTION /STAR /PLUS /Repetition)
Primary <-
        ((Ignore IdentCont Arguments 
                !(LEFTARROW)) /
            (Ignore Identifier 
                !(((Parameters)? LEFTARROW))) /
            (OPEN Expression CLOSE) /
            (BeginTok Expression EndTok) /
            (BeginCapScope Expression EndCapScope) /
            (BeginCap Expression EndCap) /BackRef /
LiteralI /Dictionary /Literal /NegatedClass /
Class /DOT)

Identifier <- (IdentCont Spacing)
IdentCont <- <(IdentStart  (IdentRest)*)>

IdentStart <- (!("↑")  !("⇑") 
                           ([a-zA-Z_%] / [0x0080-0xFFFF]))

IdentRest <- (IdentStart / [0-9])

Dictionary <- (LiteralD  ((PIPE LiteralD))+)

    lit_ope <- (([']  <(((!([']) Char))*)> 
                           ['] Spacing) /
                       (["]  <(((!(["]) Char))*)> 
                           ["] Spacing))
Literal <- lit_ope
LiteralD <- lit_ope

LiteralI <-
        (([']  <(((!([']) Char))*)>  "'i" 
Spacing) /
            (["]  <(((!(["]) Char))*)>  "\"i" 
Spacing))

    #// NOTE: Bryan Ford's original paper uses 'zom' instead of 'oom'.
Class <- ('['  !('^') 
                      <(((!(']') Range))+)>  ']' 
Spacing)
NegatedClass <- ("[^" 
                             <(((!(']') Range))+)>  ']' 
Spacing)

    #// NOTE: This is different from Bryan Ford's original paper; this
    #// modification allows us to specify `[+-]` as a valid char class.
Range <- ((Char  '-'  !(']') Char) /Char)

Char <-
        (('\\'  [abefnrtv'"\[\]\\^]) /
            ('\\'  [0-3]  [0-7]  [0-7]) /
            ('\\'  [0-7]  ([0-7])?) /
            ("\\x"  [0-9a-fA-F]  ([0-9a-fA-F])?) /
            ("\\u" 
                (((('0'  [0-9a-fA-F]) / "10") 
                        ([0-9a-fA-F]{4,4})) /
                    ([0-9a-fA-F]{4,5}))) /
            (!('\\')  .))

Repetition <-
        (BeginBlacket RepetitionRange EndBlacket)
RepetitionRange <- ((Number COMMA Number) /
                                (Number COMMA) /Number /
                                (COMMA Number))
Number <- (([0-9])+ Spacing)

LEFTARROW <- (("<-" / "←") Spacing)
~SLASH <- ('/' Spacing)
~PIPE <- ('|' Spacing)
AND <- ('&' Spacing)
NOT <- ('!' Spacing)
QUESTION <- ('?' Spacing)
STAR <- ('*' Spacing)
PLUS <- ('+' Spacing)
~OPEN <- ('(' Spacing)
~CLOSE <- (')' Spacing)
DOT <- ('.' Spacing)

CUT <- ("↑" Spacing)
~LABEL <- (('^' / "⇑") Spacing)

~Spacing <- ((Space /Comment))*
Comment <-
        ('#'  ((!(EndOfLine)  .))* EndOfLine)
Space <- (' ' / '\t' /EndOfLine)
EndOfLine <- ("\r\n" / '\n' / '\r')
EndOfFile <- !(.)

~BeginTok <- ('<' Spacing)
~EndTok <- ('>' Spacing)

~BeginCapScope <- ('$'  '(' Spacing)
~EndCapScope <- (')' Spacing)

BeginCap <- ('$'  <(IdentCont)>  '<' Spacing)
~EndCap <- ('>' Spacing)

BackRef <- ('$'  <(IdentCont)> Spacing)

IGNORE <- '~'

Ignore <- (IGNORE)?
Parameters <- (OPEN Identifier 
                           ((COMMA Identifier))* CLOSE)
Arguments <- (OPEN Expression 
                          ((COMMA Expression))* CLOSE)
~COMMA <- (',' Spacing)

    #// Instruction grammars
Instruction <-
        (BeginBlacket 
            ((InstructionItem  ((InstructionItemSeparator 
InstructionItem))*))? 
EndBlacket)
InstructionItem <-
        (PrecedenceClimbing /ErrorMessage /NoAstOpt)
~InstructionItemSeparator <- (';' Spacing)

~SpacesZom <- (Space)*
~SpacesOom <- (Space)+
~BeginBlacket <- ('{' Spacing)
~EndBlacket <- ('}' Spacing)

    #// PrecedenceClimbing instruction
PrecedenceClimbing <-
        ("precedence" SpacesOom PrecedenceInfo 
            ((SpacesOom PrecedenceInfo))* SpacesZom)
PrecedenceInfo <-
        (PrecedenceAssoc 
            ((&(SpacesOom) PrecedenceOpe))+)
PrecedenceOpe <-
        ((['] 
                <(((!((Space / ['])) Char))*)> 
                [']) /
            (["] 
                <(((!((Space / ["])) Char))*)> 
                ["]) /
            <(((!((PrecedenceAssoc /Space / '}')) 
                        .))+)>)
PrecedenceAssoc <- [LR]

    #// Error message instruction
ErrorMessage <-
        ("message" SpacesOom LiteralD SpacesZom)

    #// No Ast node optimization instruction
NoAstOpt <- ("no_ast_opt" SpacesZom)

Here is the relevant C++ grammar code:

    g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]);
    g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
yhirose commented 1 year ago

@mingodad thanks for the report!