Closed mingodad closed 1 year ago
Looking at the C++ grammar code, I noticed that IdentCont is not grouped with (), and that doing so seems to affect performance.
As it is, we get these numbers from the playground when parsing itself:
duration: 0.0457s (45700µs) id total % success fail definition 30604 14790 15814 Total counters 48.33 51.67 % success/fail ...
But if we group IdentCont as IdentCont <- <(IdentStart (IdentRest)*)> then:
IdentCont <- <(IdentStart (IdentRest)*)>
duration: 0.0313s (31300µs) id total % success fail definition 19182 5611 13571 Total counters 29.25 70.75 % success/fail ...
Here is the cpp-peglib grammar mechanically extracted from peglib.h:
#// Setup PEG syntax parser Grammar <- (Spacing (Definition)+ EndOfFile) Definition <- ((Ignore IdentCont Parameters LEFTARROW Expression (Instruction)?) / (Ignore Identifier LEFTARROW Expression (Instruction)?)) Expression <- (Sequence ((SLASH Sequence))*) Sequence <- ((CUT /Prefix))* Prefix <- (((AND /NOT))? SuffixWithLabel) SuffixWithLabel <- (Suffix ((LABEL Identifier))?) Suffix <- (Primary (Loop)?) Loop <- (QUESTION /STAR /PLUS /Repetition) Primary <- ((Ignore IdentCont Arguments !(LEFTARROW)) / (Ignore Identifier !(((Parameters)? LEFTARROW))) / (OPEN Expression CLOSE) / (BeginTok Expression EndTok) / (BeginCapScope Expression EndCapScope) / (BeginCap Expression EndCap) /BackRef / LiteralI /Dictionary /Literal /NegatedClass / Class /DOT) Identifier <- (IdentCont Spacing) IdentCont <- <(IdentStart (IdentRest)*)> IdentStart <- (!("↑") !("⇑") ([a-zA-Z_%] / [0x0080-0xFFFF])) IdentRest <- (IdentStart / [0-9]) Dictionary <- (LiteralD ((PIPE LiteralD))+) lit_ope <- ((['] <(((!([']) Char))*)> ['] Spacing) / (["] <(((!(["]) Char))*)> ["] Spacing)) Literal <- lit_ope LiteralD <- lit_ope LiteralI <- ((['] <(((!([']) Char))*)> "'i" Spacing) / (["] <(((!(["]) Char))*)> "\"i" Spacing)) #// NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'. Class <- ('[' !('^') <(((!(']') Range))+)> ']' Spacing) NegatedClass <- ("[^" <(((!(']') Range))+)> ']' Spacing) #// NOTE: This is different from The original Brian Ford's paper, and this #// modification allows us to specify `[+-]` as a valid char class. Range <- ((Char '-' !(']') Char) /Char) Char <- (('\\' [abefnrtv'"\[\]\\^]) / ('\\' [0-3] [0-7] [0-7]) / ('\\' [0-7] ([0-7])?) / ("\\x" [0-9a-fA-F] ([0-9a-fA-F])?) 
/ ("\\u" (((('0' [0-9a-fA-F]) / "10") ([0-9a-fA-F]{4,4})) / ([0-9a-fA-F]{4,5}))) / (!('\\') .)) Repetition <- (BeginBlacket RepetitionRange EndBlacket) RepetitionRange <- ((Number COMMA Number) / (Number COMMA) /Number / (COMMA Number)) Number <- (([0-9])+ Spacing) LEFTARROW <- (("<-" / "←") Spacing) ~SLASH <- ('/' Spacing) ~PIPE <- ('|' Spacing) AND <- ('&' Spacing) NOT <- ('!' Spacing) QUESTION <- ('?' Spacing) STAR <- ('*' Spacing) PLUS <- ('+' Spacing) ~OPEN <- ('(' Spacing) ~CLOSE <- (')' Spacing) DOT <- ('.' Spacing) CUT <- ("↑" Spacing) ~LABEL <- (('^' / "⇑") Spacing) ~Spacing <- ((Space /Comment))* Comment <- ('#' ((!(EndOfLine) .))* EndOfLine) Space <- (' ' / '\t' /EndOfLine) EndOfLine <- ("\r\n" / '\n' / '\r') EndOfFile <- !(.) ~BeginTok <- ('<' Spacing) ~EndTok <- ('>' Spacing) ~BeginCapScope <- ('$' '(' Spacing) ~EndCapScope <- (')' Spacing) BeginCap <- ('$' <(IdentCont)> '<' Spacing) ~EndCap <- ('>' Spacing) BackRef <- ('$' <(IdentCont)> Spacing) IGNORE <- '~' Ignore <- (IGNORE)? Parameters <- (OPEN Identifier ((COMMA Identifier))* CLOSE) Arguments <- (OPEN Expression ((COMMA Expression))* CLOSE) ~COMMA <- (',' Spacing) #// Instruction grammars Instruction <- (BeginBlacket ((InstructionItem ((InstructionItemSeparator InstructionItem))*))? 
EndBlacket) InstructionItem <- (PrecedenceClimbing /ErrorMessage /NoAstOpt) ~InstructionItemSeparator <- (';' Spacing) ~SpacesZom <- (Space)* ~SpacesOom <- (Space)+ ~BeginBlacket <- ('{' Spacing) ~EndBlacket <- ('}' Spacing) #// PrecedenceClimbing instruction PrecedenceClimbing <- ("precedence" SpacesOom PrecedenceInfo ((SpacesOom PrecedenceInfo))* SpacesZom) PrecedenceInfo <- (PrecedenceAssoc ((&(SpacesOom) PrecedenceOpe))+) PrecedenceOpe <- ((['] <(((!((Space / ['])) Char))*)> [']) / (["] <(((!((Space / ["])) Char))*)> ["]) / <(((!((PrecedenceAssoc /Space / '}')) .))+)>) PrecedenceAssoc <- [LR] #// Error message instruction ErrorMessage <- ("message" SpacesOom LiteralD SpacesZom) #// No Ast node optimazation instruction NoAstOpt <- ("no_ast_opt" SpacesZom)
Here is the relevant C++ grammar code:
g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));
@mingodad thanks for the report!
> Looking at the C++ grammar code, I noticed that `IdentCont` is not grouped with `()`, and that doing so seems to affect performance.
> As it is, we get these numbers from the playground when parsing itself:
> But if we group `IdentCont` as `IdentCont <- <(IdentStart (IdentRest)*)>` then:
> Here is the cpp-peglib grammar mechanically extracted from `peglib.h`:
> Here is the relevant C++ grammar code: