Closed ChrisHixon closed 2 years ago
I think that transforming the examples/peg_parse_file.c
in a tool like cpp-peglib/lint/peglint.cpp
that would be main tool to develop/test grammars with command line options to trace, generate cbytecode, generate ast in json/other format for easy interoperability with other tools like https://astexplorer.net/ .
I have a 'refactor' branch locally where I've done a lot of the above. It should possible to 'bootstrap' without Ruby since I have completed bytecode output to C code (two functions, one for .h and one for the .c) that works within the new refactored design, but there is no util to do this yet. I will probably start with small separate utilities, but it would be cool to eventually have a sort of do-everything utility.
Push it here so I can also play with it and try find a way to achieve that.
It's not in a good state yet, but... Coming Soon.
I just pushed the refactor branch: https://github.com/ChrisHixon/chpeg/tree/refactor
I'm looking at it right now !
At first the char *esc_bytes(const unsigned char *bytes, int length, int limit);
is declared in src/parser.h
and src/util.h
.
Thanks, fixed that.
Here is a possible way to add an option to generate the cbytecode:
diff --git a/examples/parse_file.c b/examples/parse_file.c
index ae972de..8ca44c9 100644
--- a/examples/parse_file.c
+++ b/examples/parse_file.c
@@ -101,18 +101,24 @@ int main(int argc, char *argv[])
int ret = 0;
char *grammar_filename = NULL;
char *input_filename = NULL;
+ int gen_cbytecode = 0;
#ifdef DEBUG_MEM
mtrace();
#endif
if (argc < 2 || argc > 3) {
- fprintf(stderr, "usage: %s [<grammar>] <input>\n", argv[0]);
+ fprintf(stderr, "usage: %s [--cbytecode] <grammar> [<input>]\n", argv[0]);
ret = 1;
goto done;
}
- if (argc == 2) {
+ gen_cbytecode = strcmp(argv[1], "--cbytecode") == 0;
+
+ if(gen_cbytecode && argc == 3) {
+ grammar_filename = argv[2];
+ }
+ else if (argc == 2) {
input_filename = argv[1];
}
else if (argc == 3) {
@@ -144,6 +150,10 @@ int main(int argc, char *argv[])
// uncomment to print a dump of the byte code (defs, instructions, and strings)
//ByteCode_print(byte_code);
+ if(gen_cbytecode) {
+ ByteCode_output_c(byte_code, stdout, grammar_filename, NULL);
+ goto done;
+ }
free(input);
input = NULL;
The cbytecode generated is missing the #define PEG_*
.
for(int i=0; i < bc->num_defs; ++i) {
const char *def_name = bc->def_names[i];
fprintf(fp, "#define PEG_");
for(int j=0, jmax=strlen(def_name); j < jmax; ++j)
fputc(toupper(def_name[j]), fp);
fprintf(fp, " %d\n", i);
}
fprintf(fp, "\n");
A small fix for the usage message:
diff --git a/examples/parse_file.c b/examples/parse_file.c
index ae972de..6f29374 100644
--- a/examples/parse_file.c
+++ b/examples/parse_file.c
@@ -101,6 +101,7 @@ int main(int argc, char *argv[])
int ret = 0;
char *grammar_filename = NULL;
char *input_filename = NULL;
+ int gen_cbytecode = 0;
#ifdef DEBUG_MEM
mtrace();
@@ -108,11 +109,17 @@ int main(int argc, char *argv[])
if (argc < 2 || argc > 3) {
fprintf(stderr, "usage: %s [<grammar>] <input>\n", argv[0]);
+ fprintf(stderr, " or: %s --cbytecode <grammar>\n", argv[0]);
ret = 1;
goto done;
}
- if (argc == 2) {
+ gen_cbytecode = strcmp(argv[1], "--cbytecode") == 0;
+
+ if(gen_cbytecode && argc == 3) {
+ grammar_filename = argv[2];
+ }
+ else if (argc == 2) {
input_filename = argv[1];
}
else if (argc == 3) {
@@ -144,6 +151,10 @@ int main(int argc, char *argv[])
// uncomment to print a dump of the byte code (defs, instructions, and strings)
//ByteCode_print(byte_code);
+ if(gen_cbytecode) {
+ ByteCode_output_c(byte_code, stdout, grammar_filename, NULL);
+ goto done;
+ }
free(input);
input = NULL;
Also it's missing the memory allocation wrappers that could go on util.h
and in util.c
we could have a default custom memory allocation wrapper that calls exit
if the allocation fails.
Check out test/bytecode_output_c_pt1_output.c. Bytecode output has a .c and .h file.
#include <stdio.h>
#include "chpeg_bytecode.h"
// output chpeg_bytecode as test_bytecode.c & test_bytecode.h files
int main(void)
{
FILE *fp;
fp = fopen("generated/test_bytecode.c", "w");
if (!fp) {
perror("test_bytecode.c");
return 1;
}
ByteCode_output_c(&chpeg_bytecode, fp, "test_bytecode", NULL);
fclose(fp);
fp = fopen("generated/test_bytecode.h", "w");
if (!fp) {
perror("test_bytecode.h");
return 1;
}
ByteCode_output_h(&chpeg_bytecode, fp, "test_bytecode", NULL, "chpeg", NULL);
fclose(fp);
}
Using your example I updated my proposal to include it in examples/parse_file.c
or in my case ./chpeg.c
:
diff --git a/.gitignore b/.gitignore
index 3268211..1e4beb6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
.*.sw?
+/chpeg-nb/build/
+/chpeg-nb/dist/
\ No newline at end of file
diff --git a/examples/parse_file.c b/examples/parse_file.c
index ae972de..423b9ed 100644
--- a/examples/parse_file.c
+++ b/examples/parse_file.c
@@ -91,6 +91,11 @@ done:
return ret;
}
+void usage(const char *prog) {
+ fprintf(stderr, "usage: %s [<grammar>] <input>\n", prog);
+ fprintf(stderr, " or: %s --cbytecode basename <grammar>\n", prog);
+}
+
int main(int argc, char *argv[])
{
unsigned char *input = NULL;
@@ -101,18 +106,31 @@ int main(int argc, char *argv[])
int ret = 0;
char *grammar_filename = NULL;
char *input_filename = NULL;
+ char *base_filename = NULL;
+ int gen_cbytecode = 0;
#ifdef DEBUG_MEM
mtrace();
#endif
- if (argc < 2 || argc > 3) {
- fprintf(stderr, "usage: %s [<grammar>] <input>\n", argv[0]);
+ if (argc < 2 || argc > 4) {
+ usage(argv[0]);
ret = 1;
goto done;
}
- if (argc == 2) {
+ gen_cbytecode = strcmp(argv[1], "--cbytecode") == 0;
+
+ if(gen_cbytecode) {
+ if(argc != 4) {
+ usage(argv[0]);
+ ret = 1;
+ goto done;
+ }
+ base_filename = argv[2];
+ grammar_filename = argv[3];
+ }
+ else if (argc == 2) {
input_filename = argv[1];
}
else if (argc == 3) {
@@ -144,6 +162,28 @@ int main(int argc, char *argv[])
// uncomment to print a dump of the byte code (defs, instructions, and strings)
//ByteCode_print(byte_code);
+ if(gen_cbytecode) {
+ FILE *fp;
+ char strbuf[1024];
+ snprintf(strbuf, sizeof(strbuf), "%s.c", base_filename);
+ fp = fopen(strbuf, "w");
+ if (!fp) {
+ perror(strbuf);
+ return 1;
+ }
+ ByteCode_output_c(byte_code, fp, base_filename, NULL);
+ fclose(fp);
+
+ snprintf(strbuf, sizeof(strbuf), "%s.h", base_filename);
+ fp = fopen(strbuf, "w");
+ if (!fp) {
+ perror(strbuf);
+ return 1;
+ }
+ ByteCode_output_h(byte_code, fp, base_filename, NULL, base_filename, NULL);
+ fclose(fp);
+ goto done;
+ }
free(input);
input = NULL;
Some quick notes for compiler/parser hacking
The bytecode the compiler uses is in src/chpeg_bytecode.c
and src/chpeg_bytecode.h
These files are hand made starting with the Ruby bootstap generated versions, but they are nearly identical to what the ByteCode_output_c/h
functions output.
The op codes the parser uses are in src/opcodes.c
and src/opcodes.h
You can change the bytecode the compiler uses by changing #include "chpeg_bytecode.h"
in src/compiler.c
to another file. I think when developing a new version, it would be a good idea to change this to include your new byte code file rather than just replacing chpeg_bytecode.c/h. It will make it more clear what is going on, what version your compiler is currently using, etc.
Bytecode is targeted to a specific set of op codes; a bit of a reminder of this: the generated bytecode .h file has an include for the "opcodes.h" which contains the op codes it is using to encode instructions. If you're changing op codes, I'd suggest making a new .c/.h file for those, and set your bytecode output to use your new opcodes (one of the args to ByteCode_output_h()
. Then change the other places that include opcodes.h
to include your new opcodes:
src/parser.c:#include "opcodes.h"
src/compiler.c:#include "opcodes.h"
src/bytecode.c:#include "opcodes.h"
Again I think it'd be good to use another file rather than changing opcodes.c/h directly, so it's clear what's going on. Actually I think you'd have to change the opcode includes ahead of time before generating the new compiler byte code. It could be a little tricky. I haven't done this process yet and it definitely needs to be documented when someone does it.
Also it's missing the memory allocation wrappers that could go on
util.h
and inutil.c
we could have a default custom memory allocation wrapper that callsexit
if the allocation fails.
Yes it would be good to put in the exit
if allocation fails wrappers. I'm not sure what you mean about util.h
and in util.c
though.
I mean: In src/util.h:
#ifndef CHPEG_MALLOC
#define CHPEG_MALLOC(sz) chpeg_malloc(sz)
#define CHPEG_REALLOC(ptr, sz) chpeg_realloc(ptr, sz)
#define CHPEG_CALLOC(count, sz) chpeg_calloc(count, sz)
#define CHPEG_FREE(ptr) chpeg_free(ptr)
#endif
In src/util.c:
void *chpeg_malloc(size_t sz) {
void *ptr = malloc(sz);
if(ptr == NULL)
{
fprintf(stderr, "Failed to allocate %zd bytes.\n", sz);
exit(-1);
}
return ptr;
}
...
void chpeg_free(void *ptr) {
free(ptr);
}
I'll play a bit with this code and I'll tell you what I think about it and if I have any suggestion I'll also tell you. Thank you so much for your great work, time and dedication ! :+1:
I just did some dogfooding trying to extend chpeg
using this branch and made several changes to allow it.
I added some examples that uses an amalgamation of chpeg
:
examples/mk-chpeg.sh
creates an amalgamation including src/*
and examples/parse_file.c
and build itexamples/mk-chpeg-hex.sh
create cbytecode from examples/chpeg-hex.chpeg
uses a config file examples/chpeg-hex-cfg.h
as the first file in the amalgamation and build itexamples/mk-chpeg-nocase.sh
create cbytecode from examples/cbyte-nocase.peg
uses a config file examples/chpeg-nocase-cfg.h
as the first file in the amalgamation and build it, this one is not working properly and appreciate any help figuring out how to extend chpeg
to work with this grammar.Now that you've done all prefixing probably it's time to also prefix the filenames ins src/*
, maybe something like:
bytecode.c -> chpeg_bytecode.c
bytecode.h -> chpeg_bytecode.h
chpeg_bytecode.c -> chpeg_chpeg_bytecode.c
chpeg_bytecode.h -> chpeg_chpeg_bytecode.h
compiler.c -> chpeg_compiler.c
compiler.h -> chpeg_compiler.h
mem.c -> chpeg_mem.c
mem.h -> chpeg_mem.h
opcodes.c -> chpeg_opcodes.c
opcodes.h -> chpeg_opcodes.h
parser.c -> chpeg_parser.c
parser.h -> chpeg_parser.h
util.c -> chpeg_util.c
util.h -> chpeg_util.h
Or have the header files in a separated folder and refer to then prefixed like:
include/chpeg/bytecode.h
include/chpeg/chpeg_bytecode.h
include/chpeg/compiler.h
include/chpeg/mem.h
include/chpeg/opcodes.h
include/chpeg/parser.h
include/chpeg/util.h
src/bytecode.c
src/chpeg_bytecode.c
src/compiler.c
src/mem.c
src/opcodes.c
src/parser.c
src/util.c
And then:
#include "chpeg/bytecode.h"
I went with the second approach, and added an amalgamation process; the generated file is include/chpeg/chpeg.h
.
Now I think that the chances of clash/conflict with other people code have been mostly solved. I'll play with it again !
While playing with your last refactor
branch I found that the ChpegFlags
could clash/conflict with custom user defined grammar definitions so I've added _FLAG
to then:
enum ChpegFlags {
CHPEG_FLAG_STOP = 1<<0, // stop automatic unwrapping, forcing this node to be a container
CHPEG_FLAG_IGNORE = 1<<1, // deletes nodes matching this identifier
CHPEG_FLAG_LEAF = 1<<2, // collects this node and anything underneath as a final leaf (text) node
};
Also to avoid clash/conflict with custom user defined grammar definitions so I've added _BC
to default prefix:
#define CHPEG_BC_GRAMMAR 0
#define CHPEG_BC_DEFINITION 1
#define CHPEG_BC_CHOICE 2
#define CHPEG_BC_SEQUENCE 3
#define CHPEG_BC_PREDICATE 4
#define CHPEG_BC_REPEAT 5
#define CHPEG_BC_PRIMARY 6
#define CHPEG_BC_OPTIONS 7
#define CHPEG_BC_IDENTIFIER 8
#define CHPEG_BC_LITERAL 9
#define CHPEG_BC_CHARCLASS 10
#define CHPEG_BC_CHARRANGE 11
#define CHPEG_BC_CHAR 12
#define CHPEG_BC_ESCCHAR 13
#define CHPEG_BC_OCTCHAR 14
#define CHPEG_BC_PLAINCHAR 15
#define CHPEG_BC_PREDOP 16
#define CHPEG_BC_REPOP 17
#define CHPEG_BC_DOT 18
#define CHPEG_BC_S 19
Also for myself I just droped include/chpeg/mem.h
and include/chpeg/chpeg_api.h
and moved their contents to `include/chpeg/util.h
.
Also added some macros
to facilitate code reuse when customizing the default grammar/compiler.
Updated my examples and still need some help to make examples/mk-chpeg-nocase.sh
work with cbyte-nocase.peg
grammar.
Changed ChpegParser_print_error
to output better error messages.
...
I think now it's time to dog food chpeg
with any project of our interest to see what's missing or can be improved.
What kind of usage do you have in mind to use chpeg
?
I would propose to replicate this Lua parser https://github.com/edubart/lpegrex/blob/main/parsers/lua.lua .
Also probably you already know, but just in case, this project https://github.com/zyedidia/gpeg has an interesting paper https://zyedidia.github.io/notes/yedidia_thesis.pdf .
I added an initial version of the chpeg utility to the refactor branch
[~/Projects/GitHub/chpeg/util]% ./chpeg --help
Usage: ./chpeg [GRAMMAR] FILE | [--ACTION [OPTION | FILE]... ]...
Global options:
-g GRAMMAR compile grammar from file GRAMMAR
-G STRING compile grammar specified as STRING
-p FILE parse FILE, using grammar from -g/-G, if present
-P STRING parse STRING, using grammar from -g/-G, if present
Global debugging and verbosity options:
-d increase debug level
-dN set debug level to N (use 0 to disable)
-v increase verbosity
-vN set verbosity to N (use 0 to disable)
Actions: (in form --ACTION)
--ast print parse tree/ast (default action)
--bytecode print bytecode
--bytecodec print bytecode as C language source code
Help:
./chpeg --help print global help
./chpeg --ACTION --help print help for ACTION
Note the options -G and -P to specify grammar and data to parse on the command line... a quick playground of sorts.
[~/Projects/GitHub/chpeg/util]% ./chpeg -G 'A<-"a"*' -P aaaa
offset len id dp flg ident "data"
0 4 0 0 --- A "aaaa"
[~/Projects/GitHub/chpeg/util]% ./chpeg -G 'A<-"a"*' --bytecodec -basename aparser
#ifndef CHPEG_APARSER_H
#define CHPEG_APARSER_H
#ifndef CHPEG_AMALGAMATION
#include "chpeg/chpeg_api.h"
#include "chpeg/bytecode.h"
#include "chpeg/opcodes.h"
#endif
#define CHPEG_A 0
CHPEG_API const ChpegByteCode aparser;
#endif // #ifndef CHPEG_APARSER_H
#ifndef CHPEG_AMALGAMATION
#include "aparser.h"
#endif
CHPEG_DEF const ChpegByteCode aparser = {
.num_defs = 1,
.def_names = (char*[1]) {"A"},
.def_flags = (int[1]) {0},
.def_addrs = (int[1]) {2}, // presubtracted by 1
.num_instructions = 10,
.instructions = (int[10]) {
/* 0 */ CHPEG_INST(CHPEG_OP_IDENT , 0), /* A */
/* 1 */ CHPEG_INST(CHPEG_OP_FAIL , 0),
/* 2 */ CHPEG_INST(CHPEG_OP_SUCC , 0),
/* 3 */ CHPEG_INST(CHPEG_OP_RSBEG , 0),
/* 4 */ CHPEG_INST(CHPEG_OP_LIT , 0), /* "a" */
/* 5 */ CHPEG_INST(CHPEG_OP_GOTO , 6),
/* 6 */ CHPEG_INST(CHPEG_OP_RSMAT , 3),
/* 7 */ CHPEG_INST(CHPEG_OP_RSDONE , 0),
/* 8 */ CHPEG_INST(CHPEG_OP_ISUCC , 0), /* A */
/* 9 */ CHPEG_INST(CHPEG_OP_IFAIL , 0),
},
.num_strings = 1,
.strings = (unsigned char**)(char*[1]) {"a"},
.str_len = (int[1]) {1}
};
Yes it would be good to prefix the flags and bytecode def/rule constants.
The chpeg util can be built with the VM tracing/print tree features enabled. The Makefile in util builds a separate binary called chpeg-trace with these features enabled. The --help shows how to use them.
Tracing options:
-t increase VM trace level
-tN set VM trace level to N (0=off,1=on)
-tp increase VM print tree level
-tpN set VM print tree level to N (0=off,1=on)
Unexpected white space 16 5 4 | L | SETS "set\n\t"
while testing this grammar:
start <- decl* !.
decl <- _ SETS ident semi
ident {L} <- [a-zA-Z]+ _
semi <- ';' _
SETS {L} <- SET "s"? _
SET <- "set"
_ {I} <- [ \t\n\r]*
With this input:
sets I;
set j;
set
k;
I'm getting this output:
./parse_file sets.peg sets.txt
Parse successful.
Grammar file compiled successfully. Parser returned: 0
Parse successful.
---------------------------------------------------------------------------------
Begin Len DefID Flags Def. Name / Data
---------------------------------------------------------------------------------
0 24 0 | | start "sets I;\nset j;\nset\n\tk;\n"
0 9 1 | | decl "sets I;\n"
0 6 4 | L | SETS "sets "
6 1 2 | L | ident "I"
7 2 3 | | semi ";\n"
9 7 1 | | decl "set j;\n"
9 4 4 | L | SETS "set "
13 1 2 | L | ident "j"
14 2 3 | | semi ";\n"
16 8 1 | | decl "set\n\tk;\n"
16 5 4 | L | SETS "set\n\t"
21 1 2 | L | ident "k"
22 2 3 | | semi ";\n"
---------------------------------------------------------------------------------
I was expecting no white space (_ {I} <- [ \t\n\r]*
) like 16 5 4 | L | SETS "set"
and 22 2 3 | | semi ";"
This peglib
grammar output as expected:
start <- decl* !.
decl <- _ SETS ident semi
ident <- <[a-zA-Z]+> _
semi <- ';' _
SETS <- <SET "s"?> _
SET <- "set"
~_ <- [ \t\n\r]*
Output:
+ start
+ decl
- SETS (sets)
- ident (I)
+ semi
+ decl
- SETS (set)
- ident (j)
+ semi
+ decl
- SETS (set)
- ident (k)
+ semi
There is no equivalent of the < > operators in chpeg. Everything on the right hand side of a rule will be captured, where 'captured' is simply recording offset+length of the entire match, whether any tree nodes are ignored or not. Ignore, {I} will remove the parse tree node but not anything captured by offset+length in a parent rule. To fix this you either need separate capture rules or move the white space up to higher level rules.
Separate capture rules:
start <- decl* !.
decl <- _ SETS ident semi
ident <- ident_cap _
ident_cap <- [a-zA-Z]+
semi <- ';' _
SETS <- SETS_CAP _
SETS_CAP <- SET "s"?
SET <- "set"
_ {I} <- [ \t\n\r]*
Moving whitespace up to decl, everything captures what it is meant to refer to:
start <- decl* !.
decl <- _ SETS _ ident _ semi _
ident <- [a-zA-Z]+
semi <- ';'
SETS {L} <- SET "s"?
SET <- "set"
_ {I} <- [ \t\n\r]*
I certainly prefer the latter, and this should work in any PEG implementation omitting the chpeg Options {I},{L}.
AST from this:
offset len id dp flg ident "data"
0 24 0 0 --- start "sets I;\nset j;\nset\n\tk;\n"
0 9 1 1 --- decl "sets I;\n"
0 4 4 2 --L SETS "sets"
6 1 2 2 --- ident "I"
7 1 3 2 --- semi ";"
9 7 1 1 --- decl "set j;\n"
9 3 4 2 --L SETS "set"
13 1 2 2 --- ident "j"
14 1 3 2 --- semi ";"
16 8 1 1 --- decl "set\n\tk;\n"
16 3 4 2 --L SETS "set"
21 1 2 2 --- ident "k"
22 1 3 2 --- semi ";"
We've discussed this a couple times before and I always find it puzzling people want to include white space in a 'token' to be captured by the AST. Think about it this way: numbers do not have white space, identifiers do not have white space, etc.
Peglib equivalent seems to be:
start <- decl* !.
decl <- _ SETS _ ident _ semi _
ident <- [a-zA-Z]+
semi <- ';'
SETS <- < SET "s"? >
SET <- "set"
~_ <- [ \t\n\r]*
Another variation:
start <- _ decl* !.
decl <- SETS _ ident _ semi _
ident <- [a-zA-Z]+
semi {I} <- ';'
SETS {L} <- SET "s"?
SET <- "set"
_ {I} <- [ \t\n\r]*
I find it best/easiest to deal white space this way, and be consistent with it... Initial white space in start rule; trailing white space in the higher level rules; and AST nodes capturing what we actually want in them. Also this example ignores semi, since I'm guessing you don't really need to know it's there in the AST. Its purpose is to terminate the decl.
offset len id dp flg ident "data"
0 24 0 0 --- start "sets I;\nset j;\nset\n\tk;\n"
0 9 1 1 --- decl "sets I;\n"
0 4 4 2 --L SETS "sets"
6 1 2 2 --- ident "I"
9 7 1 1 --- decl "set j;\n"
9 3 4 2 --L SETS "set"
13 1 2 2 --- ident "j"
16 8 1 1 --- decl "set\n\tk;\n"
16 3 4 2 --L SETS "set"
21 1 2 2 --- ident "k"
@mingodad,
I thought about this more and came up with a simple way to implement < and >. I call these 'trim' operators because they trim the start (offset) or end (length) of the current node when the TRIM instruction is called.
Implemented in this branch: https://github.com/ChrisHixon/chpeg/tree/trim-operators
Without TRIM:
Expr <- _ Sum EOF
Sum <- Product (PLUS_MINUS Product)*
Product <- Value (MUL_DIV Value)*
Value <- NUMBER / ParenExpr
ParenExpr <- LPAREN Sum RPAREN
PLUS_MINUS <- [+-] _
MUL_DIV <- [*/] _
NUMBER <- ("0" / [1-9][0-9]*) _
LPAREN <- '(' _
RPAREN <- ')' _
_ {I} <- [ \t\r\n]*
EOF <- !.
Test/Output:
% util/chpeg -v -g test/grammars/calc-ts.chpeg -P ' 1 + 2 * 3 / 4 '
Compiling grammar file: 'test/grammars/calc-ts.chpeg'
Grammar file 'test/grammars/calc-ts.chpeg' compiled successfully. Parser returned: 0
Parsing string: " 1 + 2 * 3 / 4 "
Parse successful.
offset len id dp flg ident "data"
0 20 0 0 --- Expr " 1 + 2 * 3 / 4 "
2 18 1 1 --- Sum "1 + 2 * 3 / 4 "
2 3 7 2 --- NUMBER "1 "
5 2 5 2 --- PLUS_MINUS "+ "
7 13 2 2 --- Product "2 * 3 / 4 "
7 4 7 3 --- NUMBER "2 "
11 2 6 3 --- MUL_DIV "* "
13 2 7 3 --- NUMBER "3 "
15 2 6 3 --- MUL_DIV "/ "
17 3 7 3 --- NUMBER "4 "
20 0 11 1 --- EOF ""
With TRIM:
Expr <- _ Sum EOF
Sum <- Product (PLUS_MINUS Product)*
Product <- Value (MUL_DIV Value)*
Value <- NUMBER / ParenExpr
ParenExpr <- LPAREN Sum RPAREN
PLUS_MINUS <- < [+-] > _
MUL_DIV <- < [*/] > _
NUMBER <- < ("0" / [1-9][0-9]*) > _
LPAREN <- < '(' > _
RPAREN <- < ')' > _
_ {I} <- [ \t\r\n]*
EOF <- !.
Test/Output:
% util/chpeg -v -g test/grammars/calc-trim.chpeg -P ' 1 + 2 * 3 / 4 '
Compiling grammar file: 'test/grammars/calc-trim.chpeg'
Grammar file 'test/grammars/calc-trim.chpeg' compiled successfully. Parser returned: 0
Parsing string: " 1 + 2 * 3 / 4 "
Parse successful.
offset len id dp flg ident "data"
0 20 0 0 --- Expr " 1 + 2 * 3 / 4 "
2 18 1 1 --- Sum "1 + 2 * 3 / 4 "
2 1 7 2 --- NUMBER "1"
5 1 5 2 --- PLUS_MINUS "+"
7 13 2 2 --- Product "2 * 3 / 4 "
7 1 7 3 --- NUMBER "2"
11 1 6 3 --- MUL_DIV "*"
13 1 7 3 --- NUMBER "3"
15 1 6 3 --- MUL_DIV "/"
17 1 7 3 --- NUMBER "4"
20 0 11 1 --- EOF ""
Would it work also with other characters like in several grammars they have this for strings:
string <- '"' < [^\"]* > '"' _
Yes, it should work for trimming off the match offset/length of anything. Sub-nodes can still exist so you probably want to make sure anything trimmed off is ignored or the capture rule is {L}
. <
and >
don't need to match up either. Currently they can be in the reverse order and I can think of some bugs that can happen. I'm pondering whether they should have to match up or not and what exactly should be allowed or flagged as errors. It could be a very slight optimization to leave one off:
NUMBER <- [0-9]* >_
I'm just going to test some non sense uses of <>
to see what happens.
Like this one that doesn't change the output or show error:
Expr <- _ Sum EOF
Sum <- Product (PLUS_MINUS Product)*
Product <- Value (MUL_DIV Value)*
Value <- NUMBER / ParenExpr
ParenExpr <- LPAREN Sum RPAREN
PLUS_MINUS <- << [+-] > _
MUL_DIV <- < [*/] > _
NUMBER <- < ("0" / [1-9][0-9]*) > _
LPAREN <- < '(' > _
RPAREN <- < ')' > _
_ {I} <- [ \t\r\n]*
EOF <- !.
Output:
./chpeg -v -g calc-trim2.chpeg -P ' 1 + 2 * 3 / 4 '
Compiling grammar file: 'calc-trim2.chpeg'
Grammar file 'calc-trim2.chpeg' compiled successfully. Parser returned: 0
Parsing string: " 1 + 2 * 3 / 4 "
Parse successful.
offset len id dp flg ident "data"
0 20 0 0 --- Expr " 1 + 2 * 3 / 4 "
2 18 1 1 --- Sum "1 + 2 * 3 / 4 "
2 1 7 2 --- NUMBER "1"
5 1 5 2 --- PLUS_MINUS "+"
7 13 2 2 --- Product "2 * 3 / 4 "
7 1 7 3 --- NUMBER "2"
11 1 6 3 --- MUL_DIV "*"
13 1 7 3 --- NUMBER "3"
15 1 6 3 --- MUL_DIV "/"
17 1 7 3 --- NUMBER "4"
20 0 11 1 --- EOF ""
Or this one no error message:
Expr <- _ Sum EOF
Sum <- Product (PLUS_MINUS Product)*
Product <- Value (MUL_DIV Value)*
Value <- NUMBER / ParenExpr
ParenExpr <- LPAREN Sum RPAREN
PLUS_MINUS <- << [+-] _
MUL_DIV <- [*/] >> _
NUMBER <- < ("0" / [1-9][0-9]*) > _
LPAREN <- < '(' > _
RPAREN <- < ')' > _
_ {I} <- [ \t\r\n]*
EOF <- !.
Output:
./chpeg -v -g calc-trim3.chpeg -P ' 1 + 2 * 3 / 4 '
Compiling grammar file: 'calc-trim3.chpeg'
Grammar file 'calc-trim3.chpeg' compiled successfully. Parser returned: 0
Parsing string: " 1 + 2 * 3 / 4 "
Parse successful.
offset len id dp flg ident "data"
0 20 0 0 --- Expr " 1 + 2 * 3 / 4 "
2 18 1 1 --- Sum "1 + 2 * 3 / 4 "
2 1 7 2 --- NUMBER "1"
5 2 5 2 --- PLUS_MINUS "+ "
7 13 2 2 --- Product "2 * 3 / 4 "
7 1 7 3 --- NUMBER "2"
11 1 6 3 --- MUL_DIV "*"
13 1 7 3 --- NUMBER "3"
15 1 6 3 --- MUL_DIV "/"
17 1 7 3 --- NUMBER "4"
20 0 11 1 --- EOF ""
And probably in future we'll need named capture as well as complement for back reference.
At least it serves as an example of how to expand the VM
.
Also notice CPP-PEGLIB
uses <>
as an alias to grouping ()
as group capturing.
Yes it works with the string case as well:
Start <- _ String* EOF
String <- "'" < (!"'" .)* > "'" _
_ {I} <- [ \t\r\n]*
EOF <- !.
Output:
./chpeg -v -g string-trim.chpeg -P " 'dad' "
Compiling grammar file: 'string-trim.chpeg'
Grammar file 'string-trim.chpeg' compiled successfully. Parser returned: 0
Parsing string: " 'dad' "
Parse successful.
offset len id dp flg ident "data"
0 7 0 0 --- Start " 'dad' "
2 3 1 1 --- String "dad"
7 0 3 1 --- EOF ""
CPP-PEGLIB calls it < ... > (Token boundary operator)
It allows multiple in a rule for choices (/ separated). If you use two in sequence only the first one 'captures'
Try this on https://yhirose.github.io/cpp-peglib/ with input " 1 + Ab * 0 / 4 "
Expr <- _ Sum EOF
Sum <- Product (PLUS_MINUS Product)*
Product <- Value (MUL_DIV Value)*
Value <- NUMBER / IDENT / ParenExpr
ParenExpr <- LPAREN Sum RPAREN
IDENT <- <[A-Z]+> <[a-z]+> _
PLUS_MINUS <- < [+-] > _
MUL_DIV <- < [*/] > _
NUMBER <- ( <"0"> / <[1-9][0-9]*>) _
LPAREN <- < '(' > _
RPAREN <- < ')' > _
~_ <- [ \t\r\n]*
EOF <- !.
Notice it only captures the A on the IDENT. It captures both NUMBER cases.
+ Expr
+ Sum
- Product/0[NUMBER] (1)
- PLUS_MINUS (+)
+ Product
- Value/1[IDENT] (A)
- MUL_DIV (*)
- Value/0[NUMBER] (0)
- MUL_DIV (/)
- Value/0[NUMBER] (4)
- EOF ()
I think I could make the trim idea work with some tweaking, probably enforcing a matched ordered < >. I like the trim idea because of its efficiency - it doesn't involved creating and later destroying nodes like true capturing probably would. Thinking of it as token boundary is good rather than capturing. I think for back references some other capturing/marking would be required.
Here's a better implementation: https://github.com/ChrisHixon/chpeg/commit/433e537c6d9ea0be25d0f4b557a9a0915bc7084d See comments in commit.
This and some fixes are merged into https://github.com/ChrisHixon/chpeg/tree/refactor now
See if you can break it or find flaws!
Trying to parse a grammar for https://github.com/pest-parser/pest I've got a cryptic error when trying to parse this:
Identifier {L} <-
!PUSH <IdentStart IdentCont*> Spacing
But then converting as shown bellow parsed OK:
Identifier {L} <-
<!PUSH IdentStart IdentCont*> Spacing
Full working grammar:
Grammar <-
Spacing Definition+ EndOfFile
Spacing {I} <-
( Space / Comment )*
Definition <-
Identifier EQUAL modifier? opening_brace Expression closing_brace
EndOfFile <-
! .
Space <-
" "
/ "\t"
/ EndOfLine
Comment <-
( "//" ( ! EndOfLine . )* EndOfLine? )
/ ( "/*" ( ! '*/' . )* "*/" )
Identifier {L} <-
<!PUSH IdentStart IdentCont*> Spacing
EQUAL <-
"=" Spacing
modifier <-
silent_modifier
/ atomic_modifier
/ compound_atomic_modifier
/ non_atomic_modifier
opening_brace <-
'{' Spacing
Expression <-
Sequence ( SLASH Sequence )*
closing_brace <-
'}' Spacing
PUSH <-
"PUSH" Spacing
IdentStart <-
[a-zA-Z_]
IdentCont <-
IdentStart
/ Digit
Sequence <-
Prefix ( TILDE Prefix )*
SLASH <-
"|" Spacing
Prefix <-
( AND / NOT )? Suffix
TILDE <-
"~" Spacing
AND <-
"&" Spacing
NOT <-
"!" Spacing
Suffix <-
Primary ( QUESTION / STAR / PLUS / repeat_exact / repeat_min / repeat_max / repeat_min_max )?
Primary <-
( Identifier COLON Prefix ! ( Literal? EQUAL ) )
/ ( Identifier ! ( Literal? EQUAL ) )
/ ( PUSH? OPEN Expression CLOSE )
/ Range
/ ( insensitive_operator? Literal )
/ DOT
/ peek_slice
QUESTION <-
"?" Spacing
STAR <-
"*" Spacing
PLUS <-
"+" Spacing
repeat_exact <-
opening_brace number closing_brace
repeat_min <-
opening_brace number comma closing_brace
repeat_max <-
opening_brace comma number closing_brace
repeat_min_max <-
opening_brace number comma number closing_brace
COLON <-
":" Spacing
Literal <-
LiteralSQ
/ LiteralDQ
/ LiteralBQ
LiteralSQ {L} <-
['] <( ! ['] Char )*> ['] Spacing
LiteralDQ {L} <-
["] <( ! ["] Char )*> ["] Spacing
LiteralBQ {L} <-
[`] <( ! [`] Char )*> [`] Spacing
OPEN <-
"(" Spacing
CLOSE <-
")" Spacing
Range <-
Literal range_operator Literal
insensitive_operator <-
"^"
DOT <-
"ANY" Spacing
peek_slice <-
PEEK opening_brack integer? range_operator integer? closing_brack
Char <-
( "\\" [abefnrtv'"`\[\]\\] )
/ ( "\\" [0-3] OctalDigit OctalDigit )
/ ( "\\" OctalDigit OctalDigit? )
/ ( "\\x" HexDigit HexDigit )
/ ( "\\u" opening_brace HexDigit HexDigit HexDigit HexDigit closing_brace )
/ ( "\\" "-" )
/ ( ! "\\" . )
Digit <-
[0-9]
OctalDigit <-
[0-7]
HexDigit <-
[0-9a-fA-F]
range_operator <-
".." Spacing
EndOfLine <-
"\r\n"
/ "\n\r"
/ "\n"
/ "\r"
silent_modifier <-
"_" Spacing
atomic_modifier <-
"@" Spacing
compound_atomic_modifier <-
"$" Spacing
non_atomic_modifier <-
"!" Spacing
comma <-
',' Spacing
number <-
Digit+
PEEK <-
"PEEK" Spacing
opening_brack <-
'[' Spacing
closing_brack <-
']' Spacing
integer <-
number
/ ( "-" "0"* '1' . . '9' number? )
Closing. I think everything here has been addressed.
I'd like to eliminate the Ruby bootstrap process, and refactor/reorganize some.
Various ideas: