arithy / packcc

A parser generator for C
Other
348 stars 29 forks source link

Strange bad generated parser #60

Closed mingodad closed 2 years ago

mingodad commented 2 years ago

When trying to build a parser for the lpeg-re grammar to use with packcc, packcc accepts the grammar without any error and generates the parser, but when trying to compile the parser there is the error shown below. It seems to have trouble with the S rule in the grammar.

gcc -o lpeg-re lpeg-re.c
lpeg-re.c: In function ‘pcc_evaluate_rule_suffix’:
lpeg-re.c:1286:25: error: expected expression before ‘)’ token
 1286 |                         )) goto L0005;
      |                         ^

The grammar:

%prefix "lpeg_re"

pattern         <- exp !.
exp             <- S (grammar / alternative)

alternative     <- seq ('/' S seq)*
seq             <- prefix*
prefix          <- '&' S prefix / '!' S prefix / suffix
suffix          <- primary S (([+*?]
                            / '^' [+-]? num
                            / '->' S (string / '{}' / name / num)
                            / '=>' S name) S)*

primary         <- '(' exp ')' / (string / keyword) / class / defined
                 / '{:' (name ':')? exp ':}'
                 / '=' name
                 / '@' exp
                 / '{}'
                 / '{~' exp '~}'
                 / '{|' exp '|}'   # missing table capture
                 / '{' exp '}'
         / '~?' # Expected match
         / '~>' S ( 'foldleft' / 'foldright' / 'rfoldleft' / 'rfoldright' )
         / '$' (string / name / num) # Arbitrary capture
                 / '.'
                 / name S !(asttag / arrow )
                 / '<' name '>'          ## old-style non terminals
         / '^' name
         / '%{' S name S '}'

grammar         <- definition+
definition      <- name S (asttag S)? arrow exp

class           <- '[' '^'? item (!']' item)* ']'
item            <- defined / range / .
range           <- . '-' [^\]]

S               <- ([ \t\f\r\n]  /  '--' [^\r\n]*)*  # spaces and comments
name            <- [A-Za-z_]([A-Za-z0-9_] / '-' !'>' )*
arrow           <- (  '<--' / '<==' / '<-|'  / '<-' )
num             <- [0-9]+
string          <- '"' [^"]* '"' / "'" [^']* "'"
defined         <- '%' name
keyword     <-  '`' [^`]+ '`'
asttag         <- ':' S name

%%
int main() {
    lpeg_re_context_t *ctx = lpeg_re_create(NULL);
    while (lpeg_re_parse(ctx, NULL));
    lpeg_re_destroy(ctx);
    return 0;
}
mingodad commented 2 years ago

Testing with this grammar, which only handles spaces and line comments, the generated parser compiles fine:

start <- S EOF
S <- ([ \t\f\r\n]  / '--' [^\r\n]*)*
EOF <- !.

%%
int main() {
    pcc_context_t *ctx = pcc_create(NULL);
    while (pcc_parse(ctx, NULL));
    pcc_destroy(ctx);
    return 0;
}
mingodad commented 2 years ago

Comparing the faulty generated parser with the good one, I can see that there are 2 lines missing; when I add them manually, the parser compiles fine:

            L0003:;
                ctx->cur = p;
                pcc_thunk_array__revert(ctx->auxil, &chunk->thunks, n);
                if (
                    pcc_refill_buffer(ctx, 1) < 1 ||
                    ctx->buffer.buf[ctx->cur] != '^'
                ) goto L0004;
                ctx->cur++;
                {
                    const size_t p = ctx->cur;
                    const size_t n = chunk->thunks.len;
                    {
                        int u;
                        const size_t n = pcc_get_char_as_utf32(ctx, &u);
                        if (n == 0) goto L0005;
                        if (!(
                u == 0x00000d ||  ///!!!!<<< missing
                                u == 0x00000a    ///!!!!<<< missing
                        )) goto L0005;
                        ctx->cur += n;
                    }
                    goto L0006;
                L0005:;
mingodad commented 2 years ago

I've added some printfs to the generate_matching_utf8_charclass_code function and found the problematic charset, [+-]:

                            / '^' [+-]? num

My fix so far, with the existing tests passing except for the skipped ones (skipped: uncrustify is too old (minimal required version is 0.72.0)):

diff --git a/src/packcc.c b/src/packcc.c
index 9d32252..81c30c2 100644
--- a/src/packcc.c
+++ b/src/packcc.c
@@ -2786,7 +2786,9 @@ static code_reach_t generate_matching_utf8_charclass_code(generate_t *gen, const
                     u0 = 0;
                     r = FALSE;
                 }
-                else if (value[i] != '-') { /* single character */
+                else if ((value[i] != '-')
+                        || (i == n-1) /* the individual '-' char is valid when first or last*/
+                        ) { /* single character */
                     stream__write_characters(gen->stream, ' ', indent + 4);
                     stream__printf(gen->stream, "u == 0x%06x%s\n", u, (i < n) ? " ||" : "");
                     u0 = 0;
mingodad commented 2 years ago

I've updated the test to include this case. The output of tests/test.sh before my fix:

./test.sh
 ✓ Testing ascii.d - generation
...
 ✗ Testing character_classes_0.d - compilation
   (from function `test_compile' in file utils.sh, line 6,
    in test file character_classes_0.d/test.bats, line 10)
     `test_compile' failed
   In file included from main.c:12:
   /home/mingo/dev/c/A_grammars/packcc/tests/character_classes_0.d/parser.c: In function ‘pcc_evaluate_rule_CLASS11’:
   /home/mingo/dev/c/A_grammars/packcc/tests/character_classes_0.d/parser.c:1652:9: error: expected expression before ‘)’ token
    1652 |         )) goto L0000;
         |         ^
   /home/mingo/dev/c/A_grammars/packcc/tests/character_classes_0.d/parser.c: In function ‘pcc_evaluate_rule_CLASS13’:
   /home/mingo/dev/c/A_grammars/packcc/tests/character_classes_0.d/parser.c:1726:9: error: expected expression before ‘)’ token
    1726 |         ) goto L0000;
         |         ^
 ✗ Testing character_classes_0.d - run
   (from function `check_output' in file utils.sh, line 10,
    from function `run_for_input' in file utils.sh, line 15,
    in test file character_classes_0.d/test.bats, line 13)
     `run_for_input "character_classes_0.d/input.txt"' failed
   --- character_classes_0.d/expected.txt
   +++ output
   @@ -1,14 +1 @@
   -CLASS10: 'a+'
   -CLASS10: 'a-'
   -CLASS11: 'b+'
   -CLASS11: 'b-'
   -CLASS12: 'cc'
   -CLASS13: 'dd'
   -CLASS0: '^'
   -CLASS0: '-'
   -CLASS1: '\'
   -CLASS1: ']'
   -CLASS2: 'ぬ'
   -CLASS3: '𝓴'
   -CLASS4: 'J'
   -CLASS5: '7'
   +timeout: failed to run command ‘/home/mingo/dev/c/A_grammars/packcc/tests/character_classes_0.d/parser’: No such file or directory
 ✓ Testing character_classes_1.d - generation
...
70 tests, 2 failures, 2 skipped

And after my fix all tests pass again; see my full changes below:

diff --git a/src/packcc.c b/src/packcc.c
index 9d32252..81c30c2 100644
--- a/src/packcc.c
+++ b/src/packcc.c
@@ -2786,7 +2786,9 @@ static code_reach_t generate_matching_utf8_charclass_code(generate_t *gen, const
                     u0 = 0;
                     r = FALSE;
                 }
-                else if (value[i] != '-') { /* single character */
+                else if ((value[i] != '-')
+                        || (i == n-1) /* the individual '-' char is valid when first or last*/
+                        ) { /* single character */
                     stream__write_characters(gen->stream, ' ', indent + 4);
                     stream__printf(gen->stream, "u == 0x%06x%s\n", u, (i < n) ? " ||" : "");
                     u0 = 0;
diff --git a/tests/character_classes_0.d/expected.txt b/tests/character_classes_0.d/expected.txt
index cab35b6..cbf314e 100644
--- a/tests/character_classes_0.d/expected.txt
+++ b/tests/character_classes_0.d/expected.txt
@@ -1,3 +1,9 @@
+CLASS10: 'a+'
+CLASS10: 'a-'
+CLASS11: 'b+'
+CLASS11: 'b-'
+CLASS12: 'cc'
+CLASS13: 'dd'
 CLASS0: '^'
 CLASS0: '-'
 CLASS1: '\'
diff --git a/tests/character_classes_0.d/input.peg b/tests/character_classes_0.d/input.peg
index 8633e0b..a98f93a 100644
--- a/tests/character_classes_0.d/input.peg
+++ b/tests/character_classes_0.d/input.peg
@@ -1,4 +1,6 @@
-FILE <- TOKEN (_ TOKEN)* _*
+FILE <- 'test0' _ TOKEN1 (_ TOKEN1)* _*
+            /  'test1' _ TOKEN (_ TOKEN)* _*
+
 TOKEN <- CLASS0 / CLASS1 / CLASS2 / CLASS3 / CLASS4 / CLASS5
 CLASS0 <- [\^a-z-] { printf("CLASS0: '%s'\n", $0); }
 CLASS1 <- [\\\]]   { printf("CLASS1: '%s'\n", $0); }
@@ -6,4 +8,11 @@ CLASS2 <- [-あ-ん] { printf("CLASS2: '%s'\n", $0); }
 CLASS3 <- [𝓪-𝔃-]   { printf("CLASS3: '%s'\n", $0); }
 CLASS4 <- [^0-9]   { printf("CLASS4: '%s'\n", $0); }
 CLASS5 <- [^]      { printf("CLASS5: '%s'\n", $0); }
+
+TOKEN1 <- CLASS10 / CLASS11 / CLASS12 / CLASS13
+CLASS10 <- 'a'[-+] { printf("CLASS10: '%s'\n", $0); }
+CLASS11 <- 'b'[+-]   { printf("CLASS11: '%s'\n", $0); }
+CLASS12 <- 'c'[^-+] { printf("CLASS12: '%s'\n", $0); }
+CLASS13 <- 'd'[^+-]   { printf("CLASS13: '%s'\n", $0); }
+
 _ <- ' ' / '\n'
diff --git a/tests/character_classes_0.d/input.txt b/tests/character_classes_0.d/input.txt
index 4a92f45..5dd7427 100644
--- a/tests/character_classes_0.d/input.txt
+++ b/tests/character_classes_0.d/input.txt
@@ -1 +1,2 @@
-^ - \ ] ぬ 𝓴 J 7
+test0 a+ a- b+ b- cc dd
+test1 ^ - \ ] ぬ 𝓴 J 7
arithy commented 2 years ago

I overlooked such a case. Thanks for your report and investigation.