skvadrik / re2c

Lexer generator for C, C++, Go and Rust.
https://re2c.org
Other
1.09k stars 170 forks source link

Tag positions are unexpectedly shifted in buffer refill mode #341

Closed scossu closed 3 years ago

scossu commented 3 years ago

I am following the example in http://re2c.org/manual/manual_c.html#yyfill-with-sentinel-character to build a streaming scanner with an indefinitely refillable buffer. I am also using s-tags to match sub-captures.

This is an example program that reproduces the issue, slightly modified from the example in the manual. Note the buffer is very small to more easily trigger a refill, but larger than any single token in the input.

#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define YYCTYPE     unsigned char
#define YYCURSOR    in->cur
#define YYMARKER    in->mar
#define YYLIMIT     in->lim
#define YYFILL      fill(in) == 0

/**
 * Max chunk size passed to scanner at each iteration.
 */
#define SIZE 64

#define T_EOF       0
#define T_ALNUM     1
#define T_NUMBER    2

typedef struct {
    FILE *      file;               // Input file handle.
    char        buf[SIZE + 1],      // Start of buffer.
         *      lim,                // Position after the last
                                    //   available input character
                                    //   (YYLIMIT)
         *      cur,                // Next input character to be read
                                    //   (YYCURSOR)
         *      mar,                // Most recent match (YYMARKER)
         *      tok;                // Start of current token.
    int         eof;                // if we have reached EOF (T|F)
} Input;

static int fill(Input *in)
{
    if (in->eof) {
        return 1;
    }
    const size_t free = in->tok - in->buf;
    if (free < 1) {
        return 2;
    }
    printf ("Shift %lu bytes.\n", free);
    memmove(in->buf, in->tok, in->lim - in->tok);
    in->lim -= free;
    in->cur -= free;
    in->mar -= free;
    in->tok -= free;
    in->lim += fread(in->lim, 1, free, in->file);
    in->lim[0] = 0;
    in->eof |= in->lim < in->buf + SIZE;
    return 0;
}

static void init(Input *in, FILE *file)
{
    in->file = file;
    in->cur = in->mar = in->tok = in->lim = in->buf + SIZE;
    in->eof = 0;
    fill (in);
}

static int lex (Input *in)
{
    const char *as, *ns, *ne;

    in->tok = in->cur;
    /*!stags:re2c format = 'const char *@@;'; */

    /*!re2c
    re2c:eof = 0;
    re2c:flags:8 = 1;
    re2c:flags:tags = 1;
    re2c:api:style = functions;
    re2c:define:YYFILL:naked = 1;

    NUMBER = [0-9]+;
    ALNUM = @as [a-z]* @ns NUMBER @ne [a-z]*;

    $       {
        printf("End of buffer.\n");
        return T_EOF;
    }

    ALNUM   {
        printf ("ns - as pos: %ld\n", (long) ns - (long) as);
        printf ("ne - ns pos: %ld\n", (long) ne - (long) ns);
        //printf ("as: %s\n", as);
        //printf ("ns: %s\n", ns);
        return T_ALNUM;
    }

    *       { return -1; }

    */
}

int main()
{
    const char *fname = "input";
    const char str[] = (
            "ceilvue3856slig seuneslriguner1234seor rgtthdkp5678 "
            "fweorrifj4568epjossmkawergm 9277fkeepsokrgpseok dshagj7690 "
            "vnbgsep7800 vvjkksppsn0037qa nd1456dekpsmsjghs fjjjekjh7734"
            );
    FILE *f;
    Input in;

    f = fopen(fname, "w");
    fwrite(str, 1, sizeof(str) - 1, f);
    fclose(f);

    f = fopen(fname, "r");
    init(&in, f);

    int rc = 0;
    do {
        printf ("Token: %d\n", rc);
    } while ((rc = lex(&in)) != T_EOF );

    fclose(f);

    remove(fname);
    return 0;
}

Compiling and running yields this output:

$ re2c test.re -o test.c -T --case-ranges 
$ gcc -Wall -g3 test.c -o test.o
$ ./test.o Shift 64 bytes.
Shift 64 bytes.
Token: 0
ns - as pos: 7
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 14
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 8
ne - ns pos: 4
Token: 1
Token: -1
Shift 52 bytes.
ns - as pos: 9
ne - ns pos: -48
Token: 1
Token: -1
ns - as pos: 0
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 6
ne - ns pos: 4
Token: 1
Token: -1
Shift 59 bytes.
ns - as pos: -52
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 10
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 2
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 8
ne - ns pos: 4
Token: 1
End of buffer.

Note that as the buffer is refilled, some tags have a lower position than expected (note the negative numbers); precisely, they are off by the shift amount. But that only seem to happen to the tags that would have been off range before the shift.

Am I using the s-tags incorrectly, or is it an implementation issue?

Thanks for the help.

skvadrik commented 3 years ago

Hi @scossu!

The tags are not shifted in your example because they are not part of the Input struct, and not handled by YYFILL. Other pointers (lim, cur, mar, tok) are shifted by YYFILL, but not the tag variables (those generated with /*!stags:re2c ... */). To fix this, apply the following diff:

--- test.re.orig        2021-01-11 07:17:54.414214948 +0000
+++ test.re     2021-01-11 07:17:23.277213695 +0000
@@ -32,6 +32,7 @@
                                     //   (YYCURSOR)
          *      mar,                // Most recent match (YYMARKER)
          *      tok;                // Start of current token.
+    /*!stags:re2c format = "char *@@;"; */
     int         eof;                // if we have reached EOF (T|F)
 } Input;

@@ -51,6 +52,7 @@
     in->cur -= free;
     in->mar -= free;
     in->tok -= free;
+    /*!stags:re2c format = "in->@@ -= free;"; */
     in->lim += fread(in->lim, 1, free, in->file);
     in->lim[0] = 0;
     in->eof |= in->lim < in->buf + SIZE;
@@ -72,12 +74,12 @@
     const char *as, *ns, *ne;

     in->tok = in->cur;
-    /*!stags:re2c format = 'const char *@@;'; */

     /*!re2c
     re2c:eof = 0;
     re2c:flags:8 = 1;
     re2c:flags:tags = 1;
+    re2c:tags:expression = "in->@@";
     re2c:api:style = functions;
     re2c:define:YYFILL:naked = 1;

It does three things:

After applying the patch, I have the following output:

$ re2c test.re -o test.c -T --case-ranges && gcc -Wall -g3 test.c -o test && ./test
Shift 64 bytes.
Token: 0
ns - as pos: 7
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 14
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 8
ne - ns pos: 4
Token: 1
Token: -1
Shift 52 bytes.
ns - as pos: 9
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 0
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 6
ne - ns pos: 4
Token: 1
Token: -1
Shift 59 bytes.
ns - as pos: 7
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 10
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 2
ne - ns pos: 4
Token: 1
Token: -1
ns - as pos: 8
ne - ns pos: 4
Token: 1
End of buffer.
skvadrik commented 3 years ago

And thank you for raising this issue, I now realize there is a gap in re2c documentation for tags, as none of the submatch extraction examples use YYFILL, so it's hard to guess how tag variables should be handled by it.

There are some examples that use tags with YYFILL, but they are well hidden: https://re2c.org/examples/c/submatch/example_uri_rfc3986.html

Also note that if some of the tags are nested inside of alternative or repetition subexpression, and can be NULL, then /*!stags:re2c format = "if (in->@@) in->@@ -= free;"; */ should be used in YYFILL instead of /*!stags:re2c format = "in->@@ -= free;"; */, otherwise the NULL-valued tags would be shifted as well, which is not the right thing to do. You may want to insert the check anyway just to be on the safe side, if in the future you add some nested tags. It should not be a performance problem unless YYFILL is called too often (which shouldn't be the case with a reasonably sized buffer).

scossu commented 3 years ago

@skvadrik Thanks very much for the quick reply. I will follow your example.

And yes, an additional note in the documentation would be very helpful. I will leave the issue open for you in case you want to attach a PR to it. Thanks.

scossu commented 3 years ago

Thanks, this works now. By the way, it may be worth adding

/*!stags:re2c format = "in->@@ = NULL; "; */

in the init() function, otherwise Valgrind will complain with "Conditional jump or move depends on uninitialised value(s)" in the first call to fill().

skvadrik commented 3 years ago

True, thanks!

skvadrik commented 3 years ago

To clarify, you can initialize them to any value. It is needed only because YYFILL may use them before they are set (so, only to get rid of Valgrind's warning). Aside from that initialization is not needed, as all s-tags of a given rule are either set to a valid input position, or NULL by the time semantic action is executed.

skvadrik commented 3 years ago

I've updated the docs and added an example with YYFILL: https://re2c.org/manual/manual_c.html#submatch-extraction.

Closing the bug.