skvadrik / re2c

Lexer generator for C, C++, Go and Rust.
https://re2c.org
Other
1.07k stars 169 forks source link

mtag / stag automatic variable generation collision #385

Closed cyanogilvie closed 2 years ago

cyanogilvie commented 2 years ago

I'm trying to parse HTTP messages with a mix of conditions, stags and mtags, and I seem to be hitting an issue where the generated yyt* variables for the stag and mtag values collide, so:

    /*!stags:re2c format = "\tunsigned char*\t\t@@;\n"; */
    /*!mtags:re2c format = "\tstruct mtag\t\t\t\t*@@;\n"; */

produces:

#line 44 "mtagbug.c"
    unsigned char*      yyt1;
    unsigned char*      yyt2;
#line 34 "mtagbug.re"

#line 50 "mtagbug.c"
    struct mtag             *yyt1;
    struct mtag             *yyt10;
    struct mtag             *yyt2;
    struct mtag             *yyt3;
    struct mtag             *yyt4;
    struct mtag             *yyt5;
    struct mtag             *yyt6;
    struct mtag             *yyt7;
    struct mtag             *yyt8;
    struct mtag             *yyt9;
#line 35 "mtagbug.re"

leading to compile errors like:

mtagbug.c:50:18: error: duplicate member ‘yyt1’
   50 |  struct mtag    *yyt1;
      |                  ^~~~
mtagbug.c:52:18: error: duplicate member ‘yyt2’
   52 |  struct mtag    *yyt2;
      |                  ^~~~

The issue seems to be related to conditions - if the condition <header> is changed to <media_type> in the following, the collision does not occur:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*!types:re2c*/

struct mtag {
    struct mtag*    prev;
    int             dist;
};

struct mtagpool {
    struct mtag*        head;
    struct mtag*        next;
    struct mtag*        last;
};

enum con_status {
    CON_STATUS_WAITING,
    CON_STATUS_DONE,
    CON_STATUS_ERROR
};

#define CON_STATE_SIZE      (4096-32)
struct con_state {
    unsigned char*      cur;
    unsigned char*      mar;
    unsigned char*      tok;
    unsigned char*      lim;
    int                 cond;
    int                 state;
    struct mtagpool     mtp;
    /*!stags:re2c format = "\tunsigned char*\t\t@@;\n"; */
    /*!mtags:re2c format = "\tstruct mtag\t\t\t\t*@@;\n"; */
    size_t              buf_size;
    unsigned char*      buf;
    unsigned char       static_buf[];
};
#define CON_READ_BUF_LEN    (CON_STATE_SIZE - sizeof(struct con_state) - 1) // -1: ensure a sentinel at the end of buf

static void mtagpool_init(struct mtagpool* mtp) //<<<
{
    static const unsigned   size = 1024 * 1024;

    mtp->head = (struct mtag*)malloc(size * sizeof(struct mtag));
    mtp->next = mtp->head;
    mtp->last = mtp->head + size;
}

static void mtagpool_free(struct mtagpool* mtp)
{
    free(mtp->head);
    mtp->head = mtp->next = mtp->last = NULL;
}

static struct mtag* mtagpool_next(struct mtagpool* mtp)
{
    unsigned        size;
    struct mtag*    head;

    if (mtp->next < mtp->last) return mtp->next++;

    size = mtp->last - mtp->head;
    head = (struct mtag*)malloc(2 * size * sizeof(struct mtag));
    memcpy(head, mtp->head, size * sizeof(struct mtag));
    free(mtp->head);
    mtp->head = head;
    mtp->next = head + size;
    mtp->last = head + size * 2;

    return mtp->next++;
}

static void mtag(struct mtag** pmt, const unsigned char* b, const unsigned char* t, struct mtagpool* mtp)
{
    struct mtag*    mt = mtagpool_next(mtp);
    mt->prev = *pmt;
    mt->dist = t - b;
    *pmt = mt;
}

static enum con_status parse_con_req(struct con_state* c)
{
    unsigned int        yych, yyaccept;
    const unsigned char *l1, *l2;
    struct mtag         *f1, *f2, *p1, *p2, *p3, *p4;

    /*!getstate:re2c*/
    /*!re2c
        re2c:api:style                  = free-form;
        re2c:eof                        = 0;
        re2c:flags:tags                 = 1;
        re2c:tags:expression            = "c->@@";
        re2c:define:YYCTYPE             = "unsigned char";
        re2c:define:YYCURSOR            = "c->cur";
        re2c:define:YYMARKER            = "c->mar";
        re2c:define:YYLIMIT             = "c->lim";
        re2c:define:YYGETSTATE          = "c->state";
        re2c:define:YYSETSTATE          = "c->state = @@;";
        re2c:define:YYFILL              = "return CON_STATUS_WAITING;";
        re2c:define:YYGETCONDITION      = "c->cond";
        re2c:define:YYSETCONDITION      = "c->cond = @@;";
        re2c:define:YYMTAGP             = "mtag(&@@, c->tok, c->cur, &c->mtp);";
        re2c:define:YYMTAGN             = "mtag(&@@, c->tok, NULL, &c->mtp);";

        crlf        = '\r\n';
        sp          = ' ';
        htab        = '\t';
        ows         = (sp | htab)*;
        digit       = [0-9];
        alpha       = [a-zA-Z];
        vchar       = [\x1f-\x7e];
        tchar       = [-!#$%&'*+.^_`|~] | digit | alpha;

        obs_fold                = #f1 crlf (sp | htab)+ #f2;
        obs_text                = [\x80-\xff];
        field_name              = tchar+;
        field_vchar             = vchar | obs_text;
        field_content           = field_vchar ((sp | htab)+ field_vchar)?;
        field_value_folded      = (field_content* obs_fold field_content*)+;
        header_field_folded     = field_value_folded ows;
        token                   = tchar+;
        qdtext
            = htab
            | sp
            | [\x21-\x5B\x5D-\x7E] \ '"'
            | obs_text;
        quoted_pair         = '\\' ( htab | sp | vchar | obs_text );
        quoted_string       = '"' ( qdtext | quoted_pair )* '"';
        parameter           = #p1 token #p2 '=' #p3 ( token | quoted_string ) #p4;
        media_type          = @l1 token '/' token @l2 ( ows ';' ows parameter )*;

        <media_type> media_type ows crlf    {
            struct mtag*    pname_start = p1;
            struct mtag*    pname_end   = p2;
            struct mtag*    pval_start  = p3;
            struct mtag*    pval_end    = p4;

            printf("media type: %.*s\n", (int)(l2-l1), l1);

            while (pname_start) {
                printf("\t(%.*s) = (%.*s)\n",
                    pname_end->dist - pname_start->dist, c->tok + pname_start->dist,
                    pval_end->dist - pval_start->dist, c->tok + pval_start->dist);

                pname_start = pname_start->prev;
                pname_end = pname_end->prev;
                pval_start = pval_start->prev;
                pval_end = pval_end->prev;
            }

            return CON_STATUS_DONE;
        }

        <header> header_field_folded crlf   {
            struct mtag*    fold_start  = f1;
            struct mtag*    fold_end    = f2;

            while (fold_start) {
                memset(c->tok + fold_start->dist, ' ', fold_end->dist - fold_start->dist);
                fold_start  = fold_start->prev;
                fold_end    = fold_end->prev;
            }

            return CON_STATUS_DONE;
        }

        <*> $           { return CON_STATUS_ERROR; }
        <*> *           { return CON_STATUS_ERROR; }
    */
}

int feed(struct con_state* c, const unsigned char* chunk, int len)
{
    const size_t        shift = c->tok - c->buf;
    const size_t        free = c->buf_size - (c->lim - c->tok);

    if (free < len) {
        fprintf(stderr, "Token too long for receive buffer: %ld\n", c->buf_size);
        return 1;
    }

    if (shift) {
        memmove(c->buf, c->tok, c->buf_size - shift);
        c->lim -= shift;
        c->cur -= shift;
        c->mar -= shift;
        c->tok -= shift;
        /*!stags:re2c format = "\t\t\tif (c->@@) c->@@ -= shift;\n"; */
    }

    memcpy(c->lim, chunk, len);

    c->lim += len;
    c->lim[0] = 0;  // Append sentinel

    return 0;
}

int main(int argc, char** argv)
{
    int                 rc = 0;
    struct con_state*   c = NULL;
    int                 i;
    static const char*  chunks[] = {
        "ap",
        "plication/j",
        "son;",
        " charset=\"",
        "utf\\\"-8\"\r",
        "\n",
        "",
        NULL
    };

    c = malloc(CON_STATE_SIZE);
    c->buf = c->static_buf;
    c->cur = c->mar = c->tok = c->lim = c->buf + CON_READ_BUF_LEN;
    c->lim[0]           = 0;    // sentinel
    c->state            = -1;
    c->buf_size         = CON_READ_BUF_LEN;
    /*!stags:re2c format = "\tc->@@ = 0;\n"; */
    /*!mtags:re2c format = "\tc->@@ = NULL;\n"; */
    mtagpool_init(&c->mtp);

    for (i=0; chunks[i]; i++) {
        rc = feed(c, (const unsigned char*)chunks[i], strlen(chunks[i]));
        if (rc) goto finally;

        switch (parse_con_req(c)) {
            case CON_STATUS_WAITING:
                printf("waiting\n");
                continue;
            case CON_STATUS_DONE:
                printf("done\n");
                break;
            default:
                rc = 1;
                goto finally;
        }
    }

finally:
    if (c) {
        mtagpool_free(&c->mtp);
        free(c);
        c = NULL;
    }
    if (rc) fprintf(stderr, "Error exit: %d\n", rc);
    return rc;
}

I'm compiling with the switches --conditions --case-ranges -W -Wno-nondeterministic-tags --storable-state using the latest version from the master branch: 68e1ab716

I'm new to using mtags, so I'm not sure if my mental model is just wrong, or I'm hitting a case that others haven't before (mixing conditions, stags and mtags)?

skvadrik commented 2 years ago

Thanks for a complete example. Indeed re2c generates identically named tag variables for different conditions, and this causes collision if one of the variables is for an s-tag and the other one for an m-tag. You are the first to hit this issue!

I pushed a fix https://github.com/skvadrik/re2c/commit/7c6b5c955eae45c4b4c4bfd9f52efa9bbd09e2e3 that makes s-tag and m-tag variables non-overlapping by adding an "m" affix to the latter. This still allows sharing tag variables of the same kind between conditions, which saves one variable in your case.

With this fix your example compiles and runs like this:

waiting
waiting
waiting
waiting
waiting
media type: application/json
    (charset) = ("utf\"-8")
done
Error exit: 1

It seems to lex the input correctly. I looked a bit why it returns 1 but I think there is an rc = 0 missing in the CON_STATUS_DONE case.

cyanogilvie commented 2 years ago

Thanks for the fast response! That's great, it works perfectly for me now.

Yeah, the CON_STATUS_DONE handling is wrong, the example code was trimmed to the point of being meaningless, just enough to exercise the issue. The project it comes from has it deeply embedded in the IO layer (I would put it in the kernel using eBPF and directly lex the socket packet buffers if the validator would allow loops that can be proven to terminate).

re2c is my new favourite thing - with better than cycle-per-byte throughput nearly everything is starting to look like a target for it.