Chevrotain / chevrotain

Parser Building Toolkit for JavaScript
https://chevrotain.io
Apache License 2.0
2.44k stars 199 forks source link

Having trouble making a recursive parser #2025

Closed knpwrs closed 2 months ago

knpwrs commented 2 months ago

Continuing #2017: I'm working on a parser for liquid templates. Given the following template:

{% if product.type == "Shirt" or product.type == "Shoes" %}
  This is a shirt or a pair of shoes.
{% endif %}

I get the following tokens:

{
  "errors": [],
  "groups": {},
  "tokens": [
    {
      "endColumn": 2,
      "endLine": 1,
      "endOffset": 1,
      "image": "{%",
      "startColumn": 1,
      "startLine": 1,
      "startOffset": 0,
      "tokenType": {
        "CATEGORIES": [],
        "PATTERN": /\\{%-\\?/,
        "PUSH_MODE": "tag",
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "TagStart",
        "tokenTypeIdx": 10,
      },
      "tokenTypeIdx": 10,
    },
    {
      "endColumn": 5,
      "endLine": 1,
      "endOffset": 4,
      "image": "if",
      "startColumn": 4,
      "startLine": 1,
      "startOffset": 3,
      "tokenType": {
        "CATEGORIES": [],
        "LONGER_ALT": {
          "CATEGORIES": [],
          "PATTERN": /\\[a-z_-\\]\\(\\?:\\[a-z0-9\\._-\\]\\|\\\\\\[\\(\\?:-\\?\\\\d\\+\\|"\\.\\+"\\|\\[a-z_-\\]\\+\\)\\]\\)\\*/i,
          "categoryMatches": [],
          "categoryMatchesMap": {},
          "isParent": false,
          "name": "Identifier",
          "tokenTypeIdx": 4,
        },
        "PATTERN": "if",
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "ControlIf",
        "tokenTypeIdx": 19,
      },
      "tokenTypeIdx": 19,
    },
    {
      "endColumn": 18,
      "endLine": 1,
      "endOffset": 17,
      "image": "product.type",
      "startColumn": 7,
      "startLine": 1,
      "startOffset": 6,
      "tokenType": {
        "CATEGORIES": [],
        "PATTERN": /\\[a-z_-\\]\\(\\?:\\[a-z0-9\\._-\\]\\|\\\\\\[\\(\\?:-\\?\\\\d\\+\\|"\\.\\+"\\|\\[a-z_-\\]\\+\\)\\]\\)\\*/i,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "Identifier",
        "tokenTypeIdx": 4,
      },
      "tokenTypeIdx": 4,
    },
    {
      "endColumn": 21,
      "endLine": 1,
      "endOffset": 20,
      "image": "==",
      "startColumn": 20,
      "startLine": 1,
      "startOffset": 19,
      "tokenType": {
        "CATEGORIES": [],
        "LONGER_ALT": {
          "CATEGORIES": [],
          "PATTERN": /\\[a-z_-\\]\\(\\?:\\[a-z0-9\\._-\\]\\|\\\\\\[\\(\\?:-\\?\\\\d\\+\\|"\\.\\+"\\|\\[a-z_-\\]\\+\\)\\]\\)\\*/i,
          "categoryMatches": [],
          "categoryMatchesMap": {},
          "isParent": false,
          "name": "Identifier",
          "tokenTypeIdx": 4,
        },
        "PATTERN": /,\\|:\\|==\\|!=\\|>\\|<\\|>=\\|<=\\|=\\|or\\|and\\|contains\\|liquid\\|echo\\|render\\|with\\|as\\|for\\|include/,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "Operator",
        "tokenTypeIdx": 41,
      },
      "tokenTypeIdx": 41,
    },
    {
      "endColumn": 29,
      "endLine": 1,
      "endOffset": 28,
      "image": ""Shirt"",
      "startColumn": 23,
      "startLine": 1,
      "startOffset": 22,
      "tokenType": {
        "CATEGORIES": [],
        "PATTERN": /"\\.\\+\\?"\\|'\\.\\+\\?'/,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "String",
        "tokenTypeIdx": 5,
      },
      "tokenTypeIdx": 5,
    },
    {
      "endColumn": 32,
      "endLine": 1,
      "endOffset": 31,
      "image": "or",
      "startColumn": 31,
      "startLine": 1,
      "startOffset": 30,
      "tokenType": {
        "CATEGORIES": [],
        "LONGER_ALT": {
          "CATEGORIES": [],
          "PATTERN": /\\[a-z_-\\]\\(\\?:\\[a-z0-9\\._-\\]\\|\\\\\\[\\(\\?:-\\?\\\\d\\+\\|"\\.\\+"\\|\\[a-z_-\\]\\+\\)\\]\\)\\*/i,
          "categoryMatches": [],
          "categoryMatchesMap": {},
          "isParent": false,
          "name": "Identifier",
          "tokenTypeIdx": 4,
        },
        "PATTERN": /,\\|:\\|==\\|!=\\|>\\|<\\|>=\\|<=\\|=\\|or\\|and\\|contains\\|liquid\\|echo\\|render\\|with\\|as\\|for\\|include/,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "Operator",
        "tokenTypeIdx": 41,
      },
      "tokenTypeIdx": 41,
    },
    {
      "endColumn": 45,
      "endLine": 1,
      "endOffset": 44,
      "image": "product.type",
      "startColumn": 34,
      "startLine": 1,
      "startOffset": 33,
      "tokenType": {
        "CATEGORIES": [],
        "PATTERN": /\\[a-z_-\\]\\(\\?:\\[a-z0-9\\._-\\]\\|\\\\\\[\\(\\?:-\\?\\\\d\\+\\|"\\.\\+"\\|\\[a-z_-\\]\\+\\)\\]\\)\\*/i,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "Identifier",
        "tokenTypeIdx": 4,
      },
      "tokenTypeIdx": 4,
    },
    {
      "endColumn": 48,
      "endLine": 1,
      "endOffset": 47,
      "image": "==",
      "startColumn": 47,
      "startLine": 1,
      "startOffset": 46,
      "tokenType": {
        "CATEGORIES": [],
        "LONGER_ALT": {
          "CATEGORIES": [],
          "PATTERN": /\\[a-z_-\\]\\(\\?:\\[a-z0-9\\._-\\]\\|\\\\\\[\\(\\?:-\\?\\\\d\\+\\|"\\.\\+"\\|\\[a-z_-\\]\\+\\)\\]\\)\\*/i,
          "categoryMatches": [],
          "categoryMatchesMap": {},
          "isParent": false,
          "name": "Identifier",
          "tokenTypeIdx": 4,
        },
        "PATTERN": /,\\|:\\|==\\|!=\\|>\\|<\\|>=\\|<=\\|=\\|or\\|and\\|contains\\|liquid\\|echo\\|render\\|with\\|as\\|for\\|include/,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "Operator",
        "tokenTypeIdx": 41,
      },
      "tokenTypeIdx": 41,
    },
    {
      "endColumn": 56,
      "endLine": 1,
      "endOffset": 55,
      "image": ""Shoes"",
      "startColumn": 50,
      "startLine": 1,
      "startOffset": 49,
      "tokenType": {
        "CATEGORIES": [],
        "PATTERN": /"\\.\\+\\?"\\|'\\.\\+\\?'/,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "String",
        "tokenTypeIdx": 5,
      },
      "tokenTypeIdx": 5,
    },
    {
      "endColumn": 59,
      "endLine": 1,
      "endOffset": 58,
      "image": "%}",
      "startColumn": 58,
      "startLine": 1,
      "startOffset": 57,
      "tokenType": {
        "CATEGORIES": [],
        "PATTERN": /-\\?%\\}/,
        "POP_MODE": true,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "TagEnd",
        "tokenTypeIdx": 14,
      },
      "tokenTypeIdx": 14,
    },
    {
      "endColumn": 1,
      "endLine": 2,
      "endOffset": 97,
      "image": "
  This is a shirt or a pair of shoes.
",
      "startColumn": 60,
      "startLine": 1,
      "startOffset": 59,
      "tokenType": {
        "CATEGORIES": [],
        "LINE_BREAKS": true,
        "PATTERN": /\\(\\[\\^\\{\\]\\|\\(\\{\\[\\^%\\{\\]\\)\\)\\+/,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "Text",
        "tokenTypeIdx": 11,
      },
      "tokenTypeIdx": 11,
    },
    {
      "endColumn": 2,
      "endLine": 3,
      "endOffset": 99,
      "image": "{%",
      "startColumn": 1,
      "startLine": 3,
      "startOffset": 98,
      "tokenType": {
        "CATEGORIES": [],
        "PATTERN": /\\{%-\\?/,
        "PUSH_MODE": "tag",
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "TagStart",
        "tokenTypeIdx": 10,
      },
      "tokenTypeIdx": 10,
    },
    {
      "endColumn": 8,
      "endLine": 3,
      "endOffset": 105,
      "image": "endif",
      "startColumn": 4,
      "startLine": 3,
      "startOffset": 101,
      "tokenType": {
        "CATEGORIES": [],
        "LONGER_ALT": {
          "CATEGORIES": [],
          "PATTERN": /\\[a-z_-\\]\\(\\?:\\[a-z0-9\\._-\\]\\|\\\\\\[\\(\\?:-\\?\\\\d\\+\\|"\\.\\+"\\|\\[a-z_-\\]\\+\\)\\]\\)\\*/i,
          "categoryMatches": [],
          "categoryMatchesMap": {},
          "isParent": false,
          "name": "Identifier",
          "tokenTypeIdx": 4,
        },
        "PATTERN": "endif",
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "ControlEndif",
        "tokenTypeIdx": 22,
      },
      "tokenTypeIdx": 22,
    },
    {
      "endColumn": 11,
      "endLine": 3,
      "endOffset": 108,
      "image": "%}",
      "startColumn": 10,
      "startLine": 3,
      "startOffset": 107,
      "tokenType": {
        "CATEGORIES": [],
        "PATTERN": /-\\?%\\}/,
        "POP_MODE": true,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "TagEnd",
        "tokenTypeIdx": 14,
      },
      "tokenTypeIdx": 14,
    },
    {
      "endColumn": 12,
      "endLine": 3,
      "endOffset": 109,
      "image": "
",
      "startColumn": 12,
      "startLine": 3,
      "startOffset": 109,
      "tokenType": {
        "CATEGORIES": [],
        "LINE_BREAKS": true,
        "PATTERN": /\\(\\[\\^\\{\\]\\|\\(\\{\\[\\^%\\{\\]\\)\\)\\+/,
        "categoryMatches": [],
        "categoryMatchesMap": {},
        "isParent": false,
        "name": "Text",
        "tokenTypeIdx": 11,
      },
      "tokenTypeIdx": 11,
    },
  ],
}

I am trying to make a parser for these tokens. I find it intuitive to consider the entire text a "document," and the body of the if statement to be a document in its own right. The concrete syntax tree should resemble this pseudocode: doc(if(expression(product.type == "Shirt" or product.type == "Shoes"), doc("This is a shirt or a pair of shoes."))).

So far I've put the following together:

// CST parser for Liquid templates, built on Chevrotain's CstParser.
// NOTE(review): the `public` modifiers imply TypeScript. `allTokens` and the
// token types referenced below (TextToken, TagStart, ControlIf, ControlEndif,
// TagEnd, Identifier, Operator, RangeToken, Pipe) as well as the rules
// `this.primitive` and `this.filterInstance` are defined elsewhere and are
// not shown in this snippet.
class LiquidParser extends CstParser {
  constructor() {
    super(allTokens)
    // Must be invoked exactly once, after all RULE definitions, so
    // Chevrotain can compute lookahead functions and validate the grammar.
    this.performSelfAnalysis()
  }

  // Top-level rule: a document is a sequence of text chunks and tags.
  // Reused recursively as the body of a control tag (see controlIf), which
  // is what should make doc(if(..., doc(...))) nesting possible.
  public doc = this.RULE('doc', () => {
    this.MANY({
      DEF: () => {
        this.OR([
          { ALT: () => this.CONSUME(TextToken) },
          // { ALT: () => this.SUBRULE(this.object) },
          // { ALT: () => this.SUBRULE(this.controlFor) },
          { ALT: () => this.SUBRULE(this.controlIf) },
        ])
      },
    })
  })

  // Matches: {% if <expression> %} <doc> {% endif %}
  // CONSUME2 is required for the second TagStart/TagEnd because Chevrotain
  // distinguishes repeated consumptions of the same token type within one
  // rule by an explicit occurrence index.
  public controlIf = this.RULE('controlIf', () => {
    this.CONSUME(TagStart)
    this.CONSUME(ControlIf)
    this.SUBRULE(this.expression)
    this.CONSUME(TagEnd)

    // Recursive body — the nested doc's MANY must stop (not error) when it
    // sees the [TagStart, ControlEndif] sequence that closes this rule.
    this.SUBRULE(this.doc)

    this.CONSUME2(TagStart)
    this.CONSUME(ControlEndif)
    this.CONSUME2(TagEnd)
  })

  // An expression is one or more operands separated by Operator tokens,
  // optionally followed by a pipe-separated filter chain.
  public expression = this.RULE('expression', () => {
    this.MANY_SEP1({
      SEP: Operator,
      DEF: () => {
        this.OR([
          { ALT: () => this.CONSUME(Identifier) },
          // NOTE(review): if `primitive` can also start with Identifier or
          // String, these alternatives may be ambiguous — verify with
          // Chevrotain's grammar validation output.
          { ALT: () => this.SUBRULE(this.primitive) },
          { ALT: () => this.CONSUME(RangeToken) },
        ])
      },
    })

    this.OPTION(() => {
      this.CONSUME(Pipe)
      this.MANY_SEP2({
        SEP: Pipe,
        DEF: () => {
          this.SUBRULE(this.filterInstance)
        },
      })
    })
  })
}
// NOTE(review): per the edit at the end of the issue, the tokens are being
// "passed in" to the doc method. Chevrotain rule methods do not take the
// token stream as an argument — the tokens must be assigned via
// `parser.input = lexResult.tokens` before calling `parser.doc()`.
// TODO confirm this is the invocation being used, as it would explain the
// NoViableAltException independently of the grammar itself.

Looking at the JSON examples in this repository, it seems to me that recursive parsing should be possible (I can see object -> objectItem -> value -> object). When I run my parser, however, I get the following:

NoViableAltException: Expecting: one of these possible Token sequences:
  1. [Text]
  2. [TagStart, ControlIf]
but found: '{%'

It seems to me like chevrotain is getting into the sub document (the body of the if statement), and then not exiting back up to consume the TagStart, ControlEndif, and TagEnd tokens for the controlIf rule.

I've tried making a separate sub-document rule, but it doesn't work — and even if it did, I think I would have to create as many sub-document rules as there are levels of nesting I plan to support.

Am I doing something wrong? Are there any tips or tricks I can use to debug or fix this?

EDIT: It's probably worth noting that I'm calling the doc method on the parser and passing in the aforementioned tokens.