lark-parser / lark

Lark is a parsing toolkit for Python, built with a focus on ergonomics, performance and modularity.
MIT License
4.86k stars 413 forks source link

StandAlone Parsers return different ASTs with same DSL input #349

Closed simplelife963 closed 5 years ago

simplelife963 commented 5 years ago
from lark.indenter import Indenter

from python_parser import Lark_StandAlone

class PythonIndenter(Indenter):
    """Indenter settings passed as the ``postlex`` stage of the parser."""
    NL_type = '_NEWLINE'
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8

# Input under test; the leading and trailing blank lines are part of the string.
simple_stmts = """
a=1
b=2

"""
# The same keyword arguments (and the same PythonIndenter instance) are used
# for both parsers below.
kwargs = dict(postlex=PythonIndenter())

# First instance built from the generated stand-alone module.
parser1 = Lark_StandAlone(**kwargs)
print(parser1.parse(simple_stmts))

# Second instance built from the same module, parsing the same input.
parser2 = Lark_StandAlone(**kwargs)
print(parser2.parse(simple_stmts))

Both parsers are instantiated from the same Lark_StandAlone class and parse the same string; however, they return two different ASTs, shown below. Could you please check this for me? Thanks.

Tree(file_input, [Tree(expr_stmt, [Tree(var, [Token(NAME, 'a')]), Tree(number, [Token(DEC_NUMBER, '1')])]), Tree(expr_stmt, [Tree(var, [Token(NAME, 'b')]), Tree(number, [Token(DEC_NUMBER, '2')])])])

Tree(_cb4_NonTerminal('file_input'), [Tree(_cb448_NonTerminal('__anon_star_0'), [Tree(_cb448_NonTerminal('__anon_star_0'), [Tree(_cb447_NonTerminal('__anon_star_0'), [Token(_NEWLINE, '\n')]), Tree(_cb141_NonTerminal('stmt'), [Tree(_cb146_NonTerminal('simple_stmt'), [Tree(_cb150_NonTerminal('small_stmt'), [Tree(_cb157_NonTerminal('expr_stmt'), [Tree(_cb162_NonTerminal('testlist_star_expr'), [Tree(_cb263_NonTerminal('test'), [Tree(_cb272_NonTerminal('or_test'), [Tree(_cb274_NonTerminal('and_test'), [Tree(_cb277_NonTerminal('not_test'), [Tree(_cb278_NonTerminal('comparison'), [Tree(_cb281_NonTerminal('expr'), [Tree(_cb284_NonTerminal('xor_expr'), [Tree(_cb286_NonTerminal('and_expr'), [Tree(_cb287_NonTerminal('shift_expr'), [Tree(_cb289_NonTerminal('arith_expr'), [Tree(_cb292_NonTerminal('term'), [Tree(_cb293_NonTerminal('factor'), [Tree(_cb318_NonTerminal('power'), [Tree(_cb320_NonTerminal('await_expr'), [Tree(_cb323_NonTerminal('atom_expr'), [Tree(_cb338_NonTerminal('atom'), [Token(NAME, 'a')])])])])])])])])])])])])])])])])]), Tree(_cb463_NonTerminal('__anon_star_6'), [Token(EQUAL, '='), Tree(_cb162_NonTerminal('testlist_star_expr'), [Tree(_cb263_NonTerminal('test'), [Tree(_cb272_NonTerminal('or_test'), [Tree(_cb274_NonTerminal('and_test'), [Tree(_cb277_NonTerminal('not_test'), [Tree(_cb278_NonTerminal('comparison'), [Tree(_cb281_NonTerminal('expr'), [Tree(_cb284_NonTerminal('xor_expr'), [Tree(_cb286_NonTerminal('and_expr'), [Tree(_cb287_NonTerminal('shift_expr'), [Tree(_cb289_NonTerminal('arith_expr'), [Tree(_cb292_NonTerminal('term'), [Tree(_cb293_NonTerminal('factor'), [Tree(_cb318_NonTerminal('power'), [Tree(_cb320_NonTerminal('await_expr'), [Tree(_cb323_NonTerminal('atom_expr'), [Tree(_cb340_NonTerminal('atom'), [Tree(_cb440_NonTerminal('number'), [Token(DEC_NUMBER, '1')])])])])])])])])])])])])])])])])])])])])]), Token(_NEWLINE, '\n')])])]), Tree(_cb141_NonTerminal('stmt'), [Tree(_cb146_NonTerminal('simple_stmt'), [Tree(_cb150_NonTerminal('small_stmt'), 
[Tree(_cb157_NonTerminal('expr_stmt'), [Tree(_cb162_NonTerminal('testlist_star_expr'), [Tree(_cb263_NonTerminal('test'), [Tree(_cb272_NonTerminal('or_test'), [Tree(_cb274_NonTerminal('and_test'), [Tree(_cb277_NonTerminal('not_test'), [Tree(_cb278_NonTerminal('comparison'), [Tree(_cb281_NonTerminal('expr'), [Tree(_cb284_NonTerminal('xor_expr'), [Tree(_cb286_NonTerminal('and_expr'), [Tree(_cb287_NonTerminal('shift_expr'), [Tree(_cb289_NonTerminal('arith_expr'), [Tree(_cb292_NonTerminal('term'), [Tree(_cb293_NonTerminal('factor'), [Tree(_cb318_NonTerminal('power'), [Tree(_cb320_NonTerminal('await_expr'), [Tree(_cb323_NonTerminal('atom_expr'), [Tree(_cb338_NonTerminal('atom'), [Token(NAME, 'b')])])])])])])])])])])])])])])])])]), Tree(_cb463_NonTerminal('__anon_star_6'), [Token(EQUAL, '='), Tree(_cb162_NonTerminal('testlist_star_expr'), [Tree(_cb263_NonTerminal('test'), [Tree(_cb272_NonTerminal('or_test'), [Tree(_cb274_NonTerminal('and_test'), [Tree(_cb277_NonTerminal('not_test'), [Tree(_cb278_NonTerminal('comparison'), [Tree(_cb281_NonTerminal('expr'), [Tree(_cb284_NonTerminal('xor_expr'), [Tree(_cb286_NonTerminal('and_expr'), [Tree(_cb287_NonTerminal('shift_expr'), [Tree(_cb289_NonTerminal('arith_expr'), [Tree(_cb292_NonTerminal('term'), [Tree(_cb293_NonTerminal('factor'), [Tree(_cb318_NonTerminal('power'), [Tree(_cb320_NonTerminal('await_expr'), [Tree(_cb323_NonTerminal('atom_expr'), [Tree(_cb340_NonTerminal('atom'), [Tree(_cb440_NonTerminal('number'), [Token(DEC_NUMBER, '2')])])])])])])])])])])])])])])])])])])])])]), Token(_NEWLINE, '\n\n')])])])])
erezsh commented 5 years ago

Are you using the latest version (v0.7.0)?

Can you try it with the latest commit on master? I just made a big change to how the standalone generator works.

simplelife963 commented 5 years ago

I re-generated the stand-alone parser using the latest commit on master, and it solved my problem and works perfectly. Much appreciated!

simplelife963 commented 5 years ago

Update: it failed to parse 'if' statements. For example:

simple_stmts = """
if a > 2:
    a = 1

"""

The error message:

UnexpectedCharacters: No terminal defined for 'a' at line 2 col 4
if a > 2:
Expecting: {'__ANON_16', '__ANON_4', 'FROM', '__ANON_1', 'OR', 'STAR', '__IGNORE_1', '__ANON_11', '__ANON_6', '__ANON_5', 'SLASH', '__ANON_9', 'MINUS', 'LSQB', '__ANON_22', 'ASYNC', 'FOR', '__ANON_18', 'COMMA', 'RPAR', '__ANON_2', 'AT', '__ANON_13', '__IGNORE_0', 'IS', 'IF', 'DOT', 'LESSTHAN', 'IN', '__ANON_17', 'SEMICOLON', 'COMMENT', '__ANON_21', '__ANON_12', 'MORETHAN', '__ANON_8', 'AS', '__ANON_19', 'RSQB', 'AMPERSAND', 'LPAR', 'ELSE', '__ANON_14', 'CIRCUMFLEX', 'AND', '__ANON_15', 'NOT', '__ANON_7', 'RBRACE', 'EQUAL', '__ANON_10', 'PLUS', 'VBAR', 'PERCENT', '__ANON_3', '_NEWLINE', 'COLON', '__ANON_20'}
erezsh commented 5 years ago

Can you post a minimal example that works with a Lark instance, but fails as a standalone?

simplelife963 commented 5 years ago
from lark.indenter import Indenter

# v001.py is generated by using the generator in version 0.7.0
from v001 import Lark_StandAlone as LS1

# v002.py is generated by using the latest commit on master
from v002 import Lark_StandAlone as LS2

class PythonIndenter(Indenter):
    """Indenter settings passed as the ``postlex`` stage of the parsers.

    Only configuration attributes are overridden here; all behaviour is
    inherited from ``lark.indenter.Indenter``.
    """
    # Newline token type (shows up as ``_NEWLINE`` tokens in the parse output).
    NL_type = '_NEWLINE'
    # Token types of opening and closing brackets.
    # NOTE(review): presumably indentation is not tracked between these -
    # confirm against lark.indenter.Indenter.
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
    # Token types for indent and dedent (``_INDENT`` / ``_DEDENT`` in the output).
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    # NOTE(review): presumably the number of columns one tab counts as - confirm.
    tab_len = 8

# Input under test, spelled out line by line so that the leading and trailing
# newlines (which are part of the parsed text) are explicit.
simple_stmts = (
    "\n"
    "if a > 2:\n"
    "    a = 1\n"
    "\n"
)

# Keyword arguments shared by every parser constructed in this script.
kwargs = {'postlex': PythonIndenter()}

def run_ls1():
    """Parse ``simple_stmts`` with a new ``LS1`` parser and print the tree."""
    parser = LS1(**kwargs)
    tree = parser.parse(simple_stmts)
    print(tree)

def run_ls2():
    """Parse ``simple_stmts`` with a new ``LS2`` parser and print the tree."""
    parser = LS2(**kwargs)
    tree = parser.parse(simple_stmts)
    print(tree)

def run_ls1_twice():
    """Call ``run_ls1`` two times in a row so both printed trees can be compared."""
    for _ in range(2):
        run_ls1()

if __name__ == '__main__':
    # Uncomment one of the calls below to reproduce the corresponding behaviour.

    # v0.7.0 stand-alone parser, used once: parses correctly
    run_ls1()

    # v0.7.0 stand-alone parser, used twice: the two printed ASTs differ
    # run_ls1_twice()

    # stand-alone parser generated from latest master: raises UnexpectedCharacters
    # run_ls2()

run_ls1() returns

Tree(file_input, [Tree(compound_stmt, [Tree(if_stmt, [Tree(comparison, [Tree(var, [Token(NAME, 'a')]), Token(MORETHAN, '>'), Tree(number, [Token(DEC_NUMBER, '2')])]), Tree(suite, [Tree(expr_stmt, [Tree(var, [Token(NAME, 'a')]), Tree(number, [Token(DEC_NUMBER, '1')])])])])])])

run_ls1_twice() returns

Tree(file_input, [Tree(compound_stmt, [Tree(if_stmt, [Tree(comparison, [Tree(var, [Token(NAME, 'a')]), Token(MORETHAN, '>'), Tree(number, [Token(DEC_NUMBER, '2')])]), Tree(suite, [Tree(expr_stmt, [Tree(var, [Token(NAME, 'a')]), Tree(number, [Token(DEC_NUMBER, '1')])])])])])])
Tree(_cb4_NonTerminal('file_input'), [Tree(_cb448_NonTerminal('__anon_star_0'), [Tree(_cb447_NonTerminal('__anon_star_0'), [Token(_NEWLINE, '\n')]), Tree(_cb142_NonTerminal('stmt'), [Tree(_cb237_NonTerminal('compound_stmt'), [Tree(_cb244_NonTerminal('if_stmt'), [Token(IF, 'if'), Tree(_cb263_NonTerminal('test'), [Tree(_cb272_NonTerminal('or_test'), [Tree(_cb274_NonTerminal('and_test'), [Tree(_cb277_NonTerminal('not_test'), [Tree(_cb279_NonTerminal('comparison'), [Tree(_cb281_NonTerminal('expr'), [Tree(_cb284_NonTerminal('xor_expr'), [Tree(_cb286_NonTerminal('and_expr'), [Tree(_cb287_NonTerminal('shift_expr'), [Tree(_cb289_NonTerminal('arith_expr'), [Tree(_cb292_NonTerminal('term'), [Tree(_cb293_NonTerminal('factor'), [Tree(_cb318_NonTerminal('power'), [Tree(_cb320_NonTerminal('await_expr'), [Tree(_cb323_NonTerminal('atom_expr'), [Tree(_cb338_NonTerminal('atom'), [Token(NAME, 'a')])])])])])])])])])])]), Tree(_cb493_NonTerminal('__anon_star_19'), [Tree(_cb308_NonTerminal('_comp_op'), [Token(MORETHAN, '>')]), Tree(_cb281_NonTerminal('expr'), [Tree(_cb284_NonTerminal('xor_expr'), [Tree(_cb286_NonTerminal('and_expr'), [Tree(_cb287_NonTerminal('shift_expr'), [Tree(_cb289_NonTerminal('arith_expr'), [Tree(_cb292_NonTerminal('term'), [Tree(_cb293_NonTerminal('factor'), [Tree(_cb318_NonTerminal('power'), [Tree(_cb320_NonTerminal('await_expr'), [Tree(_cb323_NonTerminal('atom_expr'), [Tree(_cb340_NonTerminal('atom'), [Tree(_cb440_NonTerminal('number'), [Token(DEC_NUMBER, '2')])])])])])])])])])])])])])])])])])]), Token(COLON, ':'), Tree(_cb262_NonTerminal('suite'), [Token(_NEWLINE, '\n    '), Token(_INDENT, '    '), Tree(_cb487_NonTerminal('__anon_plus_16'), [Tree(_cb141_NonTerminal('stmt'), [Tree(_cb146_NonTerminal('simple_stmt'), [Tree(_cb150_NonTerminal('small_stmt'), [Tree(_cb157_NonTerminal('expr_stmt'), [Tree(_cb162_NonTerminal('testlist_star_expr'), [Tree(_cb263_NonTerminal('test'), [Tree(_cb272_NonTerminal('or_test'), [Tree(_cb274_NonTerminal('and_test'), 
[Tree(_cb277_NonTerminal('not_test'), [Tree(_cb278_NonTerminal('comparison'), [Tree(_cb281_NonTerminal('expr'), [Tree(_cb284_NonTerminal('xor_expr'), [Tree(_cb286_NonTerminal('and_expr'), [Tree(_cb287_NonTerminal('shift_expr'), [Tree(_cb289_NonTerminal('arith_expr'), [Tree(_cb292_NonTerminal('term'), [Tree(_cb293_NonTerminal('factor'), [Tree(_cb318_NonTerminal('power'), [Tree(_cb320_NonTerminal('await_expr'), [Tree(_cb323_NonTerminal('atom_expr'), [Tree(_cb338_NonTerminal('atom'), [Token(NAME, 'a')])])])])])])])])])])])])])])])])]), Tree(_cb463_NonTerminal('__anon_star_6'), [Token(EQUAL, '='), Tree(_cb162_NonTerminal('testlist_star_expr'), [Tree(_cb263_NonTerminal('test'), [Tree(_cb272_NonTerminal('or_test'), [Tree(_cb274_NonTerminal('and_test'), [Tree(_cb277_NonTerminal('not_test'), [Tree(_cb278_NonTerminal('comparison'), [Tree(_cb281_NonTerminal('expr'), [Tree(_cb284_NonTerminal('xor_expr'), [Tree(_cb286_NonTerminal('and_expr'), [Tree(_cb287_NonTerminal('shift_expr'), [Tree(_cb289_NonTerminal('arith_expr'), [Tree(_cb292_NonTerminal('term'), [Tree(_cb293_NonTerminal('factor'), [Tree(_cb318_NonTerminal('power'), [Tree(_cb320_NonTerminal('await_expr'), [Tree(_cb323_NonTerminal('atom_expr'), [Tree(_cb340_NonTerminal('atom'), [Tree(_cb440_NonTerminal('number'), [Token(DEC_NUMBER, '1')])])])])])])])])])])])])])])])])])])])])]), Token(_NEWLINE, '\n\n')])])]), Token(_DEDENT, '')])])])])])])

run_ls2() returns

v002.UnexpectedCharacters: No terminal defined for 'a' at line 2 col 4

if a > 2:

Expecting: {'OR', 'AT', '__ANON_1', '__ANON_4', 'DOT', '__ANON_19', 'FOR', 'COMMENT', 'IS', '__ANON_14', '__ANON_3', '__ANON_10', '__ANON_20', 'ASYNC', 'CIRCUMFLEX', 'COLON', 'PERCENT', 'RSQB', '__ANON_15', 'ELSE', 'IN', 'EQUAL', '__ANON_17', 'MINUS', 'PLUS', 'COMMA', 'FROM', '_NEWLINE', 'AND', '__ANON_5', '__ANON_9', '__ANON_11', 'SLASH', 'LESSTHAN', 'STAR', '__ANON_12', 'IF', 'RPAR', '__IGNORE_1', '__ANON_6', 'RBRACE', 'MORETHAN', '__ANON_7', '__ANON_16', '__ANON_22', 'AS', '__ANON_8', 'LPAR', '__IGNORE_0', 'AMPERSAND', 'LSQB', 'NOT', '__ANON_2', 'SEMICOLON', '__ANON_21', '__ANON_18', '__ANON_13', 'VBAR'}

Thanks Erezsh

simplelife963 commented 5 years ago

We experimented and found a workaround for v0.7.0 that keeps parsing results consistent in stand-alone mode. Each time after building the AST, the program needs to remove the imported stand-alone module from the sys.modules cache and dynamically re-import it before instantiating another Lark_StandAlone object. Hopefully this workaround helps others.

from lark.indenter import Indenter
import sys
import importlib

# v001.py is generated by using the generator in version 0.7.0
# from v001 import Lark_StandAlone as LS1

class PythonIndenter(Indenter):
    """Indenter settings passed as the ``postlex`` stage of the parser.

    Only configuration attributes are overridden here; all behaviour is
    inherited from ``lark.indenter.Indenter``.
    """
    # Newline token type (shows up as ``_NEWLINE`` tokens in the parse output).
    NL_type = '_NEWLINE'
    # Token types of opening and closing brackets.
    # NOTE(review): presumably indentation is not tracked between these -
    # confirm against lark.indenter.Indenter.
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
    # Token types for indent and dedent (``_INDENT`` / ``_DEDENT`` in the output).
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    # NOTE(review): presumably the number of columns one tab counts as - confirm.
    tab_len = 8

# Input under test, spelled out line by line so that the leading and trailing
# newlines (which are part of the parsed text) are explicit.
simple_stmts = (
    "\n"
    "if a > 2:\n"
    "    a = 1\n"
    "\n"
)

# Keyword arguments passed to every Lark_StandAlone instance in this script.
kwargs = {'postlex': PythonIndenter()}

def run_ls1():
    """Parse ``simple_stmts`` with a freshly imported stand-alone parser.

    Workaround for the inconsistent results seen when a stand-alone module
    generated by v0.7.0 is used for more than one parse: the module is
    imported, used for a single parse, and then removed from ``sys.modules``
    so that the next call imports it again from scratch.

    Any exception raised while building the parser or parsing propagates to
    the caller, but the module is still removed from the cache first.
    """
    module_name = 'v001'

    # dynamically import stand-alone module
    module = importlib.import_module(module_name)
    try:
        # instantiate the Lark_StandAlone object
        parser = module.Lark_StandAlone(**kwargs)

        # parse input string
        print(parser.parse(simple_stmts))
    finally:
        # Drop the cached module even when parsing fails; otherwise the next
        # call would silently reuse the already-used module. pop() with a
        # default also tolerates the entry having been removed already.
        sys.modules.pop(module_name, None)

def run_ls1_twice():
    """Call ``run_ls1`` two times in a row so both printed trees can be compared."""
    for _ in range(2):
        run_ls1()

if __name__ == '__main__':
    # Two consecutive runs; with the re-import workaround both print the same AST.
    run_ls1_twice()

With this, the results are consistent and the parser works perfectly.

Tree(file_input, [Tree(compound_stmt, [Tree(if_stmt, [Tree(comparison, [Tree(var, [Token(NAME, 'a')]), Token(MORETHAN, '>'), Tree(number, [Token(DEC_NUMBER, '2')])]), Tree(suite, [Tree(expr_stmt, [Tree(var, [Token(NAME, 'a')]), Tree(number, [Token(DEC_NUMBER, '1')])])])])])])
Tree(file_input, [Tree(compound_stmt, [Tree(if_stmt, [Tree(comparison, [Tree(var, [Token(NAME, 'a')]), Token(MORETHAN, '>'), Tree(number, [Token(DEC_NUMBER, '2')])]), Tree(suite, [Tree(expr_stmt, [Tree(var, [Token(NAME, 'a')]), Tree(number, [Token(DEC_NUMBER, '1')])])])])])])
erezsh commented 5 years ago

I know what the problem is. I hope to fix it in the upcoming days.

erezsh commented 5 years ago

Everything should work now in the latest master. Please check and let me know!

simplelife963 commented 5 years ago

The parsing results are now consistent when using the latest code on master. We really appreciate your work, Erezsh!