lark-parser / lark

Lark is a parsing toolkit for Python, built with a focus on ergonomics, performance and modularity.
MIT License

Here is what I'm trying to do with regexes, and why I should use Lark #484

Closed enjoysmath closed 5 years ago

enjoysmath commented 5 years ago
import re
from lark import Transformer, Lark

class Var:
   def __init__(self, s:str, type:str=None):
      self._str = s
      self._type = type

   def __str__(self):
      return self._str

   def type(self):
      return self._type

class TextString:
   def __init__(self, s:str):
      self._str = s

   def __str__(self):
      return self._str

class LaTeXString:
   def __init__(self, s:str):
      self._str = s

   def __str__(self):
      return self._str

class VarParser:
   space_regex = re.compile(r"\s\s+")
   text_regex = re.compile(r"(\\(text|textbf|operatorname){[^}]+})")
   latex_regex = re.compile(r"(\$[^$]+\$|\$\$[^$]+\$\$)")  
   var_regex = re.compile(
      r"[a-zA-Z]|\\alpha|\\beta|\\gamma|\\delta|\\epsilon|\\zeta|\\eta|"
      r"\\theta|\\iota|\\kappa|\\lambda|\\mu|\\xi|\\omicron|\\pi|\\rho|"
      r"\\sigma|\\tau|\\upsilon|\\phi|\\psi|\\chi|\\omega|"
      r"\\Alpha|\\Beta|\\Gamma|\\Delta|\\Epsilon|\\Zeta|\\Eta|\\Theta|"
      r"\\Iota|\\Kappa|\\Lambda|\\Mu|\\Xi|\\Omicron|\\Pi|\\Rho|\\Sigma|"
      r"\\Tau|\\Upsilon|\\Phi|\\Psi|\\Chi|\\Omega")

   def to_single_space(self, s):
      return " ".join(s.split())

   def parse_latex_parts(self, s):
      s = self.to_single_space(s)      
      match_iter = self.latex_regex.finditer(s)
      parts = []
      start = 0
      for match in match_iter:
         if match.span()[0] > 0:
            parts.append(s[start : match.span()[0]])
         parts.append(LaTeXString(match.group()))
         start = match.span()[1] + 1
      if start < len(s):
         parts.append(s[start:])
      return parts

   def parse_text_parts(self, latex_parts):
      text_parts = []  # unfinished -- this is as far as I got

if __name__ == '__main__':
   while True:
      parser = VarParser()
      s = input("s=")
      parts = parser.parse_latex_parts(s)  # the full parse() never got written
      print(parts)

As you can see, the code is becoming more complex than necessary, given that Lark exists.

$\text{abc} a$ => ["\text{abc} ", Var("a")] is the result I want.

Anything surrounded by $ or $$ in MathJax is considered LaTeX. Any Latin or Greek character (e.g. \alpha) that I encounter inside dollar signs will be a variable. The exception is that \text{...} might occur inside the dollar signs, in which case nothing inside the braces should be parsed as a variable; the entire string \text{...} is left alone. The same goes for \textbf and \operatorname.
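
By the same rules, another input/output pair I would expect (hypothetical example, same notation as above) is:

$\textbf{Aut} G$ => ["\textbf{Aut} ", Var("G")]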

So as you can see, using regexes has led me to the code above, which is about the simplest way you could do it, and it still isn't finished (it doesn't run because I didn't finish it). Because of all this complexity, I think Lark suits me best. However, I've tried several times to do this in Lark and failed for one reason or another.

I will be posting the Lark version in about 30 mins. So keep this open, please.

enjoysmath commented 5 years ago

Here's the code using Lark:


from lark import Lark, Transformer
from lark.lexer import Token

class Parser:
   grammar = """
      ?start: string
      ?string: (latex_string | any_str)+
      ?latex_string: block_latex
                   | inline_latex
      ?inline_latex: "$" inner_latex+ "$"
      ?block_latex: "$$" inner_latex+ "$$"
      ?inner_latex: variable 
                  | integer
                  | text_block
                  | any_str
      ?variable: (LATIN | greek)
      ?text_block: "\\\\" TEXT_COMMAND "{" ANY_STR "}"
      integer: SIGNED_INT
      ?greek: GREEK_LOWER     -> greek
            | GREEK_UPPER     -> greek
      GREEK_LOWER: /\\\\alpha|\\\\beta|\\\\gamma|\\\\delta|\\\\epsilon|\\\\zeta|\\\\eta|\\\\theta/
                  | /\\\\iota|\\\\kappa|\\\\lambda|\\\\mu|\\\\xi|\\\\omicron|\\\\pi|\\\\rho|\\\\sigma/
                  | /\\\\tau|\\\\upsilon|\\\\phi|\\\\psi|\\\\chi|\\\\omega/
      GREEK_UPPER: /\\\\Alpha|\\\\Beta|\\\\Gamma|\\\\Delta|\\\\Epsilon|\\\\Zeta|\\\\Eta|\\\\Theta/
                 | /\\\\Iota|\\\\Kappa|\\\\Lambda|\\\\Mu|\\\\Xi|\\\\Omicron|\\\\Pi|\\\\Rho|\\\\Sigma/
                 | /\\\\Tau|\\\\Upsilon|\\\\Phi|\\\\Psi|\\\\Chi|\\\\Omega/
      TEXT_COMMAND: /text|textbf|operatorname/
      LATIN: /[a-zA-Z]/
      any_str: ANY_STR
      ANY_STR: /[^{}$]+/
      %import common.SIGNED_INT
      %import common.WS_INLINE
      %ignore WS_INLINE
   """

   def __init__(self, new=True):
      if new is None:
         new = True
      if new:
         self._transformer = None
         self._parser = None
         Parser.setup(self)

   def setup(self):      
      self._transformer = ParseTreeTransformer()
      self._parser = Lark(self.grammar)    

   def parse_to_list(self, s):
      return self._transformer.transform(self._parser.parse(s))

class ParseTreeTransformer(Transformer):
   pass

if __name__ == '__main__':
   parser = Parser()

   while True:
      s = input("s=")
      result = parser.parse_to_list(s)
      print(result)

Feel free to run it and type in "Let $G$ be a group." It will parse, except it only recognizes $G$ as an any_str and not as a variable. Therefore there's some bug in Lark that's causing any_str to take priority over my more specific rules.

Any ideas?

erezsh commented 5 years ago

It seems to work as it is:

s=Let $G$ be a group
string
  any_str       Let
  inline_latex
    variable    G
  any_str        be a group

Also, you should use raw strings. It will reduce how many backslashes you have to type.
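
For example, a small sketch with just one of your terminals (hypothetical variable names, not your full grammar):

# Without a raw string, Python eats one level of escaping:
# the four backslashes here become the two that Lark actually sees.
grammar_plain = "GREEK_LOWER: /\\\\alpha|\\\\beta/"

# With a raw string, Python leaves backslashes alone,
# so only the doubling the Lark grammar itself needs is left.
grammar_raw = r"GREEK_LOWER: /\\alpha|\\beta/"

assert grammar_plain == grammar_raw  # same grammar text either way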

enjoysmath commented 5 years ago

@erezsh I'm getting G as an any_str on my side.

My Lark install is already up-to-date. Switching to PLY now.

I'm doing the parser generators musical chairs merry-go-round.

MegaIng commented 5 years ago

This grammar should work:

  ?start: string
  ?string: (latex_string | any_str)+
  ?latex_string: block_latex
               | inline_latex
  ?inline_latex: "$" inner_latex+ "$"
  ?block_latex: "$$" inner_latex+ "$$"
  ?inner_latex: variable 
              | integer
              | text_block
              | any_str_not_var
  ?variable: (LATIN | greek)
  ?text_block: "\\\\" TEXT_COMMAND "{" ANY_STR "}"
  integer: SIGNED_INT
  ?greek: GREEK_LOWER     -> greek
        | GREEK_UPPER     -> greek
  GREEK_LOWER: /\\\\alpha|\\\\beta|\\\\gamma|\\\\delta|\\\\epsilon|\\\\zeta|\\\\eta|\\\\theta/
              | /\\\\iota|\\\\kappa|\\\\lambda|\\\\mu|\\\\xi|\\\\omicron|\\\\pi|\\\\rho|\\\\sigma/
              | /\\\\tau|\\\\upsilon|\\\\phi|\\\\psi|\\\\chi|\\\\omega/
  GREEK_UPPER: /\\\\Alpha|\\\\Beta|\\\\Gamma|\\\\Delta|\\\\Epsilon|\\\\Zeta|\\\\Eta|\\\\Theta/
             | /\\\\Iota|\\\\Kappa|\\\\Lambda|\\\\Mu|\\\\Xi|\\\\Omicron|\\\\Pi|\\\\Rho|\\\\Sigma/
             | /\\\\Tau|\\\\Upsilon|\\\\Phi|\\\\Psi|\\\\Chi|\\\\Omega/
  TEXT_COMMAND: /text|textbf|operatorname/
  LATIN: /[a-zA-Z]/
  any_str: ANY_STR
  ANY_STR: /[^{}$]+/
  any_str_not_var: ANY_STR_NOT_VAR
  ANY_STR_NOT_VAR: /[^{}$a-zA-Z]+/
  %import common.SIGNED_INT
  %import common.WS_INLINE
  %ignore WS_INLINE

Can't test it right now.
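
The only real change from your grammar is the extra catch-all used inside the dollar signs; it excludes letters, so it can no longer swallow a variable:

  ?inner_latex: variable
              | integer
              | text_block
              | any_str_not_var
  any_str_not_var: ANY_STR_NOT_VAR
  ANY_STR_NOT_VAR: /[^{}$a-zA-Z]+/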

enjoysmath commented 5 years ago

This grammar should work: [the grammar quoted from above]

Can't test it right now.

PLY is way too complicated to use. I will stick things out with Lark for a while.

Anyway, that grammar has bugs:

from lark import Lark, Transformer

class UserInputParser:      
   user_input_grammar = r'''
      ?start: string
      ?string: (latex_string | any_str)+
      ?latex_string: block_latex
         | inline_latex
      ?inline_latex: "$" inner_latex+ "$"
      ?block_latex: "$$" inner_latex+ "$$"
      ?inner_latex: variable 
         | integer
         | text_block
         | any_str_not_var
      ?variable: (LATIN | greek)
      ?text_block: "\\\\" TEXT_COMMAND "{" ANY_STR "}"
      integer: SIGNED_INT
      ?greek: GREEK_LOWER     -> greek
         | GREEK_UPPER     -> greek
      GREEK_LOWER: /\\alpha|\\beta|\\gamma|\\delta|\\epsilon|\\zeta|\\eta|\\theta/
         | /\\iota|\\kappa|\\lambda|\\mu|\\xi|\\omicron|\\pi|\\rho|\\sigma/
         | /\\tau|\\upsilon|\\phi|\\psi|\\chi|\\omega/
      GREEK_UPPER: /\\Alpha|\\Beta|\\Gamma|\\Delta|\\Epsilon|\\Zeta|\\Eta|\\Theta/
         | /\\Iota|\\Kappa|\\Lambda|\\Mu|\\Xi|\\Omicron|\\Pi|\\Rho|\\Sigma/
         | /\\Tau|\\Upsilon|\\Phi|\\Psi|\\Chi|\\Omega/
      TEXT_COMMAND: /text|textbf|operatorname/
      LATIN: /[a-zA-Z]/
      any_str: ANY_STR
      ANY_STR: /[^{}$]+/
      any_str_not_var: ANY_STR_NOT_VAR
      ANY_STR_NOT_VAR: /[^{}$a-zA-Z]+/
      %import common.SIGNED_INT
      %import common.WS_INLINE
      %ignore WS_INLINE
   '''

   def __init__(self):
      self._parser = Lark(self.user_input_grammar, parser='lalr')

   def parse_to_list(self, s:str) -> list:
      return self._parser.parse(s)

if __name__ == '__main__':
   parser = UserInputParser()

   while True:
      s = input("s=")
      result = parser.parse_to_list(s)
      print(result)

Input:
Let $G$ be a group.

Result:

  return states[state][key]

builtins.KeyError: 'ANY_STR'

During handling of the above exception, another exception occurred:

 File "C:\Users\FruitfulApproach\Desktop\BananaCats_\language\user_input_parser.py", line 49, in <module>
  result = parser.parse_to_list(s)
 File "C:\Users\FruitfulApproach\Desktop\BananaCats_\language\user_input_parser.py", line 41, in parse_to_list
  return self._parser.parse(s)
File "c:\Python36-32\Lib\site-packages\lark\lark.py", line 197, in parse
  return self.parser.parse(text)
File "c:\Python36-32\Lib\site-packages\lark\parser_frontends.py", line 37, in parse
  return self.parser.parse(token_stream)
File "c:\Python36-32\Lib\site-packages\lark\parsers\lalr_parser.py", line 71, in parse
  action, arg = get_action(token.type)
File "c:\Python36-32\Lib\site-packages\lark\parsers\lalr_parser.py", line 50, in get_action
  raise UnexpectedToken(token, expected, seq, i)

lark.common.UnexpectedToken: Unexpected token Token(ANY_STR, 'G') at line 1, column 5.
Expected: dict_keys(['variable', 'LATIN', 'integer', '__ANONSTR_1', '__anon_plus_1', 'inner_latex', 'GREEK_LOWER', 'GREEK_UPPER', 'any_str_not_var', 'ANY_STR_NOT_VAR', 'greek', 'SIGNED_INT', 'text_block'])
Context: <no context>

I will code on another piece of the puzzle (the BananaCats application) while I fix these issues here. Also, any suggestions on how I should structure this grammar are welcome. I took note of the "use raw strings" comment, and it seems to compile with a single \\ used. Thanks for that! @erezsh

MegaIng commented 5 years ago

To fix this issue, pass lexer='contextual' as an argument to Lark. (I thought that would be the default, @erezsh?)

(Btw, this currently will not correctly recognize the Greek letters.)
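
I.e., roughly (with the names from your code):

self._parser = Lark(self.user_input_grammar, parser='lalr', lexer='contextual')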

enjoysmath commented 5 years ago

To fix this issue, pass lexer='contextual' as an argument to Lark. (I thought that would be the default, @erezsh?)

(Btw, this currently will not correctly recognize the Greek letters.)

Okay, I will work on it some and post back. Thank you for helping BananaCats development!

enjoysmath commented 5 years ago
from lark import Lark, Transformer

class UserInputParser:      
   grammar = r'''
      ?start: string
      ?string: (latex_string | any_str)+
      ?latex_string: block_latex
                   | inline_latex
      ?inline_latex: "$" inner_latex+ "$"
      ?block_latex: "$$" inner_latex+ "$$"
      ?inner_latex: variable 
                  | integer
                  | text_block
                  | any_str_not_var
      variable: latin 
              | greek      
      ?text_block: TEXT_CMD "{" ANY_STR "}"
      integer: SIGNED_INT
      ?greek: GREEK_LOWER     -> greek
         | GREEK_UPPER     -> greek
      GREEK_LOWER: /\alpha|\beta|\gamma|\delta|\epsilon|\zeta|\eta|\theta/
         | /\iota|\kappa|\lambda|\mu|\xi|\omicron|\pi|\rho|\sigma/
         | /\tau|\upsilon|\phi|\psi|\chi|\omega/
      GREEK_UPPER: /\Alpha|\Beta|\Gamma|\Delta|\Epsilon|\Zeta|\Eta|\Theta/
         | /\Iota|\Kappa|\Lambda|\Mu|\Xi|\Omicron|\Pi|\Rho|\Sigma/
         | /\Tau|\Upsilon|\Phi|\Psi|\Chi|\Omega/
      TEXT_CMD: /\text|\textbf|\operatorname/
      ?latin: LATIN
      LATIN: /[a-zA-Z]/
      any_str: ANY_STR
      ANY_STR: /[^{}$]+/
      any_str_not_var: ANY_STR_NOT_VAR
      ANY_STR_NOT_VAR: /[^{}$a-zA-Z]+/
      %import common.SIGNED_INT
      %import common.WS_INLINE
      %ignore WS_INLINE
   '''

   def __init__(self):
      self._parser = Lark(self.grammar, parser='lalr', lexer='contextual')

   def parse_to_list(self, s:str) -> list:
      return self._parser.parse(s)

if __name__ == '__main__':
   parser = UserInputParser()

   while True:
      s = input("s=")
      result = parser.parse_to_list(s)
      print(result)

Compiling the grammar raises an error before the program even runs:

File "c:\Python36-32\Lib\site-packages\lark\load_grammar.py", line 327, in _fix_escaping s = literal_eval(to_eval) File "c:\Python36-32\Lib\ast.py", line 48, in literal_eval node_or_string = parse(node_or_string, mode='eval') File "c:\Python36-32\Lib\ast.py", line 35, in parse return compile(source, filename, mode, PyCF_ONLY_AST) File "", line 1, in ?

Syntax Error: (unicode error) 'unicodeescape' codec can't decode bytes in position 8-9: truncated \uXXXX escape: , line 1, pos 0

During handling of the above exception, another exception occurred:

 File "C:\Users\FruitfulApproach\Desktop\BananaCats_\language\user_inputparser.py", line 47, in parser = UserInputParser()  File "C:\Users\FruitfulApproach\Desktop\BananaCats\language\user_input_parser.py", line 40, in init self._parser = Lark(self.grammar, parser='lalr', lexer='contextual') File "c:\Python36-32\Lib\site-packages\lark\lark.py", line 156, in init tokens, self.rules, self.ignore_tokens = self.grammar.compile(lexer=bool(lexer), start=self.options.start) File "c:\Python36-32\Lib\site-packages\lark\load_grammar.py", line 490, in compile for name, (token_tree, priority) in token_defs] File "c:\Python36-32\Lib\site-packages\lark\load_grammar.py", line 490, in for name, (token_tree, priority) in token_defs] File "c:\Python36-32\Lib\site-packages\lark\tree.py", line 139, in transform tree = t.transform(tree) File "c:\Python36-32\Lib\site-packages\lark\tree.py", line 113, in transform items.append(self.transform(c) if isinstance(c, Tree) else c) File "c:\Python36-32\Lib\site-packages\lark\tree.py", line 113, in transform items.append(self.transform(c) if isinstance(c, Tree) else c) File "c:\Python36-32\Lib\site-packages\lark\tree.py", line 121, in transform return f(items) File "c:\Python36-32\Lib\site-packages\lark\utils.py", line 72, in _f return f.func(self, *args) File "c:\Python36-32\Lib\site-packages\lark\load_grammar.py", line 356, in literal return T('pattern', [_literal_to_pattern(literal)]) File "c:\Python36-32\Lib\site-packages\lark\load_grammar.py", line 345, in _literal_to_pattern s = _fix_escaping(x) File "c:\Python36-32\Lib\site-packages\lark\load_grammar.py", line 329, in _fix_escaping raise ValueError(s, e)

builtins.ValueError: ('\\tau|\\upsilon|\\phi|\\psi|\\chi|\\omega', SyntaxError("(unicode error) 'unicodeescape' codec can't decode bytes in position 8-9: truncated \\uXXXX escape", ('<unknown>', 1, 0, None)))

However, a single backslash seems to work for \alpha, \beta, etc.

MegaIng commented 5 years ago

No, it does not work correctly. The content inside of /.../ is handled (almost) as if it were between quotes in normal Python code. Therefore escape sequences like '\n' or '\b' are translated to their corresponding values. This silently works for all other sequences, and since Python ignores unknown escape sequences, you won't get any errors until you come to '\u', which expects a specific format after it. So you should just keep them doubled.
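
A rough way to see what happens, in plain Python (just mirroring the literal_eval step from your traceback, not actual Lark code):

import ast

# '\a' is a valid Python escape (the BEL character), so "\alpha" silently
# turns into '\x07lpha' instead of staying a backslash plus "alpha":
print(repr(ast.literal_eval('"\\alpha"')))

# '\u' expects four hex digits, so "\upsilon" fails already at parse time:
try:
    ast.literal_eval('"\\upsilon"')
except SyntaxError as e:
    print(e)  # (unicode error) 'unicodeescape' codec ... truncated \uXXXX escape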

erezsh commented 5 years ago

Yes, contextual lexer is the default.

But my advice is - get it to work with Earley first.
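
I.e., while you're still shaping the grammar, something like:

self._parser = Lark(self.grammar)  # parser='earley' is the default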

enjoysmath commented 5 years ago

No, it does not work correctly. The content inside of /.../ is handled (almost) as if it were between quotes in normal Python code. Therefore escape sequences like '\n' or '\b' are translated to their corresponding values. This silently works for all other sequences, and since Python ignores unknown escape sequences, you won't get any errors until you come to '\u', which expects a specific format after it. So you should just keep them doubled.

With the last example above, and \upsilon switched to \\upsilon, I get errors:

File "c:\Python36-32\Lib\site-packages\lark\lexer.py", line 230, in __init__
  lexer = lexer_by_tokens[key]

builtins.KeyError: frozenset({'TEXT_CMD', 'greek', 'LATIN', 'text_block', 'SIGNED_INT', 'GREEK_UPPER', 'integer', 'GREEK_LOWER', 'inner_latex', 'variable', '__anon_plus_1', 'latin', 'any_str_not_var', 'ANY_STR_NOT_VAR'})

During handling of the above exception, another exception occurred:

File "c:\Python36-32\Lib\site-packages\lark\lexer.py", line 188, in __init__
  re.compile(t.pattern.to_regexp())
File "c:\Python36-32\Lib\re.py", line 233, in compile
  return _compile(pattern, flags)
File "c:\Python36-32\Lib\re.py", line 301, in _compile
  p = sre_compile.compile(pattern, flags)
File "c:\Python36-32\Lib\sre_compile.py", line 562, in compile
  p = sre_parse.parse(p, flags)
File "c:\Python36-32\Lib\sre_parse.py", line 855, in parse
  p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, 0)
File "c:\Python36-32\Lib\sre_parse.py", line 416, in _parse_sub
  not nested and not items))
File "c:\Python36-32\Lib\sre_parse.py", line 502, in _parse
  code = _escape(source, this, state)
File "c:\Python36-32\Lib\sre_parse.py", line 401, in _escape
  raise source.error("bad escape %s" % escape, len(escape))

sre_constants.error: bad escape \o at position 12

During handling of the above exception, another exception occurred:

 File "C:\Users\FruitfulApproach\Desktop\BananaCats_\language\user_input_parser.py", line 47, in <module>
  parser = UserInputParser()
 File "C:\Users\FruitfulApproach\Desktop\BananaCats_\language\user_input_parser.py", line 40, in __init__
  self._parser = Lark(self.grammar, parser='lalr', lexer='contextual')
File "c:\Python36-32\Lib\site-packages\lark\lark.py", line 161, in __init__
  self.parser = self._build_parser()
File "c:\Python36-32\Lib\site-packages\lark\lark.py", line 184, in _build_parser
  return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
File "c:\Python36-32\Lib\site-packages\lark\parser_frontends.py", line 43, in __init__
  self.init_contextual_lexer(lexer_conf, parser_conf)
File "c:\Python36-32\Lib\site-packages\lark\parser_frontends.py", line 20, in init_contextual_lexer
  self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks)
File "c:\Python36-32\Lib\site-packages\lark\lexer.py", line 234, in __init__
  lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
File "c:\Python36-32\Lib\site-packages\lark\lexer.py", line 190, in __init__
  raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

lark.lexer.LexError: Cannot compile token TEXT_CMD: '\text|\textbf|\operatorname'

erezsh commented 5 years ago

You still need double backslash...
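
E.g., for the token from the error above (and likewise for GREEK_LOWER and GREEK_UPPER):

TEXT_CMD: /\\text|\\textbf|\\operatorname/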

enjoysmath commented 5 years ago

Thank you! :D
