Terrible performances on medium and big files

Harmos274 commented 3 years ago

Good morning,

I like tree-sitter haskell very much but it seems it considerably slows when a file pass a certain number of characters. I don't actually know if the cause is the file's pattern complexity or anything but this is very penalizing...

Here's an example of slow file if you wan't to reproduce it :

module Evaluator
    ( evaluate,
      evaluateRepl,
      evaluateDefines,
      Value (..),
      Context,
    ) where

import Text.Read (readMaybe)
import Data.Maybe (fromMaybe)
import Control.Exception (throw)

import qualified Data.Map.Strict as Map

import Parser (Expression (..))
import Exception (HExceptions (EvaluationException))

type Context = Map.Map String Value

data Function = Defined [String] Expression | Builtin ([Value] -> Value) | Spe (Context -> [Expression] -> Value)
data Value = Function Function | Number Int | String String | List [Value] | Nil

instance Show Value where
    show (Function _) = "#<procedure>"
    show (Number n)   = show n
    show (String s)   = s
    show (List   l)   = Evaluator.showList l
    show Nil          = "()"

showList :: [Value] -> String
showList []         = "()"
showList [x, Nil]   = '(' : show x ++ ")"
showList (first:xs) = '(' : show first ++ showList' xs

showList' :: [Value] -> String
showList' [v, Nil] = (' ': show v) ++ ")"
showList' [v]      = (" . " ++ show v) ++ ")"
showList' (v:xs)   = (' ' : show v) ++ showList' xs
showList' []       = ")"

evaluateDefines :: [Expression] -> Context
evaluateDefines = evaluateDefines' baseContext

evaluateDefines' :: Context -> [Expression] -> Context
evaluateDefines' c []                                  = c
evaluateDefines' c (Seq (Atom "define" : define) : xs) = evaluateDefines' (fst $ evaluateDefine c define) xs
evaluateDefines' c (_                            : xs) = evaluateDefines' c xs

evaluate :: [Expression] -> [Value]
evaluate = evaluate' baseContext

evaluate' :: Context -> [Expression] -> [Value]
evaluate' _ []                                  = []
evaluate' c (Seq (Atom "define" : define) : xs) = evaluate' (fst $ evaluateDefine c define) xs
evaluate' c (expr:xs)                           = evaluateExpr c expr : evaluate' c xs

evaluateRepl :: Context -> [Expression] -> (Context, [Value])
evaluateRepl = evaluateRepl' []

evaluateRepl' :: [Value] -> Context -> [Expression] -> (Context, [Value])
evaluateRepl' v c []                                  = (c, reverse v)
evaluateRepl' v c (Seq (Atom "define" : define) : xs) = evaluateRepl'' v xs $ evaluateDefine c define
evaluateRepl' v c (expr:xs)                           = evaluateRepl' (evaluateExpr c expr : v) c xs

evaluateRepl'' :: [Value] -> [Expression] -> (Context, String) -> (Context, [Value])
evaluateRepl'' v (expr:xs) (c, name) = evaluateRepl' (evaluateExpr c expr : String name : v) c xs
evaluateRepl'' v []        (c, name) = (c, reverse $ String name : v)

evaluateDefine :: Context -> [Expression] -> (Context, String)
evaluateDefine c [Atom symbol, expr]              = (Map.insert symbol (evaluateExpr c expr) c, symbol)
evaluateDefine c [Seq (Atom symbol : args), func] = (Map.insert symbol (createFunction args func) c, symbol)
evaluateDefine _ _                                = throw $ EvaluationException "define : Invalid arguments"

createFunction :: [Expression] -> Expression -> Value
createFunction args func = Function $ Defined (map asAtom args) func

evaluateExpr :: Context -> Expression -> Value
evaluateExpr _ (Quoted expr) = evaluateQuoted expr
evaluateExpr c (Seq exprs)   = evaluateSeq c exprs
evaluateExpr c (Atom a)      = evaluateAtom c a

evaluateAtom :: Context -> String -> Value
evaluateAtom c s = Map.lookup s c
                ?: ((Number <$> readMaybe s)
                ?: throw (EvaluationException (show s ++ " is not a variable")))

evaluateSeq :: Context -> [Expression] -> Value
evaluateSeq _ []        = Nil
evaluateSeq c (expr:xs) = evaluateSeq' c (evaluateExpr c expr) xs

evaluateSeq' :: Context -> Value -> [Expression] -> Value
evaluateSeq' c (Function (Spe s)) exprs = s c exprs
evaluateSeq' c v exprs                  = evaluateSeq'' c $ v:map (evaluateExpr c) exprs

evaluateSeq'' :: Context -> [Value] -> Value
evaluateSeq'' c (Function f : xs) = invokeFunction c f xs
evaluateSeq'' _ []                = Nil
evaluateSeq'' _ _                 = throw $ EvaluationException "Sequence is not a procedure"

evaluateQuoted :: Expression -> Value
evaluateQuoted (Atom a)   = evaluateQuotedAtom a
evaluateQuoted (Seq  [])  = Nil
evaluateQuoted (Seq  q)   = List $ evaluateQuotedSeq q
evaluateQuoted (Quoted q) = evaluateQuoted q

evaluateQuotedAtom :: String -> Value
evaluateQuotedAtom s = (Number <$> readMaybe s) ?: String s

evaluateQuotedSeq :: [Expression] -> [Value]
evaluateQuotedSeq = foldr ((:) . evaluateQuoted) [Nil]

invokeFunction :: Context -> Function -> [Value] -> Value
invokeFunction _ (Builtin b)            args = b args
invokeFunction c (Defined symbols func) args = evaluateExpr (functionContext c symbols args) func
invokeFunction _ (Spe _)                _    = throw $ EvaluationException "The impossible has happened"

functionContext :: Context -> [String] -> [Value] -> Context
functionContext c (symbol:sxs) (value:vxs) = functionContext (Map.insert symbol value c) sxs vxs
functionContext c []           []          = c
functionContext _ _            _           = throw $ EvaluationException "Invalid number of arguments"

baseContext :: Context
baseContext = Map.fromList builtins

builtins :: [(String, Value)]
builtins = [("+",      Function $ Builtin add),
            ("-",      Function $ Builtin sub),
            ("*",      Function $ Builtin mult),
            ("div",    Function $ Builtin division),
            ("mod",    Function $ Builtin modulo),
            ("<",      Function $ Builtin inferior),
            ("eq?",    Function $ Builtin eq),
            ("atom?",  Function $ Builtin atom),
            ("cons",   Function $ Builtin cons),
            ("car",    Function $ Builtin car),
            ("cdr",    Function $ Builtin cdr),
            ("cond",   Function $ Spe cond),
            ("lambda", Function $ Spe lambda),
            ("let"   , Function $ Spe slet),
            ("quote" , Function $ Spe quote),
            ("#t" ,    String "#t"),
            ("#f" ,    String "#f")
           ]

add :: [Value] -> Value
add = Number . sum . map asNumber

sub :: [Value] -> Value
sub [Number n]       = Number $ -n
sub (Number n:xs)    = Number $ foldl (-) n $ map asNumber xs
sub _                = throw $ EvaluationException "- : Invalid arguments"

mult :: [Value] -> Value
mult = Number . product . map asNumber

division :: [Value] -> Value
division [Number lhs, Number rhs] = Number $ quot lhs rhs
division [_         , _]          = throw $ EvaluationException "div : Invalid arguments"
division _                        = throw $ EvaluationException "div : Invalid number of arguments"

modulo :: [Value] -> Value
modulo [Number lhs, Number rhs] = Number $ mod lhs rhs
modulo [_         , _]          = throw $ EvaluationException "mod : Invalid arguments"
modulo _                        = throw $ EvaluationException "mod : Invalid number of arguments"

inferior :: [Value] -> Value
inferior [Number lhs, Number rhs] = fromBool $ (<) lhs rhs
inferior [_         , _]          = throw $ EvaluationException "< : Invalid arguments"
inferior _                        = throw $ EvaluationException "< : Invalid number of arguments"

cons :: [Value] -> Value
cons [List l, Nil] = List l
cons [lhs, List l] = List $ lhs:l
cons [lhs, rhs]    = List [lhs, rhs]
cons _             = throw $ EvaluationException "cons : Invalid number of arguments"

car :: [Value] -> Value
car [List (f : _)] = f
car _              = throw $ EvaluationException "car : Invalid arguments"

cdr :: [Value] -> Value
cdr [List [_, v]]  = v
cdr [List (_ : l)] = List l
cdr _              = throw $ EvaluationException "cdr : Invalid arguments"

cond :: Context -> [Expression] -> Value
cond c (Seq [expr, ret] : xs) = cond' c (evaluateExpr c expr) ret xs
cond _ _                      = throw $ EvaluationException "cond : invalid branch"

cond' :: Context -> Value -> Expression -> [Expression] -> Value
cond' c (String "#f") _   xs = cond c xs
cond' c _             ret _  = evaluateExpr c ret

eq :: [Value] -> Value
eq [Number lhs, Number rhs] | lhs == rhs = fromBool True
eq [String lhs, String rhs] | lhs == rhs = fromBool True
eq [Nil       , Nil       ]              = fromBool True
eq [_         , _         ]              = fromBool False
eq _                                     = throw $ EvaluationException "eq? : Invalid number of arguments"

atom :: [Value] -> Value
atom []       = throw $ EvaluationException "atom? : no argument"
atom [List _] = fromBool False
atom _        = fromBool True

lambda :: Context -> [Expression] -> Value
lambda _ [args, func] = lambda' args func
lambda _ _            = throw $ EvaluationException "lambda : Invalid number of arguments"

lambda' :: Expression -> Expression -> Value
lambda' (Seq args) func = Function $ Defined (map asAtom args) func
lambda' _ _             = throw $ EvaluationException "lambda : Invalid arguments"

slet :: Context -> [Expression] -> Value
slet c [Seq defs, expr] = evaluateExpr (letContext c defs) expr
slet _ _                = throw $ EvaluationException "let : Invalid number of arguments"

letContext :: Context -> [Expression] -> Context
letContext c (Seq [Atom key, value] : xs) = letContext (Map.insert key (evaluateExpr c value) c) xs
letContext c []                           = c
letContext _ _                            = throw $ EvaluationException "let : Invalid variable declaration"

quote :: Context -> [Expression] -> Value
quote _ [expr] = evaluateQuoted expr
quote _ _      = throw $ EvaluationException "quote : Invalid arguments"

fromBool :: Bool -> Value
fromBool True  = String "#t"
fromBool False = String "#f"

asAtom :: Expression -> String
asAtom (Atom a) = a
asAtom _        = throw $ EvaluationException "Invalid atom"

asNumber :: Value -> Int
asNumber (Number n) = n
asNumber v          = throw $ EvaluationException $ show v ++ " is not a number"

(?:) :: Maybe a -> a -> a
(?:) = flip fromMaybe

Configuration:

Nixos
Nvim 0.6.0-dev (upstream)
Tree-sitter 0.19.3

Thank you for your help !

tek commented 2 years ago

thanks @JonathanLorimer ! yes, gcc

avery-laird commented 2 years ago

First, a disclaimer: this is not exactly the area I work on, so please take whatever I say with a grain of salt! From a glance over the source code, I'll write my 2 cents.

Generally, std::function is very heavy weight. In some cases, compilers can do optimizations, but often they don't bother because it requires lots of engineering effort and analyses for (usually) little performance benefit.

So, unfortunately, I think it will be difficult to get good performance using std::function. There are two options I can think of, if you want to keep a functional style:

Using templates, embed a DSL that will be completely inlined at compile time. This might increase code size, but means allocations for closures will be completely eliminated. It seems possibly more suited to your use case. A very good example of how to do this is here.
Symbolic tree-like structures can be made efficient in C++ with careful heap management and memoization. The best example I can think of right now is the ScalarEvolution analysis in LLVM. The strategy here is to build structures bottom-up, and guard all calls/queries with a lookup in a cache. This is very hard to get right, because the cache can become poisoned in tricky ways. Additionally, allocations still occur.

I hope this is helpful!

tek commented 2 years ago

thanks a lot, I will take a close look at those!

414owen commented 2 years ago

Here are some pretty simple changes that halve the time taken by the scanner: https://github.com/tree-sitter/tree-sitter-haskell/compare/master...414owen:faster-scanner-demo?expand=1

I'm going to keep going, replacing combinators with first-order logic, and see what the results look like.

tek commented 2 years ago

that is crazy

tek commented 2 years ago

while performance is now pretty good on medium files, trying to insert code into base's Data.Map.Internal (at >4k lines) is still almost impossible. however, the question that is still unanswered is whether the apparent reparsing of the entire file on each keystroke is the result of incorrect design of this grammar or an nvim thing (how does this behave in helix, @414owen ?)

414owen commented 2 years ago

Performance in helix is also terrible when editing Data.Map.Internal.

tek commented 2 years ago

maybe incremental parsing only works reliably when the scanner is simple :thinking:

414owen commented 2 years ago

@maxbrunsfeld I don't suppose we could get your expert opinion here?

The state of affairs:

We have a slow external scanner
We think that its slowness should only matter when parsing a whole file
When making one-character edits on large files, it's painfully slow (it's okay on smaller files)

Does this indicate something's wrong? Are there steps that make the scanner incremental, that we might have missed?

See https://github.com/tree-sitter/tree-sitter-haskell/issues/41#issuecomment-950424044 for a more in-depth hypothesis.

luc-tielen commented 2 years ago

Also ran in this today, for this file.

my impression so far is that since std::function is an object that stores all of its closure's captured variables, and most of those variables are again functions, and all of those functions are stack-allocated in other parser objects, there's just a lot of copying and allocations going on, especially when, as you noted, the parsers have value parameters like Symbolic::type and the current indent. std::function is probably not all that suited for functional programming

No it's not unfortunately :disappointed:. std::function has a lot of overhead compared to a normal function that is called. There's the overhead of malloc, but also you have "pointer-chasing code", which is bad for cache-hits.

I would not write the C++ in a functional style, and go for a imperative/procedural solution with mutable state (even though it pains me to say so..). Try to avoid malloc completely, resort to variables on the stack (or pre-allocate memory).

But it looks like @414owen is already on it's way to improve the current parser. :smiley: If you need a reviewer, let me know.

414owen commented 2 years ago

@luc-tielen The imperative scanner changes work, and are ready for review. There are a few things I still want to do (eg. make state global rather than pass it around), but yeah any tips you have would be appreciated.

JonathanLorimer commented 2 years ago

Just wanted to say thanks to @414owen, @luc-tielen, and @tek for working to improve this tree sitter lib. I am really excited to switch this back on.

luc-tielen commented 2 years ago

@414owen I reviewed all your changes. I liked your approach of tiny commits. :) Unfortunately I was a little too slow and @tek already merged in your PR :P. Could you take a look at the comments (and maybe submit a part 3 PR?).

tek commented 2 years ago

sorry :grimacing:

luc-tielen commented 2 years ago

No worries! These changes are already a big improvement (and I didn't see anything wrong with the new code).

maxbrunsfeld commented 2 years ago

The way that I'd debug this is that I'd set up a small example file, and parse the file using Tree-sitter's -D/--debug-graph argument, which generates a complete report of the parsing process. Also, Tree-sitter allows you to simulate an edit to the file from the command line, and re-parse the file after the edit.

Create a small example file with some declarations sampled from Map/Base.hs

-- test.hs
elemAt :: Int -> Map k a -> (k,a)a
take :: Int -> Map k a -> Map k a
drop :: Int -> Map k a -> Map k a

Parse the file from the command line, generating debug output (requires that dot from the graphviz package is present on your PATH):
```
tree-sitter parse test.hs --debug-graph
```
From this graph, you can see when Tree-sitter is processing an ambiguity, because the parse stack will be "forked".
Re-parse the file incrementally after inserting the character 'x' at the beginning of line 2. The syntax for the --edit argument is either position bytes_removed string_inserted, where position can be either a byte offset or a row and a column, separated by a comma. Here I use the latter:
```
tree-sitter parse test.rs --edit '2,0 0 x'  --debug-graph
```

My guess is that the reason for the poor incremental parsing performance is the ambiguity in the grammar, not the scanner. But I'm not certain.

tek commented 2 years ago

ah, I never thought of looking at the graph when doing an edit, thanks for the tip

414owen commented 2 years ago

Hmm, on the recomputation side, if I'm interpreting it correctly, this line returns the amount of elements copied, rather than the amount of bytes (as the docs suggest).

I realise that we're the main consumers of the number returned, in deserialize, but I'm wondering if our state gets corrupted...

tek commented 2 years ago

:grimacing: that sounds plausible. curious that this hasn't caused any serious errors

414owen commented 2 years ago

Using the --debug-graph flags, I see a bunch of

cant_reuse_node_is_fragile tree:function
cant_reuse_node_is_fragile tree:_funlhs
cant_reuse_node_is_fragile tree:_funvar
cant_reuse_node_is_fragile tree:_fun_name
...etc

One for every function in my test file.

@maxbrunsfeld is fragility documented anywhere? If a rule is fragile, are all of its parent rules marked as fragile too? Is there a way to get a list of all fragile grammar rules?

maxbrunsfeld commented 2 years ago

There are a couple of reasons that a node can be marked as fragile. Certain parse states are statically known to be fragile, because of precedence usage. But mainly, nodes are also marked as fragile if they are created while the parse stack is forked due to a conflict. I think that conflicts are likely the culprit here.

tek commented 2 years ago

oh yeah there are lots of conflicts in the grammar

maxbrunsfeld commented 2 years ago

It's generally fine for there to be a lot of conflicts: usually, the "fragile" nodes that are created during conflicts are fully contained within some more stable structure, so that a lot of the syntax tree remains reusable.

I think it becomes a problem when the conflicts commonly occur at the top-most level of the source-file, so that there is no "non-fragile" nodes that can be reused. If someone wants to work on improving this, I would suggest looking into what conflicts are arising at the top-most level, when parsing the outer structure of a declaration or a function.

If there is some specific Haskell language extension that is causing these conflicts, it may be worthwhile to scale back the support for that extension, in the case of top-level constructs, so that we can reliably get incremental parsing at the topmost level of a source file.

tek commented 2 years ago

template haskell introduces top level splices, which account for almost half of all conflicts. unfortunately, this is a very common feature, I don't think it would be feasible to deactivate :disappointed:

maxbrunsfeld commented 2 years ago

Ah yeah, I can see how that would cause top-level conflicts. I'm not sure what to do about that. What percentage of Haskell source files use top-level splices without the $() syntax? Unfortunately, it seems like we may have to choose between either supporting that language feature, or getting usable performance in text editors like Neovim.

tek commented 2 years ago

in my code, it's usually maybe 20%, not sure. almost all of those are of the shape

someFunc ''SomeType

so I would speculate that we could support this highly specialized variant as an unambiguous variant of the type signature rule without losing too much – hoping that the other conflicts have much less of an impact. What I don't understand is what that would mean for files containing unsupported TH – can we achieve that it would be skipped without poisoning the rest of the file?

maxbrunsfeld commented 2 years ago

I like that idea.

What I don't understand is what that would mean for files containing unsupported TH – can we achieve that it would be skipped without poisoning the rest of the file?

I don't think it would ruin the parsing of the whole remainder of the file. I think you'd often get fairly small ERROR nodes in the vicinity of the splice, but other parts of the file would parse fine. I'm not 100% positive how it would play out with this grammar though.

tek commented 2 years ago

looking at those TH conflicts, I see that there's also:

    [$.signature, $.pat_name],

which is actually disambiguating top decl signatures from equations. so this might just as well be responsible for the problem. however, just because that's the best I could do after my month of iterating on the grammar doesn't mean we can't find a conflict free version of this :)

tek commented 2 years ago

I also have the impression that this is a question of the trade-off between conflict freedom and precise semantic naming of top level nodes – you could imagine a tree that starts with (top_level_initial_varid instead of (top_splice or (signature, and then branches based on what follows

tek commented 2 years ago

performance is now stellar!

tree-sitter / tree-sitter-haskell

Terrible performances on medium and big files #41