py-tree-sitter is 10x slower than lezer-parser

milahu commented 8 months ago

i was surprised to see that this parser is 10x slower than lezer-parser in javascript

this is surprising, because native code should be faster than scripting language

todo: write fixme-py-tree-sitter-is-slow.c to get the actual native speed

todo: test this with a "pure python" html parser

$ time ./fixme-py-tree-sitter-is-slow.js test.html
testing walkHtmlTree in 847454 chars of html
testing walkHtmlTree done in 1.2820000648498535 seconds
ok. the tree walker is lossless

real    0m1.983s
user    0m2.471s
sys     0m0.460s

$ time ./fixme-py-tree-sitter-is-slow.py test.html
testing walk_html_tree on 852413 bytes of html
testing walk_html_tree done in 18.54731035232544 seconds
ok. the tree walker is lossless

real    0m18.975s
user    0m8.877s
sys     0m9.505s

$ du -h test.html
836K    test.html

$ du -b test.html
852413  test.html

fixme-py-tree-sitter-is-slow.js

```js #!/usr/bin/env node // usage: // node test.js input.html import fs from 'fs'; import { parser as lezerParserHtml } from '@lezer/html'; // https://codereview.stackexchange.com/a/97886/205605 /** @param {Tree | TreeNode} tree */ function walkHtmlTree(tree, func) { const cursor = tree.cursor(); if (!cursor) return; let depth = 0; while (true) { // NLR: Node, Left, Right // Node // NOTE InvalidEntity breaks the parser // a&b&c // -> require valid input, throw on parse error const cursorTypeId = cursor.type.id; if ( //true || // debug: dont filter !( cursorTypeId == 15 || // Document cursorTypeId == 20 || // Element cursorTypeId == 23 || // Attribute cursorTypeId == 21 || // OpenTag cursorTypeId == 37 || // CloseTag cursorTypeId == 38 || // SelfClosingTag // note: this is inconsistent in the parser // InvalidEntity is child node // EntityReference is separate node (sibling of other text nodes) cursorTypeId == 19 || // InvalidEntity: "&" is parsed as InvalidEntity //cursorTypeId == 17 || // EntityReference: "&" or "—" is parsed as EntityReference false ) ) { func(cursor) } // Left if (cursor.firstChild()) { // moved down depth++; continue; } // Right if (depth > 0 && cursor.nextSibling()) { // moved right continue; } let continueMainLoop = false; let firstUp = true; while (cursor.parent()) { // moved up depth--; if (depth <= 0) { // when tree is a node, stop at the end of node // == dont visit sibling or parent nodes return; } if (cursor.nextSibling()) { // moved up + right continueMainLoop = true; break; } firstUp = false; } if (continueMainLoop) continue; break; } } function main() { const argv = process.argv.slice(1); // argv[0] is node const inputPath = argv[1]; if (!inputPath) { console.error(`error: missing argument inputPath`); console.error(`usage:`); console.error(` ${argv[0]} input.html`); return 1; } const inputHtml = fs.readFileSync(inputPath, 'utf8'); // note: lezer-parser is a CONCRETE syntax tree parser == CST parser const htmlParser = lezerParserHtml.configure({ strict: true, // throw on parse error }); const t1 = Date.now() / 1000; const htmlTree = htmlParser.parse(inputHtml); const rootNode = htmlTree.topNode; // note: always set this to zero before calling walkHtmlTree let lastNodeTo = 0; // test the tree walker // this test run should return // the exact same string as the input string // = lossless noop console.log(`testing walkHtmlTree in ${inputHtml.length} chars of html`) let walkHtmlTreeTestResult = ""; lastNodeTo = 0; walkHtmlTree(rootNode, (node) => { walkHtmlTreeTestResult += ( inputHtml.slice(lastNodeTo, node.to) ); lastNodeTo = node.to; }); const t2 = Date.now() / 1000; console.log(`testing walkHtmlTree done in ${t2 - t1} seconds`) if (walkHtmlTreeTestResult != inputHtml) { console.error('error: the tree walker is not lossless'); process.exit(1); } console.error(`ok. the tree walker is lossless`); } main() ```

fixme-py-tree-sitter-is-slow.py

``` #!/usr/bin/env python3 # usage: # python3 test.py input.html import sys import time import tree_sitter import tree_sitter_languages tree_sitter_html = tree_sitter_languages.get_parser("html") # https://github.com/tree-sitter/py-tree-sitter/issues/33 def walk_html_tree(tree): # compound tags # these are ignored when serializing the tree compound_kind_id = ( 25, # fragment 26, # doctype 28, # element 29, # script_element 30, # style_element 31, # start_tag 34, # self_closing_tag 35, # end_tag 37, # attribute 38, # quoted_attribute_value ) cursor = tree.walk() reached_root = False while reached_root == False: is_compound = cursor.node.kind_id in compound_kind_id if not is_compound: yield cursor.node if cursor.goto_first_child(): continue if cursor.goto_next_sibling(): continue retracing = True while retracing: if not cursor.goto_parent(): retracing = False reached_root = True if cursor.goto_next_sibling(): retracing = False def main(): input_path = sys.argv[1] # tree-sitter expects binary string with open(input_path, 'rb') as f: input_html_bytes = f.read() html_parser = tree_sitter_html t1 = time.time() html_tree = html_parser.parse(input_html_bytes) root_node = html_tree.root_node # test the tree walker # this test run should return # the exact same string as the input string # = lossless noop print(f"testing walk_html_tree on {len(input_html_bytes)} bytes of html") walk_html_tree_test_result = b"" last_node_to = 0 for node in walk_html_tree(root_node): walk_html_tree_test_result += ( input_html_bytes[last_node_to:node.range.end_byte] ) last_node_to = node.range.end_byte # copy whitespace after last node # fix: missing newline at end of file walk_html_tree_test_result += ( input_html_bytes[last_node_to:] ) t2 = time.time() print(f"testing walk_html_tree done in {t2 - t1} seconds") assert walk_html_tree_test_result == input_html_bytes print('ok. the tree walker is lossless') main() ```

ObserverOfTime commented 8 months ago

Are you sure this is a py-tree-sitter issue? What about node-tree-sitter?

milahu commented 8 months ago

Are you sure this is a py-tree-sitter issue?

yepp, i suspect the conversion between native C API and python types is slow

What about node-tree-sitter?

not tested, maybe its also slower than lezer-parser

i prefer lezer-parser because its a pure javascript parser so no need to mess with WASM

ObserverOfTime commented 8 months ago

node-tree-sitter is not WASM. web-tree-sitter is WASM.

milahu commented 8 months ago

yeah

still, its surprising that native code with script bindings is 10x slower than pure scripting code

so i guess there is something wrong with the bindings

narpfel commented 8 months ago

Building bytes objects using += is quadratic because bytes are immutable, so each += has to copy the entire accumulated bytes object. Are you sure that you’re not measuring that?

milahu commented 8 months ago

aah! good catch. let me rewrite that with io.BytesIO

narpfel commented 8 months ago

Accumulating into a list and then bytes.joining would also work.

ObserverOfTime commented 8 months ago

Could we use strings instead of bytes (or both)? Is there any chance we'll need to parse binary files?

milahu commented 8 months ago

html_parser.parse(input_html_bytes) requires a bytestring because tree-sitter does not know utf8

let me rewrite that with io.BytesIO

yepp. now py-tree-sitter is 2x faster than lezer-parser

fixed.py

```py #!/usr/bin/env python3 # usage: # python3 test.py input.html # https://github.com/tree-sitter/py-tree-sitter/issues/202 # py-tree-sitter is 10x slower than lezer-parser import sys import time import io import tree_sitter import tree_sitter_languages tree_sitter_html = tree_sitter_languages.get_parser("html") # https://github.com/tree-sitter/py-tree-sitter/issues/33 def walk_html_tree(tree): # compound tags # these are ignored when serializing the tree compound_kind_id = ( 25, # fragment 26, # doctype 28, # element 29, # script_element 30, # style_element 31, # start_tag 34, # self_closing_tag 35, # end_tag 37, # attribute 38, # quoted_attribute_value ) cursor = tree.walk() reached_root = False while reached_root == False: is_compound = cursor.node.kind_id in compound_kind_id if not is_compound: yield cursor.node if cursor.goto_first_child(): continue if cursor.goto_next_sibling(): continue retracing = True while retracing: if not cursor.goto_parent(): retracing = False reached_root = True if cursor.goto_next_sibling(): retracing = False def main(): input_path = sys.argv[1] # tree-sitter expects binary string with open(input_path, 'rb') as f: input_html_bytes = f.read() html_parser = tree_sitter_html t1 = time.time() html_tree = html_parser.parse(input_html_bytes) root_node = html_tree.root_node # test the tree walker # this test run should return # the exact same string as the input string # = lossless noop print(f"testing walk_html_tree on {len(input_html_bytes)} bytes of html") # slow! #walk_html_tree_test_result = b"" walk_html_tree_test_result = io.BytesIO() last_node_to = 0 for node in walk_html_tree(root_node): # walk_html_tree_test_result += ( # input_html_bytes[last_node_to:node.range.end_byte] # ) walk_html_tree_test_result.write( input_html_bytes[last_node_to:node.range.end_byte] ) last_node_to = node.range.end_byte # copy whitespace after last node # fix: missing newline at end of file # walk_html_tree_test_result += ( # input_html_bytes[last_node_to:] # ) walk_html_tree_test_result.write( input_html_bytes[last_node_to:] ) t2 = time.time() print(f"testing walk_html_tree done in {t2 - t1} seconds") walk_html_tree_test_result = walk_html_tree_test_result.getvalue() assert walk_html_tree_test_result == input_html_bytes print('ok. the tree walker is lossless') main() ```

sorry for the noise, thanks for the help : )

tree-sitter / py-tree-sitter

py-tree-sitter is 10x slower than lezer-parser #202