Closed milahu closed 8 months ago
Are you sure this is a py-tree-sitter issue? What about node-tree-sitter?
Are you sure this is a py-tree-sitter issue?
yepp, i suspect the conversion between native C API and python types is slow
What about node-tree-sitter?
not tested, maybe it's also slower than lezer-parser
i prefer lezer-parser because it's a pure javascript parser, so no need to mess with WASM
node-tree-sitter is not WASM. web-tree-sitter is WASM.
yeah
still, it's surprising that native code with script bindings is 10x slower than pure scripting code
so i guess there is something wrong with the bindings
Building `bytes` objects using `+=` is quadratic because `bytes` are immutable, so each `+=` has to copy the entire accumulated `bytes` object. Are you sure that you’re not measuring that?
aah! good catch. let me rewrite that with io.BytesIO
Accumulating into a list and then `bytes.join`ing would also work.
Could we use strings instead of bytes (or both)? Is there any chance we'll need to parse binary files?
`html_parser.parse(input_html_bytes)` requires a bytestring because tree-sitter does not know utf8
let me rewrite that with io.BytesIO
yepp. now py-tree-sitter is 2x faster than lezer-parser
sorry for the noise, thanks for the help : )
i was surprised to see that this parser is 10x slower than lezer-parser in javascript
this is surprising, because native code should be faster than a scripting language
todo: write `fixme-py-tree-sitter-is-slow.c` to get the actual native speed
todo: test this with a "pure python" html parser
fixme-py-tree-sitter-is-slow.js
```js #!/usr/bin/env node // usage: // node test.js input.html import fs from 'fs'; import { parser as lezerParserHtml } from '@lezer/html'; // https://codereview.stackexchange.com/a/97886/205605 /** @param {Tree | TreeNode} tree */ function walkHtmlTree(tree, func) { const cursor = tree.cursor(); if (!cursor) return; let depth = 0; while (true) { // NLR: Node, Left, Right // Node // NOTE InvalidEntity breaks the parser // a&b&c // -> require valid input, throw on parse error const cursorTypeId = cursor.type.id; if ( //true || // debug: dont filter !( cursorTypeId == 15 || // Document cursorTypeId == 20 || // Element cursorTypeId == 23 || // Attribute cursorTypeId == 21 || // OpenTag cursorTypeId == 37 || // CloseTag cursorTypeId == 38 || // SelfClosingTag // note: this is inconsistent in the parser // InvalidEntity is child node // EntityReference is separate node (sibling of other text nodes) cursorTypeId == 19 || // InvalidEntity: "&" is parsed as InvalidEntity //cursorTypeId == 17 || // EntityReference: "&" or "—" is parsed as EntityReference false ) ) { func(cursor) } // Left if (cursor.firstChild()) { // moved down depth++; continue; } // Right if (depth > 0 && cursor.nextSibling()) { // moved right continue; } let continueMainLoop = false; let firstUp = true; while (cursor.parent()) { // moved up depth--; if (depth <= 0) { // when tree is a node, stop at the end of node // == dont visit sibling or parent nodes return; } if (cursor.nextSibling()) { // moved up + right continueMainLoop = true; break; } firstUp = false; } if (continueMainLoop) continue; break; } } function main() { const argv = process.argv.slice(1); // argv[0] is node const inputPath = argv[1]; if (!inputPath) { console.error(`error: missing argument inputPath`); console.error(`usage:`); console.error(` ${argv[0]} input.html`); return 1; } const inputHtml = fs.readFileSync(inputPath, 'utf8'); // note: lezer-parser is a CONCRETE syntax tree parser == CST parser const htmlParser = 
lezerParserHtml.configure({ strict: true, // throw on parse error }); const t1 = Date.now() / 1000; const htmlTree = htmlParser.parse(inputHtml); const rootNode = htmlTree.topNode; // note: always set this to zero before calling walkHtmlTree let lastNodeTo = 0; // test the tree walker // this test run should return // the exact same string as the input string // = lossless noop console.log(`testing walkHtmlTree in ${inputHtml.length} chars of html`) let walkHtmlTreeTestResult = ""; lastNodeTo = 0; walkHtmlTree(rootNode, (node) => { walkHtmlTreeTestResult += ( inputHtml.slice(lastNodeTo, node.to) ); lastNodeTo = node.to; }); const t2 = Date.now() / 1000; console.log(`testing walkHtmlTree done in ${t2 - t1} seconds`) if (walkHtmlTreeTestResult != inputHtml) { console.error('error: the tree walker is not lossless'); process.exit(1); } console.error(`ok. the tree walker is lossless`); } main() ```fixme-py-tree-sitter-is-slow.py
``` #!/usr/bin/env python3 # usage: # python3 test.py input.html import sys import time import tree_sitter import tree_sitter_languages tree_sitter_html = tree_sitter_languages.get_parser("html") # https://github.com/tree-sitter/py-tree-sitter/issues/33 def walk_html_tree(tree): # compound tags # these are ignored when serializing the tree compound_kind_id = ( 25, # fragment 26, # doctype 28, # element 29, # script_element 30, # style_element 31, # start_tag 34, # self_closing_tag 35, # end_tag 37, # attribute 38, # quoted_attribute_value ) cursor = tree.walk() reached_root = False while reached_root == False: is_compound = cursor.node.kind_id in compound_kind_id if not is_compound: yield cursor.node if cursor.goto_first_child(): continue if cursor.goto_next_sibling(): continue retracing = True while retracing: if not cursor.goto_parent(): retracing = False reached_root = True if cursor.goto_next_sibling(): retracing = False def main(): input_path = sys.argv[1] # tree-sitter expects binary string with open(input_path, 'rb') as f: input_html_bytes = f.read() html_parser = tree_sitter_html t1 = time.time() html_tree = html_parser.parse(input_html_bytes) root_node = html_tree.root_node # test the tree walker # this test run should return # the exact same string as the input string # = lossless noop print(f"testing walk_html_tree on {len(input_html_bytes)} bytes of html") walk_html_tree_test_result = b"" last_node_to = 0 for node in walk_html_tree(root_node): walk_html_tree_test_result += ( input_html_bytes[last_node_to:node.range.end_byte] ) last_node_to = node.range.end_byte # copy whitespace after last node # fix: missing newline at end of file walk_html_tree_test_result += ( input_html_bytes[last_node_to:] ) t2 = time.time() print(f"testing walk_html_tree done in {t2 - t1} seconds") assert walk_html_tree_test_result == input_html_bytes print('ok. the tree walker is lossless') main() ```