grammar.json is needed to map from node.type to node.kind_id
assuming that node.type is more stable across different versions of a parser
the extra files should be stored in the filesystem to save memory
# TODO better? get name-id mappings from parser binary?
import json
import os

# Build a name -> index table from the rule list of tree-sitter-html's
# grammar.json. TREE_SITTER_HTML_SRC must be set to the directory that
# contains src/grammar.json; a missing variable raises KeyError.
grammar_path = os.path.join(os.environ["TREE_SITTER_HTML_SRC"], "src", "grammar.json")
with open(grammar_path, "r", encoding="utf-8") as f:
    tree_sitter_html_grammar = json.load(f)

# A plain dict is used instead of attribute access (types.SimpleNamespace)
# because rule names can be ugly names like '"', which are not valid
# Python identifiers.
#
# NOTE(review): the value stored here is only the position of the rule in
# grammar.json["rules"]. That position is not guaranteed to equal
# node.kind_id; the ids the parser actually uses are the ones listed in
# ts_symbol_identifiers / ts_symbol_names in src/parser.c. Verify before
# relying on this table.
node_kind = {
    name: index for index, name in enumerate(tree_sitter_html_grammar["rules"])
}
print("node_kind document", node_kind["document"])
TODO better? get name-id mappings from parser binary?
probably this should be fixed upstream in tree-sitter
edit: tree_sitter_html_grammar["rules"] is wrong
i was looking for ts_symbol_identifiers and ts_symbol_names in src/parser.c
`grammar.json` is needed to map from `node.type` to `node.kind_id`,
assuming that `node.type` is more stable across different versions of a parser.
The extra files should be stored in the filesystem to save memory.
probably this should be fixed upstream in tree-sitter
edit: `tree_sitter_html_grammar["rules"]` is wrong.
I was looking for `ts_symbol_identifiers` and `ts_symbol_names` in `src/parser.c`.
Parsing `src/parser.c` is a bit more than `json.load` ...

`parse_parser_c.py`:
```py import ast import tree_sitter_languages with open(os.environ["TREE_SITTER_HTML_SRC"] + "/src/parser.c", "rb") as f: parser_c_src = f.read() tree_sitter_c = tree_sitter_languages.get_parser("c") parser_c_tree = tree_sitter_c.parse(parser_c_src) def walk_tree(tree): cursor = tree.walk() reached_root = False while reached_root == False: yield cursor.node if cursor.goto_first_child(): continue if cursor.goto_next_sibling(): continue retracing = True while retracing: if not cursor.goto_parent(): retracing = False reached_root = True if cursor.goto_next_sibling(): retracing = False if False: # debug: print AST node_idx = 0 max_len = 30 for node in walk_tree(parser_c_tree.root_node): node_text = json.dumps(node.text.decode("utf8")) if len(node_text) > max_len: node_text = node_text[0:max_len] + "..." #pfx = "# " if is_compound else " " pfx = "" print(pfx + f"node {node.kind_id:2d} = {node.type:25s} : {node_text:30s}") node_idx += 1 #if node_idx > 100: break sys.exit() in_enum_ts_symbol_identifiers = False in_char_ts_symbol_names = False enum_name = None current_identifier = None enum_ts_symbol_identifiers = dict() char_ts_symbol_names = dict() for node in walk_tree(parser_c_tree.root_node): node_source = node.text.decode("utf8") if node.type == "type_identifier" and node.text == b"ts_symbol_identifiers": in_enum_ts_symbol_identifiers = True continue if node.type == "pointer_declarator" and node.text == b"* const ts_symbol_names[]": in_char_ts_symbol_names = True continue if in_enum_ts_symbol_identifiers: if node.type == "identifier": current_identifier = node_source continue if node.type == "number_literal": enum_ts_symbol_identifiers[current_identifier] = ( int(node_source) ) current_identifier = None continue if node.type == "}": current_identifier = node_source in_enum_ts_symbol_identifiers = False continue continue if in_char_ts_symbol_names: if node.type == "subscript_designator": current_identifier = node_source[1:-1] continue if node.type == "string_literal": 
char_ts_symbol_names[current_identifier] = ( ast.literal_eval(node_source) ) current_identifier = None continue if node.type == "}": current_identifier = node_source in_char_ts_symbol_names = False break continue #print("enum_ts_symbol_identifiers =", json.dumps(enum_ts_symbol_identifiers, indent=2)) #print("char_ts_symbol_names =", json.dumps(char_ts_symbol_names, indent=2)) # force user to use exact names from full_node_kind # names can collide when grammars # use the same names for different tokens... # example: # both the full tag and the tag_name have the token name "doctype" # sym_doctype = 26, // full doctype tag # sym__doctype = 4, // tag_name of doctype tag full_node_kind = enum_ts_symbol_identifiers node_kind = dict() for full_name, id in enum_ts_symbol_identifiers.items(): name = char_ts_symbol_names[full_name] if len(list(filter(lambda n: n == name, char_ts_symbol_names.values()))) > 1: # duplicate name # force user to use full_name in full_node_kind # also store full_name in node_kind node_kind[full_name] = id continue node_kind[name] = id # allow reverse lookup from id to name node_name = [None] + list(node_kind.keys()) #print("full_node_kind =", json.dumps(full_node_kind, indent=2)) print("node_kind =", json.dumps(node_kind, indent=2)) #print("node_kind document =", node_kind["document"]) ```alternative: parse a source that contains all possible node types and build the mapping from the
`node.type` and `node.kind_id` values

keywords: tree-sitter use numeric node types in scripting languages python javascript