grantjenks / py-tree-sitter-languages

Binary Python wheels for all tree sitter languages.
Other
116 stars 35 forks source link

expose extra files of each language: grammar.json, node-types.json, highlights.scm, ... #59

Open milahu opened 3 months ago

milahu commented 3 months ago

grammar.json is needed to map from node.type to node.kind_id, assuming that node.type is more stable than node.kind_id across different versions of a parser.

the extra files should be stored in the filesystem to save memory

# TODO better? get name-id mappings from parser binary?

import json
import os

# Load the grammar description from the tree-sitter-html source checkout.
# TREE_SITTER_HTML_SRC must point at the root of that checkout.
# JSON is read explicitly as UTF-8 so the result does not depend on the locale.
with open(
    os.environ["TREE_SITTER_HTML_SRC"] + "/src/grammar.json",
    "r",
    encoding="utf-8",
) as f:
    tree_sitter_html_grammar = json.load(f)

# A SimpleNamespace (attribute access, e.g. node_kind.document) was considered
# but rejected: rule names can be ugly names like '"', which are not valid
# attribute names. A plain dict works for any name.

# Map each rule name to its position in the grammar's "rules" table.
# NOTE(review): the position in "rules" is not the id used by the generated
# parser; the real ids live in ts_symbol_identifiers / ts_symbol_names in
# src/parser.c.
node_kind = {
    name: index for index, name in enumerate(tree_sitter_html_grammar["rules"])
}

print("node_kind document", node_kind["document"])

TODO better? get name-id mappings from parser binary?

probably this should be fixed upstream in tree-sitter

edit: tree_sitter_html_grammar["rules"] is wrong. I was looking for ts_symbol_identifiers and ts_symbol_names in src/parser.c:

enum ts_symbol_identifiers {
  anon_sym_LT_BANG = 1,
  aux_sym_doctype_token1 = 2,
  anon_sym_GT = 3,
static const char * const ts_symbol_names[] = {
  [ts_builtin_sym_end] = "end",
  [anon_sym_LT_BANG] = "<!",
  [aux_sym_doctype_token1] = "doctype_token1",
  [anon_sym_GT] = ">",

parsing src/parser.c is a bit more than json.load...

# parse_parser_c.py
#
# Build name <-> id mappings for the node kinds of a tree-sitter grammar by
# parsing the generated src/parser.c with the tree-sitter C parser and reading
# the `ts_symbol_identifiers` enum and the `ts_symbol_names` array.
#
# Requires the environment variable TREE_SITTER_HTML_SRC to point at the root
# of the tree-sitter-html source checkout.

import ast
import collections
import json
import os
import sys

import tree_sitter_languages

with open(os.environ["TREE_SITTER_HTML_SRC"] + "/src/parser.c", "rb") as f:
    parser_c_src = f.read()

tree_sitter_c = tree_sitter_languages.get_parser("c")
parser_c_tree = tree_sitter_c.parse(parser_c_src)


def walk_tree(tree):
    """Yield every node below *tree* in pre-order (parents before children).

    *tree* is any object exposing ``walk()`` that returns a tree-sitter
    cursor; the script passes the root node of the parsed file.
    """
    cursor = tree.walk()
    reached_root = False
    while not reached_root:
        yield cursor.node
        # Descend first, then move sideways.
        if cursor.goto_first_child():
            continue
        if cursor.goto_next_sibling():
            continue
        # No child and no sibling: climb until an ancestor has a next
        # sibling, or stop once the cursor cannot go up any further.
        retracing = True
        while retracing:
            if not cursor.goto_parent():
                retracing = False
                reached_root = True
            if cursor.goto_next_sibling():
                retracing = False


# Flip to True to dump the syntax tree of parser.c and exit.
DEBUG_PRINT_AST = False

if DEBUG_PRINT_AST:
    max_len = 30
    for node in walk_tree(parser_c_tree.root_node):
        node_text = json.dumps(node.text.decode("utf8"))
        if len(node_text) > max_len:
            node_text = node_text[0:max_len] + "..."
        print(f"node {node.kind_id:2d} = {node.type:25s} : {node_text:30s}")
    sys.exit()

# State of the single pass over the syntax tree of parser.c.
in_enum_ts_symbol_identifiers = False
in_char_ts_symbol_names = False
current_identifier = None

# enum member name -> numeric id, e.g. "anon_sym_LT_BANG" -> 1
enum_ts_symbol_identifiers = dict()
# enum member name -> display name, e.g. "anon_sym_LT_BANG" -> "<!"
char_ts_symbol_names = dict()

for node in walk_tree(parser_c_tree.root_node):
    node_source = node.text.decode("utf8")

    # Section starts: `enum ts_symbol_identifiers {`
    if node.type == "type_identifier" and node.text == b"ts_symbol_identifiers":
        in_enum_ts_symbol_identifiers = True
        continue

    # Section starts: `static const char * const ts_symbol_names[] = {`
    if node.type == "pointer_declarator" and node.text == b"* const ts_symbol_names[]":
        in_char_ts_symbol_names = True
        continue

    if in_enum_ts_symbol_identifiers:
        # Each enumerator is `identifier = number_literal`.
        if node.type == "identifier":
            current_identifier = node_source
        elif node.type == "number_literal":
            enum_ts_symbol_identifiers[current_identifier] = int(node_source)
            current_identifier = None
        elif node.type == "}":
            # End of the enum body.
            current_identifier = None
            in_enum_ts_symbol_identifiers = False
        continue

    if in_char_ts_symbol_names:
        # Each initializer is `[identifier] = "string"`.
        if node.type == "subscript_designator":
            # Strip the surrounding square brackets.
            current_identifier = node_source[1:-1]
        elif node.type == "string_literal":
            # NOTE(review): the C string literal is decoded with Python
            # literal rules; presumably fine for the escapes tree-sitter
            # generates - confirm for unusual token names.
            char_ts_symbol_names[current_identifier] = ast.literal_eval(node_source)
            current_identifier = None
        elif node.type == "}":
            # End of the array initializer: both tables are complete.
            current_identifier = None
            in_char_ts_symbol_names = False
            break
        continue

# Display names can collide when a grammar uses the same name for different
# tokens. Example: both the full tag and the tag_name have the name "doctype":
#   sym_doctype = 26,  // full doctype tag
#   sym__doctype = 4,  // tag_name of doctype tag
# For such duplicates the user is forced to use the exact enum name, which is
# what gets stored in node_kind instead of the ambiguous display name.
full_node_kind = enum_ts_symbol_identifiers

# Count every display name once instead of rescanning all names per symbol.
name_counts = collections.Counter(char_ts_symbol_names.values())

node_kind = dict()
for full_name, kind_id in enum_ts_symbol_identifiers.items():
    name = char_ts_symbol_names[full_name]
    key = full_name if name_counts[name] > 1 else name
    node_kind[key] = kind_id

# Allow reverse lookup from id to name. The list is indexed by the actual id,
# so it stays correct even if the ids are not listed in order; ids with no
# entry (such as 0, which is not part of the enum) map to None.
node_name = [None] * (max(node_kind.values(), default=0) + 1)
for name, kind_id in node_kind.items():
    node_name[kind_id] = name

print("node_kind =", json.dumps(node_kind, indent=2))

alternative: parse a source file that contains all possible node types, and build the mapping from the observed node.type and node.kind_id values

keywords: tree-sitter use numeric node types in scripting languages python javascript