Open milahu opened 8 months ago
Does the `is_named` property not help you here?
No, `is_named` does not help to distinguish the two '>' nodes:
node kind_id= 3 type=> is_named=False '>' ' html>'
node kind_id= 3 type=> is_named=False '>' '>'
# node kind_id=25 type=fragment is_named=True '<!doctype html><hr>'
# node kind_id=26 type=doctype is_named=True '<!doctype html>'
node kind_id= 1 type=<! is_named=False '<!' '<!'
node kind_id= 4 type=doctype is_named=False 'doctype' 'doctype'
node kind_id= 3 type=> is_named=False '>' ' html>'
# node kind_id=28 type=element is_named=True '<hr>'
# node kind_id=31 type=start_tag is_named=True '<hr>'
node kind_id= 5 type=< is_named=False '<' '<'
node kind_id=17 type=tag_name is_named=True 'hr' 'hr'
node kind_id= 3 type=> is_named=False '>' '>'
Currently I use this workaround:
if node_type_id == 1 or node_type_id == 4:
in_doctype_node = True
elif node_type_id == 3 and in_doctype_node == True:
in_doctype_node = False
node_source = input_html[(last_node_to + 1):node.range.end_byte]
node_source_space_before = b""
last_node_to = node.range.start_byte - 1
Low priority stuff...
I'm just surprised that the contents of '<!doctype html>' don't show up in the parse tree.
For comparison: lezer-parser-html produces only one node for '<!doctype html>':
node 15 = Document: '<!doctype html><hr>'
node 43 = DoctypeDecl: '<!doctype html>'
node 20 = Element: '<hr>'
node 38 = SelfClosingTag: '<hr>'
node 10 = StartTag: '<'
node 22 = TagName: 'hr'
node 4 = EndTag: '>'
input
Result (compound nodes are prefixed with `#`):
Problem: the 'html' in '<!doctype html>' has no parse node, and the close tag '>' of '<!doctype html>' has the same node type as the close tag '>' of '<hr>'.
Note how ' html' spills into '>' with `node_source = input_html[last_node_to:node.range.end_byte]`.
This is causing problems in a semantic stage using this parser, where I want to either
- ignore the compound node '<!doctype html>' and process its child nodes '<!', 'doctype', 'html' and '>', or
- process the compound node and ignore its child nodes.
The cheap solution would be to use a different node type for the '>' of '<!doctype html>'.
""" def walk_callback(node, is_compound): nonlocal walk_html_tree_test_result, last_node_to s = repr(node.text.decode("utf8")) if len(s) > 50: s = s[0:50] + "..." if not is_compound: node_source = input_html[last_node_to:node.range.end_byte] last_node_to = node.range.end_byte node_source = node_source.decode("utf8") if len(node_source) > 50: node_source = node_source[0:50] + "..." print(f"node {node.kind_id} = {node.type}: {s} -> {repr(node_source)}") else: print(f"# node {node.kind_id} = {node.type}: {s}") import tree_sitter import tree_sitter_languages tree_sitter_html = tree_sitter_languages.get_parser("html") html_parser = tree_sitter_html html_tree = html_parser.parse(input_html) top_node = html_tree.root_node walk_html_tree(top_node, walk_callback) ```