Open mpadge opened 3 years ago
This uses Davis Vaughan's implementation of tree-sitter in r, based on tree-walking code I've already implemented in pkgsimil
. Davis's version exposes the tree cursor directly in R, and does not expose the tree-sitter api.h
file, so all of these loops must be done in R rather than C. That makes this proof-of-principle only; any real implementations would have to be done in C like in pkgsimil
.
This code includes debugging lines switched off with cli_out <- FALSE
at the outset.
NA_to_null <- function (i) ifelse (is.na (i), "", i)
walk_one_tree <- function (tree) {
cli_out <- FALSE
it <- tree_walk (tree)
reached_foot <- FALSE
first_identifier <- TRUE
get_next_open <- FALSE
grammar_types <- node_text <- next_open <- fn_name <- character (0L)
while (!reached_foot) {
field_name <- NA_to_null (it$field_name ())
grammar_type <- NA_to_null (node_grammar_type (it$node ()))
if (field_name == "function" && grammar_type != "extract_operator") {
if (cli_out) {
cli::cli_h1 ("function")
print (it$node ())
cli::cli_alert_info (cli::col_green ("field name: {it$field_name()}"))
cli::cli_alert_info (cli::col_green ("grammar type: {grammar_type}"))
cli::cli_alert_info (cli::col_green ("grammar symbol: {node_grammar_symbol(it$node())}"))
cli::cli_alert_info (cli::col_green ("node is named: {node_is_named(it$node())}"))
cli::cli_alert_info (cli::col_green ("node text: {node_text(it$node())}"))
cli::cli_alert_info (cli::col_green ("node symbol: {node_symbol(it$node())}"))
}
grammar_types <- c (grammar_types, grammar_type)
node_text <- c (node_text, node_text (it$node ()))
get_next_open <- TRUE
} else if (grammar_type == "identifier" && first_identifier) {
if (cli_out) {
cli::cli_h1 ("identifier")
print (it$node ())
cli::cli_alert_info (cli::col_green ("field name: {it$field_name()}"))
cli::cli_alert_info (cli::col_green ("grammar type: {grammar_type}"))
cli::cli_alert_info (cli::col_green ("grammar symbol: {node_grammar_symbol(it$node())}"))
cli::cli_alert_info (cli::col_green ("node is named: {node_is_named(it$node())}"))
cli::cli_alert_info (cli::col_green ("node text: {node_text(it$node())}"))
cli::cli_alert_info (cli::col_green ("node symbol: {node_symbol(it$node())}"))
}
fn_name <- node_text (it$node ())
first_identifier <- FALSE
} else if (get_next_open && field_name == "open") {
if (cli_out) {
cli::cli_h1 ("next open")
}
next_open <- c (next_open, grammar_type)
get_next_open <- FALSE
}
if (it$goto_first_child ()) next
if (it$goto_next_sibling ()) next
retracing <- TRUE
while (retracing) {
if (!it$goto_parent ()) {
retracing <- FALSE
reached_foot <- TRUE
}
if (it$goto_next_sibling ()) {
retracing <- FALSE
}
}
}
data.frame (
fn_name = fn_name,
grammar_type = grammar_types,
node_text = node_text
) [which (next_open != "["), ]
}
That walk_one_tree()
function can then be used to extract all function calls like this, using one R file from the code base discussed in #61.
library (treesitter)
#>
#> Attaching package: 'treesitter'
#> The following object is masked from 'package:base':
#>
#> range
language <- treesitter.r::language()
parser <- parser(language)
f <- "/<local>/<path>/<to>/teal/R/include_css_js.R"
parse_list <- parse (f)
fn_calls <- lapply (parse_list, function (p) {
txt <- paste0 (as.character (p), collapse = "\n")
tree <- parser_parse(parser, txt)
walk_one_tree (tree)
})
do.call (rbind, fn_calls)
#> fn_name grammar_type node_text
#> 1 include_css_files identifier list.files
#> 2 include_css_files identifier system.file
#> 3 include_css_files identifier singleton
#> 4 include_css_files identifier lapply
#> 11 include_js_files namespace_operator checkmate::assert_character
#> 21 include_js_files identifier list.files
#> 31 include_js_files identifier system.file
#> 5 include_js_files identifier basename
#> 6 include_js_files identifier singleton
#> 7 include_js_files identifier lapply
#> 12 run_js_files namespace_operator checkmate::assert_character
#> 22 run_js_files identifier lapply
#> 32 run_js_files namespace_operator shinyjs::runjs
#> 41 run_js_files identifier paste0
#> 51 run_js_files identifier readLines
#> 61 run_js_files identifier system.file
#> 71 run_js_files identifier invisible
#> 13 include_teal_css_js identifier tagList
#> 23 include_teal_css_js namespace_operator shinyjs::useShinyjs
#> 33 include_teal_css_js identifier include_css_files
#> 42 include_teal_css_js identifier include_js_files
#> 52 include_teal_css_js namespace_operator shinyjs::hidden
#> 62 include_teal_css_js identifier icon
Created on 2024-09-11 with reprex v2.1.1
Here's an improved version of the function, minus the debugging lines:
walk_one_tree <- function (tree) {
it <- treesitter::tree_walk (tree)
reached_foot <- FALSE
first_identifier <- TRUE
get_next_open <- FALSE
grammar_types <- node_text <- next_open <- fn_name <- character (0L)
while (!reached_foot) {
field_name <- NA_to_null (it$field_name ())
grammar_type <- NA_to_null (treesitter::node_grammar_type (it$node ()))
if (field_name == "function" && !grammar_type %in% c ("call", "extract_operator")) {
grammar_types <- c (grammar_types, grammar_type)
node_text <- c (node_text, treesitter::node_text (it$node ()))
get_next_open <- TRUE
} else if (grammar_type == "identifier" && first_identifier) {
fn_name <- treesitter::node_text (it$node ())
first_identifier <- FALSE
} else if (get_next_open && field_name == "open") {
next_open <- c (next_open, grammar_type)
get_next_open <- FALSE
}
if (it$goto_first_child ()) next
if (it$goto_next_sibling ()) next
retracing <- TRUE
while (retracing) {
if (!it$goto_parent ()) {
retracing <- FALSE
reached_foot <- TRUE
}
if (it$goto_next_sibling ()) {
retracing <- FALSE
}
}
}
# This line ensures fn_name is also length 0 when no data are parsed:
fn_name <- rep (fn_name, length (grammar_types))
data.frame (
fn_name = fn_name,
grammar_type = grammar_types,
node_text = node_text
) [which (next_open != "["), ]
}
And here is a comparison with current ctags
approach, as well as with checkglobals
as suggested by @pawelru in #61. Note that checkglobals
does not trace all function calls, rather it only identifies single unique function calls, and so returns only unique names of functions.
Function to use treesitter
to trace all calls in R/
directory of package, via the walk_one_tree()
function defined above:
language <- treesitter.r::language()
parser <- treesitter::parser(language)
path <- "/<local>/<path>/<to>/teal"
tree_sitter_calls <- function (path) {
flist <- fs::dir_ls (path, pattern = "\\.R$")
fn_calls <- lapply (flist, function (f) {
parse_list <- pkgstats:::control_parse (f)
fn_calls <- lapply (parse_list, function (p) {
txt <- paste0 (as.character (p), collapse = "\n")
tree <- treesitter::parser_parse(parser, txt)
walk_one_tree (tree)
})
res <- do.call (rbind, fn_calls)
cbind (file = rep (f, nrow (res)), res)
})
fn_calls <- do.call (rbind, fn_calls)
}
Then code to compare the three approaches:
ctags_calls <- function (path) {
withr::with_dir (path, pkgstats:::get_ctags ("R", has_tabs = FALSE))
}
t0 <- system.time (
tags_ctags <- ctags_calls (path)
)
t1 <- system.time (
tags_tree_sitter <- tree_sitter_calls (file.path (path, "R"))
)
t2 <- system.time (
tags_checkglobals <- as.data.frame (checkglobals::check_pkg(path))
)
Then comparison of calculation times:
message (
"times (ctags; ts; checkglobals) = (",
round (t0 [3], digits = 2),
"; ",
round (t1 [3], digits = 2),
"; ",
round (t2 [3], digits = 2),
"); ratios to ctags = ",
round (t1 [3] / t0 [3], digits = 1),
"; ",
round (t2 [3] / t0 [3], digits = 1)
)
#> times (ctags; ts; checkglobals) = (0.34; 1.19; 0.22); ratios to ctags = 3.5; 0.6
And finally numbers of identified calls:
message (
"numbers of tags (ctags; ts; checkglobals) = (",
format (nrow (tags_ctags), big.mark = ","),
"; ",
format (nrow (tags_tree_sitter), big.mark = ","),
"; ",
format (nrow (tags_checkglobals), big.mark = ","),
")"
)
#> numbers of tags (ctags; ts; checkglobals) = (890; 2,214; 170)
Created on 2024-09-11 with reprex v2.1.1
This issue now has to be paused because of https://github.com/DavisVaughan/r-tree-sitter/issues/21#issuecomment-2343440516. The bundled versions of all relevant C libraries are currently in https://github.com/ropensci-review-tools/pkgsimil, but that makes for a 24MB src/
directory, so is not a viable approach for any package intended for CRAN.
The playground on the main docs site demonstrates that tree-sitter indeed generates everything that is currently extracted by combining
ctags
+gtags
.