Open mpadge opened 1 month ago
Using same test repo from #61
path <- "/<local>/<path>/<to>/teal"
t0 <- system.time (
tags_ctags <- withr::with_dir (path, pkgstats:::get_ctags ("R", has_tabs = FALSE))
)
t1 <- system.time (
tags_tree_sitter <- pkgstats:::get_treesitter_tags (path)
)
t2 <- system.time (
tags_checkglobals <- as.data.frame (checkglobals::check_pkg(path))
)
Then the output:
message (
"times (ctags; ts; checkglobals) = (",
round (t0 [3], digits = 2),
"; ",
round (t1 [3], digits = 2),
"; ",
round (t2 [3], digits = 2),
"); ratios to ctags = ",
round (t1 [3] / t0 [3], digits = 1),
"; ",
round (t2 [3] / t0 [3], digits = 1)
)
#> times (ctags; ts; checkglobals) = (0.12; 1.24; 0.1); ratios to ctags = 10; 0.8
And numbers of tagged function calls:
message (
"numbers of tags (ctags; ts; checkglobals) = (",
format (nrow (tags_ctags), big.mark = ","),
"; ",
format (nrow (tags_tree_sitter), big.mark = ","),
"; ",
format (nrow (tags_checkglobals), big.mark = ","),
")"
)
#> numbers of tags (ctags; ts; checkglobals) = (890; 2,214; 170)
Created on 2024-09-13 with reprex v2.1.1
Attention: Patch coverage is 95.60440%
with 4 lines
in your changes missing coverage. Please review.
Project coverage is 73.06%. Comparing base (
3cbf7f1
) to head (d7bf168
).
Files with missing lines | Patch % | Lines |
---|---|---|
R/external-calls.R | 81.81% | 4 Missing :warning: |
:umbrella: View full report in Codecov by Sentry.
:loudspeaker: Have feedback on the report? Share it here.
Comparison of identified external calls between ctags and tree-sitter:
compare_external_calls <- function (path) {
has_tabs <- FALSE
pkg_name <- basename (path)
tags_r <- withr::with_dir (path, get_ctags ("R", has_tabs))
ex_r <- pkgstats:::external_call_network (tags_r, path, pkg_name)
tags_r <- get_treesitter_tags (path)
ex_t <- pkgstats:::external_call_network (tags_r, path, pkg_name)
n <- c (
nrow (ex_r),
nrow (ex_t),
length (which (!ex_r$package == "base")),
length (which (!ex_t$package == "base"))
)
res <- data.frame (
tag_src = c ("ctags", "treesit", "ctags_no_base", "treesit_no_base"),
n = format (n, big.mark = ",")
)
rel_diff_all <- round (100 * (n [2] / n [1] - 1))
rel_diff_no_base <- round (100 * (n [4] / n [3] - 1))
cli::cli_alert_info ("tree-sitter detects {rel_diff_all}% more tags in total, and {rel_diff_no_base}% of non-base calls")
res
}
Then results from a few packages:
compare_external_calls ("/<local>/<path>/<to>/teal")
#> ℹ tree-sitter detects 59% more tags in total, and 16% of non-base calls
#> tag_src n
#> 1 ctags 1,091
#> 2 treesit 1,737
#> 3 ctags_no_base 495
#> 4 treesit_no_base 574
compare_external_calls ("/<local>/<path>/<to>/dodgr")
#> ℹ tree-sitter detects 118% more tags in total, and 27% of non-base calls
#> tag_src n
#> 1 ctags 1,271
#> 2 treesit 2,776
#> 3 ctags_no_base 522
#> 4 treesit_no_base 665
compare_external_calls ("/<local>/<path>/<to>/pkgstats")
#> ℹ tree-sitter detects 158% more tags in total, and 25% of non-base calls
#> tag_src n
#> 1 ctags 1,030
#> 2 treesit 2,657
#> 3 ctags_no_base 302
#> 4 treesit_no_base 377
Created on 2024-09-13 with reprex v2.1.1
So there's quite a big difference in identified calls to base R functions, but beyond that differences are generally only around 20%, which may well be tolerable...? The tree-sitter
analyses really are enormously more complicated, and currently much slower, so at least in this pure-R form, likely not really worth implementing here for the moment.
FWIW I think if you can figure out how to make one or more tree-sitter queries that grab all the information you want, then you'd be pretty happy. It's pretty darn fast that way (not manually walking the tree at the R level).
Most of the remaining time is reading the files with brio which is unavoidable, and the vapply()
s to extract out the info you care about.
(Note this doesn't return everything you seem to care about, but its kinda close from what i can tell)
(For some reason the query()
function is quite expensive so its best to do it once up front)
library(treesitter)
language <- treesitter.r::language()
QUERY_FUNCTIONS <- r"(
(binary_operator
lhs: (identifier) @name
operator: "<-"
rhs: (function_definition) @fn
)
)"
QUERY_FUNCTIONS <- query(language, QUERY_FUNCTIONS)
QUERY_CALLS <- r"(
(call
function: [
(identifier) @name
(namespace_operator) @name
]
)
)"
QUERY_CALLS <- query(language, QUERY_CALLS)
get_calls_in_functions <- function(node) {
functions <- query_captures(QUERY_FUNCTIONS, node)
names <- functions$node[functions$name == "name"]
bodies <- functions$node[functions$name == "fn"]
tibble::new_tibble(list(
fn = vapply(names, node_text, character(1)),
info = lapply(bodies, get_calls)
))
}
get_calls <- function(node) {
captures <- query_captures(QUERY_CALLS, node)
name <- vapply(captures$node, function(node) node_text(node), character(1))
start <- vapply(captures$node, function(node) point_row(node_start_point(node)), double(1))
end <- vapply(captures$node, function(node) point_row(node_end_point(node)), double(1))
tibble::new_tibble(list(
name = name,
start = start,
end = end
))
}
get_calls_in_package <- function(path) {
path_r <- file.path(path, "R")
paths <- fs::dir_ls(path_r)
paths <- as.character(paths)
parser <- parser(treesitter.r::language())
out <- vector("list", length = length(paths))
for (i in seq_along(out)) {
path <- paths[[i]]
text <- brio::read_file(path)
tree <- parser_parse(parser, text)
node <- tree_root_node(tree)
elt <- get_calls_in_functions(node)
elt[["file"]] <- as.character(path)
out[[i]] <- elt
}
out <- vctrs::list_unchop(out)
out <- tidyr::unnest(out, info)
out
}
path <- "~/files/r/playground/packages/teal"
bench::mark(
get_calls_in_package(path),
pkgstats:::get_treesitter_tags(path),
check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl>
#> 1 get_calls_in_package(path) 44.1ms 46.8ms 21.0 10.2MB 28.7
#> 2 pkgstats:::get_treesitter_tags(p… 582.1ms 582.1ms 1.72 11.5MB 41.2
Created on 2024-09-18 with reprex v2.0.2
Thanks so much @DavisVaughan, that's a brilliant solution! I'll definitely implement a version of that then.
Just keep in mind that this whole system currently tags everything with ctags
compiled with all languages, so just cruises across language barriers without a glitch, enabling object-reference networks like this (may take time to load; and is auto-generated as part of our software review process, in that case from this review). And that is my primary motivation for https://github.com/DavisVaughan/r-tree-sitter/issues/21, so let me know if I can help there in any way.
This is not intended to be merged just yet, because walking the tree from R is currently too slow. It's a proof-of-principle until some of the C functions from {treesitter} and {treesitter.r} can be exposed.