ropensci-review-tools / pkgstats

Historical statistics of every R package ever
https://docs.ropensci.org/pkgstats/
17 stars 1 forks source link

use tree-sitter to parse everything #4

Open mpadge opened 3 years ago

mpadge commented 3 years ago

The playground on the main docs site demonstrates that tree-sitter indeed generates everything that is currently extracted by combining ctags + gtags.

mpadge commented 2 months ago

This uses Davis Vaughan's implementation of tree-sitter in r, based on tree-walking code I've already implemented in pkgsimil. Davis's version exposes the tree cursor directly in R, and does not expose the tree-sitter api.h file, so all of these loops must be done in R rather than C. That makes this proof-of-principle only; any real implementations would have to be done in C like in pkgsimil.

This code includes debugging lines switched off with cli_out <- FALSE at the outset.

NA_to_null <- function (i) ifelse (is.na (i), "", i)
walk_one_tree <- function (tree) {

    cli_out <- FALSE
    it <- tree_walk (tree)

    reached_foot <- FALSE
    first_identifier <- TRUE
    get_next_open <- FALSE
    grammar_types <- node_text <- next_open <- fn_name <- character (0L)
    while (!reached_foot) {
        field_name <- NA_to_null (it$field_name ())
        grammar_type <- NA_to_null (node_grammar_type (it$node ()))
        if (field_name == "function" && grammar_type != "extract_operator") {
            if (cli_out) {
                cli::cli_h1 ("function")
                print (it$node ())
                cli::cli_alert_info (cli::col_green ("field name: {it$field_name()}"))
                cli::cli_alert_info (cli::col_green ("grammar type: {grammar_type}"))
                cli::cli_alert_info (cli::col_green ("grammar symbol: {node_grammar_symbol(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node is named: {node_is_named(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node text: {node_text(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node symbol: {node_symbol(it$node())}"))
            }
            grammar_types <- c (grammar_types, grammar_type)
            node_text <- c (node_text, node_text (it$node ()))
            get_next_open <- TRUE
        } else if (grammar_type == "identifier" && first_identifier) {
            if (cli_out) {
                cli::cli_h1 ("identifier")
                print (it$node ())
                cli::cli_alert_info (cli::col_green ("field name: {it$field_name()}"))
                cli::cli_alert_info (cli::col_green ("grammar type: {grammar_type}"))
                cli::cli_alert_info (cli::col_green ("grammar symbol: {node_grammar_symbol(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node is named: {node_is_named(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node text: {node_text(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node symbol: {node_symbol(it$node())}"))
            }
            fn_name <- node_text (it$node ())
            first_identifier <- FALSE
        } else if (get_next_open && field_name == "open") {
            if (cli_out) {
                cli::cli_h1 ("next open")
            }
            next_open <- c (next_open, grammar_type)
            get_next_open <- FALSE
        }

        if (it$goto_first_child ()) next
        if (it$goto_next_sibling ()) next

        retracing <- TRUE
        while (retracing) {
            if (!it$goto_parent ()) {
                retracing <- FALSE
                reached_foot <- TRUE
            }
            if (it$goto_next_sibling ()) {
                retracing <- FALSE
            }
        }
    }

    data.frame (
        fn_name = fn_name,
        grammar_type = grammar_types,
        node_text = node_text
    ) [which (next_open != "["), ]
}

That walk_one_tree() function can then be used to extract all function calls like this, using one R file from the code base discussed in #61.

library (treesitter)
#> 
#> Attaching package: 'treesitter'
#> The following object is masked from 'package:base':
#> 
#>     range
language <- treesitter.r::language()
parser <- parser(language)
f <- "/<local>/<path>/<to>/teal/R/include_css_js.R"
parse_list <- parse (f)

fn_calls <- lapply (parse_list, function (p) {
    txt <- paste0 (as.character (p), collapse = "\n")
    tree <- parser_parse(parser, txt)
    walk_one_tree (tree)
})
do.call (rbind, fn_calls)
#>                fn_name       grammar_type                   node_text
#> 1    include_css_files         identifier                  list.files
#> 2    include_css_files         identifier                 system.file
#> 3    include_css_files         identifier                   singleton
#> 4    include_css_files         identifier                      lapply
#> 11    include_js_files namespace_operator checkmate::assert_character
#> 21    include_js_files         identifier                  list.files
#> 31    include_js_files         identifier                 system.file
#> 5     include_js_files         identifier                    basename
#> 6     include_js_files         identifier                   singleton
#> 7     include_js_files         identifier                      lapply
#> 12        run_js_files namespace_operator checkmate::assert_character
#> 22        run_js_files         identifier                      lapply
#> 32        run_js_files namespace_operator              shinyjs::runjs
#> 41        run_js_files         identifier                      paste0
#> 51        run_js_files         identifier                   readLines
#> 61        run_js_files         identifier                 system.file
#> 71        run_js_files         identifier                   invisible
#> 13 include_teal_css_js         identifier                     tagList
#> 23 include_teal_css_js namespace_operator         shinyjs::useShinyjs
#> 33 include_teal_css_js         identifier           include_css_files
#> 42 include_teal_css_js         identifier            include_js_files
#> 52 include_teal_css_js namespace_operator             shinyjs::hidden
#> 62 include_teal_css_js         identifier                        icon

Created on 2024-09-11 with reprex v2.1.1

mpadge commented 2 months ago

Here's an improved version of the function, minus the debugging lines:

walk_one_tree <- function (tree) {

    it <- treesitter::tree_walk (tree)

    reached_foot <- FALSE
    first_identifier <- TRUE
    get_next_open <- FALSE
    grammar_types <- node_text <- next_open <- fn_name <- character (0L)
    while (!reached_foot) {
        field_name <- NA_to_null (it$field_name ())
        grammar_type <- NA_to_null (treesitter::node_grammar_type (it$node ()))
        if (field_name == "function" && !grammar_type %in% c ("call", "extract_operator")) {
            grammar_types <- c (grammar_types, grammar_type)
            node_text <- c (node_text, treesitter::node_text (it$node ()))
            get_next_open <- TRUE
        } else if (grammar_type == "identifier" && first_identifier) {
            fn_name <- treesitter::node_text (it$node ())
            first_identifier <- FALSE
        } else if (get_next_open && field_name == "open") {
            next_open <- c (next_open, grammar_type)
            get_next_open <- FALSE
        }

        if (it$goto_first_child ()) next
        if (it$goto_next_sibling ()) next

        retracing <- TRUE
        while (retracing) {
            if (!it$goto_parent ()) {
                retracing <- FALSE
                reached_foot <- TRUE
            }
            if (it$goto_next_sibling ()) {
                retracing <- FALSE
            }
        }
    }

    # This line ensures fn_name is also length 0 when no data are parsed:
    fn_name <- rep (fn_name, length (grammar_types))
    data.frame (
        fn_name = fn_name,
        grammar_type = grammar_types,
        node_text = node_text
    ) [which (next_open != "["), ]
}
mpadge commented 2 months ago

And here is a comparison with current ctags approach, as well as with checkglobals as suggested by @pawelru in #61. Note that checkglobals does not trace all function calls, rather it only identifies single unique function calls, and so returns only unique names of functions.

Function to use treesitter to trace all calls in R/ directory of package, via the walk_one_tree() function defined above:

language <- treesitter.r::language()
parser <- treesitter::parser(language)
path <- "/<local>/<path>/<to>/teal"

tree_sitter_calls <- function (path) {
    flist <- fs::dir_ls (path, pattern = "\\.R$")
    fn_calls <- lapply (flist, function (f) {
        parse_list <- pkgstats:::control_parse (f)
        fn_calls <- lapply (parse_list, function (p) {
            txt <- paste0 (as.character (p), collapse = "\n")
            tree <- treesitter::parser_parse(parser, txt)
            walk_one_tree (tree)
        })
        res <- do.call (rbind, fn_calls)
        cbind (file = rep (f, nrow (res)), res)
    })
    fn_calls <- do.call (rbind, fn_calls)
}

Then code to compare the three approaches:

ctags_calls <- function (path) {
    withr::with_dir (path, pkgstats:::get_ctags ("R", has_tabs = FALSE))
}

t0 <- system.time (
    tags_ctags <- ctags_calls (path)
)
t1 <- system.time (
    tags_tree_sitter <- tree_sitter_calls (file.path (path, "R"))
)
t2 <- system.time (
    tags_checkglobals <- as.data.frame (checkglobals::check_pkg(path))
)

Then comparison of calculation times:

message (
    "times (ctags; ts; checkglobals) = (", 
    round (t0 [3], digits = 2),
    "; ", 
    round (t1 [3], digits = 2),
    "; ", 
    round (t2 [3], digits = 2),
    "); ratios to ctags = ", 
    round (t1 [3] / t0 [3], digits = 1),
    "; ", 
    round (t2 [3] / t0 [3], digits = 1)
)
#> times (ctags; ts; checkglobals) = (0.34; 1.19; 0.22); ratios to ctags = 3.5; 0.6

And finally numbers of identified calls:

message (
    "numbers of tags (ctags; ts; checkglobals) = (", 
    format (nrow (tags_ctags), big.mark = ","),
    "; ", 
    format (nrow (tags_tree_sitter), big.mark = ","),
    "; ", 
    format (nrow (tags_checkglobals), big.mark = ","),
    ")"
)
#> numbers of tags (ctags; ts; checkglobals) = (890; 2,214; 170)

Created on 2024-09-11 with reprex v2.1.1

mpadge commented 2 months ago

This issue now has to be paused because of https://github.com/DavisVaughan/r-tree-sitter/issues/21#issuecomment-2343440516. The bundled versions of all relevant C libraries are currently in https://github.com/ropensci-review-tools/pkgsimil, but that makes for a 24MB src/ directory, so is not a viable approach for any package intended for CRAN.