al-obrien / farrago

GNU General Public License v3.0
3 stars 0 forks source link

Add more path options to find_file_content #6

Open al-obrien opened 3 months ago

al-obrien commented 3 months ago

Currently find_file_content only takes one path, but would be nice to have a set of files searched. This could be done by looping over the list.files step in the function and finding unique ones...

Furthermore, a full path may be preferable when several paths chosen as an output.

There may also be some improvements by having readLines used instead of read_file...

find_file_content <- function(path, file_pattern = NULL, text_pattern, recursive = FALSE, locale = readr::default_locale(), ...) {

  file_list <- list.files(path, recursive = recursive, pattern = file_pattern, full.names = TRUE)
  file_names <- basename(file_list)

  file_names[sapply(X = as.list(file_list), FUN = function(x) grepl(pattern = text_pattern, readr::read_file(x, locale = locale), ...))]
}
al-obrien commented 3 months ago

To improve performance, if want to have various patterns searched across several files in a path, then load once, and apply each set while that is loaded (to avoid many load operations)

# Get file paths and names thereof
file_list <- list.files(normalizePath('C:\\Users\\Documents\\Projects\\'), recursive = TRUE, pattern = '\\.R$',  full.names = TRUE)
file_names <- basename(file_list)

file_search_f <- function(file, patterns ) {
    fileR <- readr::read_file(file, locale = readr::default_locale())
    patterns[sapply(patterns, function(x) grepl(pattern = x, x = fileR))]
}
tmp_list <- lapply(as.list(file_list), function(x) file_search_f(x,c('dplyr', 'tidyr')))
names(tmp_list) <- file_names

If readLines is faster, could try this too instead (seems to be 2-3 times faster...)

file_search_f2 <- function(file, patterns ) {
    fileR <- readLines(file)
    patterns[sapply(patterns, function(x) any(grepl(pattern = x, x = fileR)))]
}
al-obrien commented 2 months ago

Updated option

find_file_content <- function(path, file_pattern = NULL, text_pattern, recursive = FALSE, encoding = 'unknown', as.data.frame = TRUE, normalize_path = TRUE, abbrv = FALSE, ...) {

  if(normalize_path) path <- unique(normalizePath(path))
  file_list <- unique(list.files(path, recursive = recursive, pattern = file_pattern, full.names = TRUE))
  if(length(file_list)==0) invisible(NULL)
  file_names <- basename(file_list)

  srch_f <- function(file, text_pattern, encoding = encoding, ...) {
    file_in <- readLines(file, encoding = encoding)
    vapply(text_pattern,
           FUN = function(patt) {any(grepl(pattern = patt, x = file_in, ...))},
           FUN.VALUE = logical(1))
  }

  # Use FURRR to allow to loop over files
  pttn_l <- length(text_pattern)
  srch_rslt <- furrr::future_map(as.list(file_list),
                                 function(x) srch_f(x, text_pattern, encoding, ...))

  # Convert to matrix version
  srch_rslt <- matrix(unlist(srch_rslt, use.names = FALSE), nrow = pttn_l, byrow = FALSE)
  rownames(srch_rslt) <- text_pattern

  # Vapply to loop across list (faster if not long list than furrr)
  # srch_rslt <- vapply(X = as.list(file_list),
  #                     FUN = function(x) srch_f(x, text_pattern, encoding, ...),
  #                     FUN.VALUE = logical(pttn_l))

  # Correction on 1 dimension returns (ensure still a matrix)
  # if(pttn_l == 1) {
  #   srch_rslt <- t(as.matrix(srch_rslt))
  #   rownames(srch_rslt) <- text_pattern
  # }

  if(abbrv) colnames(srch_rslt) <- file_names else colnames(srch_rslt) <- file_list

  if(as.data.frame) {
    true_idx <- unname(which(srch_rslt, arr.ind = TRUE))
    srch_rslt <- data.frame(text_pattern = text_pattern,
                            found = unname(rowSums(srch_rslt) > 0),
                            file = NA_character_,
                            stringsAsFactors = FALSE)
    if(abbrv) nms2use <- file_names else nms2use <- file_list
    file_insert <- vapply(split(true_idx[,2], true_idx[,1]),
                          function(x) paste0(nms2use[x], collapse = ', '),
                          FUN.VALUE = character(1))
    srch_rslt[as.numeric(names(file_insert)), 'file'] <- file_insert

  }

  srch_rslt
}