Open al-obrien opened 4 months ago
To improve performance, if want to have various patterns searched across several files in a path, then load once, and apply each set while that is loaded (to avoid many load operations)
# Get file paths and names thereof
file_list <- list.files(normalizePath('C:\\Users\\Documents\\Projects\\'), recursive = TRUE, pattern = '\\.R$', full.names = TRUE)
file_names <- basename(file_list)
file_search_f <- function(file, patterns ) {
fileR <- readr::read_file(file, locale = readr::default_locale())
patterns[sapply(patterns, function(x) grepl(pattern = x, x = fileR))]
}
tmp_list <- lapply(as.list(file_list), function(x) file_search_f(x,c('dplyr', 'tidyr')))
names(tmp_list) <- file_names
If readLines is faster, could try this too instead (seems to be 2-3 times faster...)
file_search_f2 <- function(file, patterns ) {
fileR <- readLines(file)
patterns[sapply(patterns, function(x) any(grepl(pattern = x, x = fileR)))]
}
Updated option
find_file_content <- function(path, file_pattern = NULL, text_pattern, recursive = FALSE, encoding = 'unknown', as.data.frame = TRUE, normalize_path = TRUE, abbrv = FALSE, ...) {
if(normalize_path) path <- unique(normalizePath(path))
file_list <- unique(list.files(path, recursive = recursive, pattern = file_pattern, full.names = TRUE))
if(length(file_list)==0) invisible(NULL)
file_names <- basename(file_list)
srch_f <- function(file, text_pattern, encoding = encoding, ...) {
file_in <- readLines(file, encoding = encoding)
vapply(text_pattern,
FUN = function(patt) {any(grepl(pattern = patt, x = file_in, ...))},
FUN.VALUE = logical(1))
}
# Use FURRR to allow to loop over files
pttn_l <- length(text_pattern)
srch_rslt <- furrr::future_map(as.list(file_list),
function(x) srch_f(x, text_pattern, encoding, ...))
# Convert to matrix version
srch_rslt <- matrix(unlist(srch_rslt, use.names = FALSE), nrow = pttn_l, byrow = FALSE)
rownames(srch_rslt) <- text_pattern
# Vapply to loop across list (faster if not long list than furrr)
# srch_rslt <- vapply(X = as.list(file_list),
# FUN = function(x) srch_f(x, text_pattern, encoding, ...),
# FUN.VALUE = logical(pttn_l))
# Correction on 1 dimension returns (ensure still a matrix)
# if(pttn_l == 1) {
# srch_rslt <- t(as.matrix(srch_rslt))
# rownames(srch_rslt) <- text_pattern
# }
if(abbrv) colnames(srch_rslt) <- file_names else colnames(srch_rslt) <- file_list
if(as.data.frame) {
true_idx <- unname(which(srch_rslt, arr.ind = TRUE))
srch_rslt <- data.frame(text_pattern = text_pattern,
found = unname(rowSums(srch_rslt) > 0),
file = NA_character_,
stringsAsFactors = FALSE)
if(abbrv) nms2use <- file_names else nms2use <- file_list
file_insert <- vapply(split(true_idx[,2], true_idx[,1]),
function(x) paste0(nms2use[x], collapse = ', '),
FUN.VALUE = character(1))
srch_rslt[as.numeric(names(file_insert)), 'file'] <- file_insert
}
srch_rslt
}
Currently
find_file_content
only takes one path, but would be nice to have a set of files searched. This could be done by looping over thelist.files
step in the function and finding unique ones...Furthermore, a full path may be preferable when several paths chosen as an output.
There may also be some improvements by having readLines used instead of read_file...