trinker / textreadr

Tools to uniformly read in text data including semi-structured transcripts
74 stars 5 forks source link

Add a `read_dir_transcript` #4

Closed trinker closed 7 years ago

trinker commented 7 years ago
# Directory of example transcripts that ships with the textreadr package.
path <- system.file("docs/transcripts", package = "textreadr")
# Local alias for textreadr's unexported `list_files` helper, which
# `read_dir_transcript()` below calls by its bare name.
list_files <- utils::getFromNamespace("list_files", "textreadr")

#' Read every transcript in a directory into a single object
#'
#' Lists the files under `path` with `list_files()`, reads each one with
#' `read_transcript()`, and hands the successful reads to
#' `textshape::tidy_list()`. Files that fail to read are reported in a
#' warning and left out of the result.
#'
#' @param path Location whose files are to be read.
#' @param col.names Character vector (three names by default). The first
#'   element is passed to `textshape::tidy_list()` (presumably as the name
#'   of the column holding the document names); the remaining elements are
#'   passed to `read_transcript()` as its `col.names`.
#' @param pattern Optional regular expression. When not `NULL`, only files
#'   whose base name matches are read.
#' @param all.files,recursive Passed on to `list_files()`.
#' @param skip Passed to `read_transcript()` as `skip`. Either a single
#'   value used for every file, or one value per file to be read (after
#'   any `pattern` filtering).
#' @param ... Further arguments forwarded to `read_transcript()`.
#' @return The result of `textshape::tidy_list()` applied to the
#'   successfully read transcripts, which are named by file name without
#'   extension.
read_dir_transcript <- function(path, col.names = c("Document", "Person", "Dialogue"), 
    pattern = NULL, all.files = FALSE, 
    recursive = FALSE, skip = 0, ...) {

    to_read_in <- list_files(path, all.files = all.files, full.names = TRUE, recursive = recursive)
    if (length(to_read_in) == 0) {
        stop("The following location does not appear to contain files:\n   -", path)
    }

    # `pattern` used to be accepted but silently ignored. Filter on the base
    # name here so the behavior does not depend on `list_files()`'s signature.
    if (!is.null(pattern)) {
        to_read_in <- to_read_in[grepl(pattern, basename(to_read_in))]
        if (length(to_read_in) == 0) {
            stop("No files in the following location match `pattern`:\n   -", path)
        }
    }

    # `skip` is either one value for all files or one value per file. Any
    # other length would be recycled by `Map()` and pair values with the
    # wrong files (or leave results unnamed), so reject it up front.
    if (length(skip) == 1) {
        skip <- rep(skip, length(to_read_in))
    } else if (length(skip) != length(to_read_in)) {
        stop("`skip` must be either length 1 or equal in length to documents in `path`")
    }

    # TODO: recycle the other per-file arguments of `read_transcript()` the
    # same way `skip` is handled (an earlier draft attempted this).

    # Wrap each read in `try()` so one unreadable file does not abort the rest.
    reads <- Map(function(x, y) {
        try(read_transcript(x, col.names = col.names[-1], skip = y, ...))
    }, to_read_in, skip)

    names(reads) <- tools::file_path_sans_ext(basename(to_read_in))

    # TRUE for every file that was read without raising an error.
    goods <- vapply(reads, function(x) !inherits(x, "try-error"), logical(1))
    if (any(!goods)) {
        warning(paste0("The following files did not read in correctly:\n",
            paste0('  - ', to_read_in[!goods], collapse = "\n")
        ))
    }
    textshape::tidy_list(reads[goods], col.names[1])

}

# Load the tidyverse (supplies `%>%` and `mutate()` used below). The call is
# namespace-qualified so it works even when pacman has not been attached.
pacman::p_load(tidyverse)
# Bind textreadr's `peek` to a local name so it can be called unqualified.
peek <- textreadr::peek
Note: the separator needs to be repeated (as `skip` is), or a separator-detection step is needed.
# `skip` values handed to `read_dir_transcript()`, one per transcript file.
skips <- c(0, 1, 1, 0, 0, 1)
# Directory of example transcripts that ships with the textreadr package.
path <- system.file("docs/transcripts", package = "textreadr")

# Read all transcripts, apply `textclean::filter_row()` to the "Person"
# column with the pattern "^\\[" (presumably dropping bracketed rows -
# confirm against textclean), strip a leading "/" and a trailing ":" from
# `Person` and a leading "/" from `Dialogue`, then peek at the result.
read_dir_transcript(path, skip = skips) %>%
    textclean::filter_row("Person", "^\\[") %>%
    mutate(
        Person = trimws(
            stringi::stri_replace_all_regex(Person, "(^/\\s*)|(:\\s*$)", "")
        ),
        Dialogue = stringi::stri_replace_all_regex(Dialogue, "(^/\\s*)", "")
    ) %>%
    peek(Inf)

# Same read without the clean-up steps, for comparison.
read_dir_transcript(path, skip = skips) %>%
    textreadr::peek(Inf)