Sage-Bionetworks / challengescoring

This R package provides scoring mechanisms for computational challenges and implements the bayesBootLadderBoot approach for avoiding test data leakage.
Apache License 2.0
3 stars 0 forks source link

add validation functions #14

Open allaway opened 4 years ago

allaway commented 4 years ago

I've reused a lot of validation functions. It seems like these could easily be turned into package functions.

Examples:


#' Truncate a vector for display
#'
#' @param vec A vector.
#' @param trim Maximum number of elements to keep (default 10).
#' @return `vec` unchanged when it has at most `trim` elements; otherwise a
#'   character vector holding the first `trim` elements followed by `"..."`.
trim_vec <- function(vec, trim = 10) {
  if (length(vec) > trim) {
    # Build the truncated vector in one expression and return it explicitly
    # below: a branch that ends on `vec[i] <- "..."` evaluates to the assigned
    # value, so the caller would receive only "..." instead of the vector.
    # seq_len() keeps trim = 0 safe (1:0 would count backwards).
    vec <- c(as.character(vec[seq_len(trim)]), "...")
  }
  vec
}

#' Validate a prediction file against a template file
#'
#' Reads both CSV files and collects one message for every check the
#' prediction file fails, instead of stopping at the first problem.
#'
#' Checks performed, in order: column count, row count, column names, value
#' range (0 to 1), numeric type of the value columns, and target identifiers.
#' Every column after the first is treated as a value column, and identifiers
#' are read from the `target` column.
#'
#' @param prediction_path Path to the prediction CSV file.
#' @param template_path Path to the template CSV file that defines the
#'   required dimensions, column names and `target` identifiers.
#' @return A named list of error messages (character strings). The list is
#'   empty when every check passes.
validate <- function(prediction_path, template_path) {
  pred <- readr::read_csv(prediction_path)
  temp <- readr::read_csv(template_path)

  # Requirements derived from the template.
  ncol_req <- ncol(temp)
  nrow_req <- nrow(temp)
  colnames_req <- colnames(temp)
  target_ids <- unique(temp[["target"]])

  errs <- list()

  if (ncol(pred) < ncol_req) {
    errs[["ncol_short"]] <- paste0("Prediction file is missing cols. Only ", ncol(pred), " cols detected.")
  }

  if (ncol(pred) > ncol_req) {
    errs[["ncol_long"]] <- paste0("Prediction file has extra  cols ", ncol(pred), " cols detected.")
  }

  if (nrow(pred) < nrow_req) {
    errs[["nrow_short"]] <- paste0("Prediction file is missing rows Only ", nrow(pred), " rows detected.")
  }

  if (nrow(pred) > nrow_req) {
    errs[["nrow_long"]] <- paste0("Prediction file has extra  rows ", nrow(pred), " rows detected.")
  }

  # Compare against the template's column NAMES (not the template itself).
  # The comparison is positional because the value checks below also select
  # columns by position.
  if (!identical(colnames(pred), colnames_req)) {
    # paste(collapse =) builds the text; cat() would print it and return NULL.
    errs[["colnames"]] <- paste0(
      "Column names are not correct. Column names must be ",
      paste(colnames_req, collapse = ", ")
    )
  }

  # Every column after the first; safe even when the file has no columns.
  value_cols <- pred[seq_len(ncol(pred))[-1]]
  is_num <- vapply(value_cols, is.numeric, logical(1), USE.NAMES = FALSE)

  # Range is only meaningful for numeric columns. A missing value counts as
  # out of range rather than making the `if` condition NA.
  in_range <- vapply(
    value_cols[is_num],
    function(col) all(!is.na(col) & col >= 0 & col <= 1),
    logical(1),
    USE.NAMES = FALSE
  )
  if (!all(in_range)) {
    errs[["wrong_range"]] <- paste0("Confidence values are not between 0 and 1.")
  }

  if (!all(is_num)) {
    errs[["non_numeric"]] <- paste0("Predictions are not all numeric values.")
  }

  # Distinct identifiers present in the prediction but not in the template.
  invalid <- setdiff(pred[["target"]], target_ids)
  if (length(invalid) > 0) {
    shown <- as.character(utils::head(invalid, 10))
    if (length(invalid) > 10) {
      shown <- c(shown, "...")
    }
    # Collapse to a single string so the list entry holds the whole message.
    errs[["non_target"]] <- paste0(
      "Invalid target identifiers were included in your prediction file (up to 10 displayed): ",
      paste(shown, collapse = ", ")
    )
  }

  errs
}