DOI-USGS / meddle

Tools for metadata creation and data releases
Other
0 stars 10 forks source link

automate validation of sb item files #69

Open lindsayplatt opened 3 years ago

lindsayplatt commented 3 years ago

Would be nice if there were some functions that validated whether your metadata captured all of the files on SB and whether the attributes included everything they should (or if you had extra stuff you didn't need).

lindsayplatt commented 3 years ago

This is by no means automated but I did use these functions to do some validation of each of the metadata files in mntoha-data-release:

# Validate that the metadata YAML files contain all the information they need

##### Step 0: Setup packages & credentials #####

library(sbtools)
library(yaml)
library(tidyverse)
library(readr)

# Set up ScienceBase credentials (sbtools) before any items are queried below.
# NOTE(review): called with no arguments, so this presumably prompts for the
# username/password interactively -- confirm against the sbtools documentation.
authenticate_sb()

##### Step 1: Do all of the files exist in the metadata? #####

#' Check that every data file on a ScienceBase item is named in the metadata
#'
#' Lists the files attached to a ScienceBase item, drops the XML metadata
#' records, and prints the remaining names next to the `data-name` entries
#' found in the local metadata YAML. The comparison itself is manual: the user
#' confirms the match through an interactive `menu()` prompt.
#'
#' @param sbid_to_check ScienceBase item ID whose attached files are listed.
#' @param local_metadata_file Path to the local metadata YAML file. Each entry
#'   under `entities` is expected to carry a `data-name` field.
#' @return Called for its side effects (messages and a prompt). Stops with an
#'   error unless "Yes" is selected at the prompt.
verify_files_exist <- function(sbid_to_check, local_metadata_file) {
  sbid_files <- as.character(item_list_files(sbid_to_check)$fname)

  # Remove only files whose extension is .xml, as those are the metadata files.
  # The dot is escaped and the pattern anchored so that data files that merely
  # contain "xml" in their name (e.g. "taxml_summary.csv") are kept.
  is_xml <- grepl("\\.xml$", sbid_files, ignore.case = TRUE)
  sbid_data_files <- sort(sbid_files[!is_xml], na.last = TRUE)

  # There is a warning about some imminent change that always shows up
  metadata <- suppressWarnings(yaml.load_file(local_metadata_file))

  # vapply keeps this a character vector even when `entities` is empty or an
  # entity has no `data-name` (reported as NA instead of changing the type).
  metadata_data_files <- unname(vapply(metadata$entities, function(x) {
    data_name <- x[["data-name"]]
    if (is.null(data_name) || length(data_name) == 0) {
      NA_character_
    } else {
      as.character(data_name)[1]
    }
  }, character(1)))

  # Pad whichever column is shorter so the two can sit side by side; the
  # metadata may list fewer OR more names than there are files on ScienceBase.
  n_rows <- max(length(sbid_data_files), length(metadata_data_files))
  pad_to_n_rows <- function(x) c(x, rep("", n_rows - length(x)))

  file_comparison_df <- tibble(
    SB_Files = pad_to_n_rows(sbid_data_files),
    Metadata_Files = pad_to_n_rows(metadata_data_files)
  )

  # TODO: Using the `menu` feature to require manual checks on whether they are represented. Otherwise,
  #   would need to do way more sophisticated regex work, but don't have time for that right now.
  message(paste0(capture.output(as.data.frame(file_comparison_df)), collapse = "\n"))
  message("\nAre all the files in the `SB_Files` column represented by the names in the `Metadata_Files` column?")
  is_correct <- menu(c("Yes", "No"))
  if (is_correct == 1) {
    message("All files captured in metadata")
  } else {
    # Also reached when the menu is exited without a choice (menu() returns 0).
    stop("You selected `No`")
  }

}

##### Step 2: Are the contents of the files represented in the metadata? #####

#' Compare the columns of local CSV files with the attributes in the metadata
#'
#' For every entity in the metadata YAML, asks the user to pick a local CSV
#' file (via `file.choose()`) and reports, through messages, which CSV columns
#' are not described by an `attr-label` and which `attr-label`s have no
#' matching column.
#'
#' @param local_metadata_file Path to the local metadata YAML file. Each entry
#'   under `entities` is expected to have a `data-name` and a list of
#'   `attributes`, each with an `attr-label`.
#' @return Called for its side effects (messages). Stops with an error if a
#'   chosen file does not have a csv extension.
verify_data_and_metadata_match <- function(local_metadata_file) {

  metadata <- yaml.load_file(local_metadata_file)
  entities <- metadata$entities

  # Return a single string, or NA when the field is absent, so the vapply
  # calls below always produce character vectors (sapply would silently
  # switch to a list for empty or incomplete input).
  first_or_na <- function(value) {
    if (is.null(value) || length(value) == 0) {
      NA_character_
    } else {
      as.character(value)[1]
    }
  }

  metadata_data_files <- unname(vapply(entities, function(entity) {
    first_or_na(entity[["data-name"]])
  }, character(1)))
  metadata_data_file_cols <- lapply(entities, function(entity) {
    unname(vapply(entity[["attributes"]], function(x) {
      first_or_na(x[["attr-label"]])
    }, character(1)))
  })
  names(metadata_data_file_cols) <- metadata_data_files

  # For each of the files (or file groups), you will need to specify an actual, local
  # file to use to compare to the data attributes.

  for (n in seq_along(metadata_data_file_cols)) {
    if (n > 1) Sys.sleep(3) # Pause before the next file.choose window pops up

    current_file <- names(metadata_data_file_cols)[n]

    message(sprintf("Choose the file to validate against `%s`\n", current_file))
    fn <- file.choose()

    # Compare the extension case-insensitively so that `.CSV` is accepted too.
    if (tolower(tools::file_ext(fn)) != "csv") stop("Only supporting CSVs right now.")

    # Only the header is needed, so do not parse any data rows.
    names_of_data <- names(read_csv(fn, col_types = cols(), n_max = 0))
    names_of_metadata <- metadata_data_file_cols[[n]]
    cols_described_in_metadata <- names_of_data %in% names_of_metadata
    metadata_present_in_cols <- names_of_metadata %in% names_of_data

    # TODO: if metadata is a pattern, it won't match appropriately. Would require some regex work
    if (all(cols_described_in_metadata) && all(metadata_present_in_cols)) {
      message(sprintf("All columns and metadata match for `%s`\n", current_file))
    } else {

      # Columns are missing from metadata
      if (!all(cols_described_in_metadata)) {
        message(sprintf("For `%s`, the following columns are missing from the metadata:\n%s\n", current_file,
                        paste0(names_of_data[!cols_described_in_metadata], collapse = "\n")))
      }

      # Metadata describes something not present in the data
      if (!all(metadata_present_in_cols)) {
        message(sprintf("For `%s`, the following metadata does not exist in the actual data:\n%s\n", current_file,
                        paste0(names_of_metadata[!metadata_present_in_cols], collapse = "\n")))
      }
    }

  }
}

##### Step 3: Use the functions! #####

# Each section below checks one data release item. `verify_files_exist()` is
# given the ScienceBase item ID and the path to that item's local metadata
# YAML. The `verify_data_and_metadata_match()` call is commented out for every
# item except 07; uncomment a line to run the column check for that item.

# 01_Spatial Check
metadata_01 <- "in_text/text_01_spatial.yml"
verify_files_exist("5e5c1c1ce4b01d50924f27e7", metadata_01)
# verify_data_and_metadata_match(metadata_01)

# 02_Observations
metadata_02 <- "in_text/text_02_observations.yml"
verify_files_exist("5e5d0b68e4b01d50924f2b32", metadata_02)
# verify_data_and_metadata_match(metadata_02)

# 03_Config
metadata_03 <- "in_text/text_03_config.yml"
verify_files_exist("5e5c1c36e4b01d50924f27ea", metadata_03) # slight naming difference
# verify_data_and_metadata_match(metadata_03)

# 04_inputs
metadata_04 <- "in_text/text_04_inputs.yml"
verify_files_exist("5e5d0b96e4b01d50924f2b34", metadata_04) # No matches
# verify_data_and_metadata_match(metadata_04)

# 05_predictions
metadata_05 <- "in_text/text_05_predictions.yml"
verify_files_exist("5e5d0bb9e4b01d50924f2b36", metadata_05) # No matches
# verify_data_and_metadata_match(metadata_05)

# 06_evaluation
metadata_06 <- "in_text/text_06_evaluation.yml"
verify_files_exist("5e774324e4b01d509270e29f", metadata_06)
# verify_data_and_metadata_match(metadata_06)

# 07_Habitat Check
metadata_07 <- "in_text/text_07_habitat.yml"
verify_files_exist("5e774355e4b01d509270e2a1", metadata_07)
verify_data_and_metadata_match(metadata_07)
jordansread commented 3 years ago

I think this request shares some similarities with the attribute skeleton pieces in #53 and asks the question of what is our normal pattern for creating a template for filling in these values while also making sure we've got all of the files and all of the attributes covered?