ctsit / redcapcustodian

Simplified, automated data management on REDCap systems
Other
12 stars 6 forks source link

Add redcap-specific DB comparison tools from rcc.ctsit #145

Open pbchase opened 5 months ago

pbchase commented 5 months ago

One of CTS-IT's internal projects built upon https://github.com/ctsit/redcapcustodian/blob/master/report/compare_two_mysql_dbs.R adding functions to compare the final rows of select REDCap tables. This allows a rapid test of sameness on these high-traffic, add-only, time-stamped tables. The original code is https://github.com/ctsit/rcc.ctsit/blob/master/report/compare_two_mysql_dbs.R. A copy is included here

library(redcapcustodian)
library(RMariaDB)
library(DBI)
library(tidyverse)
library(dotenv)
library(lubridate)

load_dot_env("prod_migrate.env")
if (Sys.getenv("REDCAP_DB_NAME") == "") dotenv::load_dot_env(here::here(".env"))

init_etl("compare_two_mysql_dbs")

rc_source <- connect_to_redcap_db()
rc_target <- connect_to_db(
  drv = RMariaDB::MariaDB(),
  prefix = "TARGETREDCAP"
)

# Check log tables first
# check last redcap_log_event entries
get_last_log_event <- function(table, conn) {
  result <- tbl(conn, table) |>
    select(1, any_of(c("ts", "stored_date", "cron_run_start"))) |>
    rename(
      id = 1,
      ts = 2
    ) |>
    arrange(desc(id)) |>
    head(n = 1) |>
    collect() |>
    mutate(ts = ymd_hms(ts)) |>
    mutate(
      table = table,
      host = dbGetInfo(conn)$host
    )

  return(result)
}

get_last_log_event("redcap_log_event", rc_source)

time_stamped_tables <- c(
  redcapcustodian::log_event_tables,
  "redcap_log_view",
  "redcap_edocs_metadata",
  "redcap_crons_history"
  )

last_log_events <-
  bind_rows(
    purrr::map_df(time_stamped_tables, get_last_log_event, rc_source) |>
      mutate(copy = "source"),
    purrr::map_df(time_stamped_tables, get_last_log_event, rc_target) |>
      mutate(copy = "target")
  )

ts_diff <- last_log_events |>
  pivot_wider(
    id_cols = c("table"),
    names_from = "copy",
    values_from = c("id", "ts")
  ) |>
  mutate(matches = ts_source == ts_target) |>
  select(-starts_with("log_event_id")) |>
  mutate(time_diff = ts_source - ts_target)

# What does the diff look like?
ts_diff

# example tables to compare
my_example_tables <- c(
  "redcap_entity_project_ownership",
  "redcap_user_information",
  "redcap_projects",
  "redcap_auth",
  "redcap_ip_banned"
)

# Prepare to compute the script's elapsed run time
start <- now()

source_tables <- dbListTables(rc_source)
# Uncomment the next line to run a quick test against a REDCap host.
#   Testing a full redcap database can several minutes
# source_tables <- my_example_tables
source_checksums <- purrr::map_df(source_tables, get_table_checksum, rc_source)

target_tables <- dbListTables(rc_target)
# Uncomment the next line to run a quick test against a REDCap host.
#   Testing a full redcap database can several minutes
# target_tables <- my_example_tables
target_checksums <- purrr::map_df(target_tables, get_table_checksum, rc_target)

# compare the source and target data
checksums <- evaluate_checksums(source_checksums, target_checksums)

# save our work
checksums |> write_csv(here::here("output", "checksums.csv"))

# report on the checksum matching
checksums |> count(matches)
checksums |> filter(!matches)

# this will always contain the views
checksums |> filter(is.na(matches))

# compute elapsed run time
finish <- now()
finish - start

dbDisconnect(rc_source)
dbDisconnect(rc_target)