Open pbchase opened 5 months ago
One of CTS-IT's internal projects built upon https://github.com/ctsit/redcapcustodian/blob/master/report/compare_two_mysql_dbs.R adding functions to compare the final rows of select REDCap tables. This allows a rapid test of sameness on these high-traffic, add-only, time-stamped tables. The original code is https://github.com/ctsit/rcc.ctsit/blob/master/report/compare_two_mysql_dbs.R. A copy is included here
library(redcapcustodian) library(RMariaDB) library(DBI) library(tidyverse) library(dotenv) library(lubridate) load_dot_env("prod_migrate.env") if (Sys.getenv("REDCAP_DB_NAME") == "") dotenv::load_dot_env(here::here(".env")) init_etl("compare_two_mysql_dbs") rc_source <- connect_to_redcap_db() rc_target <- connect_to_db( drv = RMariaDB::MariaDB(), prefix = "TARGETREDCAP" ) # Check log tables first # check last redcap_log_event entries get_last_log_event <- function(table, conn) { result <- tbl(conn, table) |> select(1, any_of(c("ts", "stored_date", "cron_run_start"))) |> rename( id = 1, ts = 2 ) |> arrange(desc(id)) |> head(n = 1) |> collect() |> mutate(ts = ymd_hms(ts)) |> mutate( table = table, host = dbGetInfo(conn)$host ) return(result) } get_last_log_event("redcap_log_event", rc_source) time_stamped_tables <- c( redcapcustodian::log_event_tables, "redcap_log_view", "redcap_edocs_metadata", "redcap_crons_history" ) last_log_events <- bind_rows( purrr::map_df(time_stamped_tables, get_last_log_event, rc_source) |> mutate(copy = "source"), purrr::map_df(time_stamped_tables, get_last_log_event, rc_target) |> mutate(copy = "target") ) ts_diff <- last_log_events |> pivot_wider( id_cols = c("table"), names_from = "copy", values_from = c("id", "ts") ) |> mutate(matches = ts_source == ts_target) |> select(-starts_with("log_event_id")) |> mutate(time_diff = ts_source - ts_target) # What does the diff look like? ts_diff # example tables to compare my_example_tables <- c( "redcap_entity_project_ownership", "redcap_user_information", "redcap_projects", "redcap_auth", "redcap_ip_banned" ) # Prepare to compute the script's elapsed run time start <- now() source_tables <- dbListTables(rc_source) # Uncomment the next line to run a quick test against a REDCap host. # Testing a full redcap database can several minutes # source_tables <- my_example_tables source_checksums <- purrr::map_df(source_tables, get_table_checksum, rc_source) target_tables <- dbListTables(rc_target) # Uncomment the next line to run a quick test against a REDCap host. # Testing a full redcap database can several minutes # target_tables <- my_example_tables target_checksums <- purrr::map_df(target_tables, get_table_checksum, rc_target) # compare the source and target data checksums <- evaluate_checksums(source_checksums, target_checksums) # save our work checksums |> write_csv(here::here("output", "checksums.csv")) # report on the checksum matching checksums |> count(matches) checksums |> filter(!matches) # this will always contain the views checksums |> filter(is.na(matches)) # compute elapsed run time finish <- now() finish - start dbDisconnect(rc_source) dbDisconnect(rc_target)
One of CTS-IT's internal projects built upon https://github.com/ctsit/redcapcustodian/blob/master/report/compare_two_mysql_dbs.R adding functions to compare the final rows of select REDCap tables. This allows a rapid test of sameness on these high-traffic, add-only, time-stamped tables. The original code is https://github.com/ctsit/rcc.ctsit/blob/master/report/compare_two_mysql_dbs.R. A copy is included here