129 use temptable and exists

GEMINI-Medicine / Rgemini

A custom R package that provides a variety of functions to perform data analyses with GEMINI data

Other

3 stars 0 forks source link

##This script pushes the most updated rxnorm mapping into the sed_hyno_mapping file library(Rgemini) library(GEMINIpkg) library(RPostgreSQL) library(lubridate) library(data.table) library(getPass) library(dplyr) library(tictoc) library(stringr) source("~/git_hub/Rgemini/R/utils.R") # Load DB con drv <- dbDriver("PostgreSQL") data.version <- "DB_version here" db <- "db connection here" adm<-dbGetQuery(db,"Select genc_id from admdad where hospital_id in ('hosp_1','hosp_2');")%>%data.table() source("~/git_hub/Rgemini/R/episodes_of_care.R") tic() A<-evalq(episodes_of_care(db,restricted_cohort = adm), envir = .GlobalEnv) toc() tic() B<-Rgemini::episodes_of_care(db,restricted_cohort = adm) toc() #Slightly slower with this implementation but it should scale up #Confirm they are the same tables #test readmission differences source("~/git_hub/Rgemini/R/episodes_of_care.R") tic() A<-evalq(readmission(db,restricted_cohort = adm), envir = .GlobalEnv) toc() #29.139 seconds tic() B<-Rgemini::readmission(db,restricted_cohort = adm) toc() #31.707 seconds delta<-anti_join(A,B) #Joining with `by = join_by(genc_id, AT_in_occurred, AT_out_occurred, epicare, readmit7, readmit30)` nrow(delta) #No difference #test loop_mlaps adm_msh<-dbGetQuery(db,"Select genc_id from admdad where hospital_id in (''hosp_1);")%>%data.table() source("~/git_hub/Rgemini/R/mlaps.R") tic() A<-evalq(loop_mlaps(db,cohort = adm_msh), envir = .GlobalEnv) toc() #297.682 sec elapsed tic() B<-Rgemini::loop_mlaps(db,cohort = adm_msh) toc() #116.039 sec elapsed #test_diff delta<-anti_join(A,B) #Joining with `by = join_by(genc_id, mlaps)` nrow(delta) # [1] 0 #rest loop_maps with more genc_id adm<-dbGetQuery(db,"Select genc_id from admdad where hospital_id in (''hosp_1','hosp_2',''hosp_3e');")%>%data.table() source("~/git_hub/Rgemini/R/mlaps.R") tic() A<-evalq(loop_mlaps(db,cohort = adm), envir = .GlobalEnv) toc() #266.906 sec elapsed tic() B<-Rgemini::loop_mlaps(db,cohort = adm) toc() #832.176 sec elapsed #test_diff delta<-anti_join(A,B) #Joining with `by = join_by(genc_id, mlaps)` nrow(delta) # [1] 0

Hey @Trong-SMH, I implemented similar changes in n_imaging(), n_routine_bloodwork(), and n_rbc_transfusions(). As with the other functions, the speed improvement is mainly noticeable when running these functions on the whole DB (when subsetting the cohort, the old version of the code is faster - maybe we can dig into that at a later point, but for now I optimized things to be run on the whole DB). Would you be able to take a final quick look at the changes I made (mostly in this commit: https://github.com/GEMINI-Medicine/Rgemini/pull/145/commits/09c10f1ce6b675b80b0fe966bfd3f6f3276592d5)? Once everything looks good to you, I can go ahead and merge the branch, update the package version, and notify the team. Thanks!

Here is the testing script I used:

library(Rgemini)
library(GEMINIpkg)
library(RPostgreSQL)
library(lubridate)
library(data.table)
library(getPass)
library(dplyr)
library(stringr)

source("~/GitHub/Rgemini/R/utils.R")

# Load DB con
drv <- dbDriver("PostgreSQL")
db <-DBI::dbConnect(drv,
                    dbname="report_db_v3", #report_db_v3   drm_cleandb_v2_1_3
                    host="prime.smh.gemini-hpc.ca",
                    port=5432,
                    user = "lofflera",
                    password = getPass("Enter Password:"))

adm<-dbGetQuery(db, "Select genc_id, hospital_id from admdad;") %>% data.table()

##### n imaging #####
source("~/GitHub/Rgemini/R/n_imaging.R")
tictoc::tic()
A<-evalq(n_imaging(db, cohort = adm, exclude_ed = FALSE), envir = .GlobalEnv)
tictoc::toc()
# 20 sec elapsed (20 with exclude_ed = TRUE)

tictoc::tic()
B<-Rgemini::n_imaging(db, cohort = adm, exclude_ed = FALSE)
tictoc::toc()
# 102 sec elapsed (71 with exclude_ed = TRUE)

#test_diff
delta<-anti_join(A, B)
nrow(delta)
# [1] 0

##### n routine bloodwork #####
source("~/GitHub/Rgemini/R/n_routine_bloodwork.R")
tictoc::tic()
A<-evalq(n_routine_bloodwork(db, cohort = adm, exclude_ed = FALSE), envir = .GlobalEnv)
tictoc::toc()
# 80 sec elapsed (100 with exclude_ed = TRUE)

tictoc::tic()
B<-Rgemini::n_routine_bloodwork(db, cohort = adm, exclude_ed = FALSE)
tictoc::toc()
# 1120 sec elapsed (152 with exclude_ed = TRUE)

#test_diff
delta<-anti_join(A,B)
nrow(delta)
# [1] 0

##### n RBC transfusions #####
adm$hospital_num <- adm$hospital_id
source("~/GitHub/Rgemini/R/n_rbc_transfusions.R")
tictoc::tic()
A<-evalq(n_rbc_transfusions(db, cohort = adm, exclude_ed = FALSE), envir = .GlobalEnv)
tictoc::toc()
# 110 sec elapsed (130 with exclude_ed = TRUE)

tictoc::tic()
B<-Rgemini::n_rbc_transfusions(db, cohort = adm, exclude_ed = FALSE)
tictoc::toc()
# 280 sec elapsed (410 with exclude_ed = TRUE)

#test_diff
delta<-anti_join(A,B)
nrow(delta)
# [1] 0

GEMINI-Medicine / Rgemini

129 use temptable and exists #145