darwin-eu-dev / PatientProfiles

https://darwin-eu-dev.github.io/PatientProfiles/
Apache License 2.0
7 stars 5 forks source link

Different results each time when running SummariseResult() #706

Open martaalcalde opened 1 week ago

martaalcalde commented 1 week ago

Hi! I'm using summariseResult() to count the number of records of an omopTable within specific time intervals. However, I keep obtaining different results each time I run summariseResult() on the same input.

I have encountered this issue when running this function on a MacBook, but not on Windows.

# Functions ----
# Build a cdm reference backed by an in-memory DuckDB database.
#
# Opens an in-memory DuckDB connection, opens a second DuckDB connection on
# the path returned by CDMConnector::eunomia_dir(), wraps the latter as a cdm
# reference, copies that cdm into schema "main" of the in-memory connection
# and finally disconnects the source cdm.
#
# Returns the cdm reference created by CDMConnector::copyCdmTo().
cdmEunomia <- function() {
  memoryCon <- DBI::dbConnect(duckdb::duckdb(), ":memory:")
  targetSchema <- c(schema = "main")
  eunomiaCon <- DBI::dbConnect(duckdb::duckdb(), CDMConnector::eunomia_dir())
  eunomiaCdm <- CDMConnector::cdmFromCon(
    con = eunomiaCon,
    cdmSchema = "main",
    writeSchema = "main"
  )
  copiedCdm <- CDMConnector::copyCdmTo(
    con = memoryCon,
    cdm = eunomiaCdm,
    schema = targetSchema
  )
  # The source cdm is only needed while copying.
  CDMConnector::cdmDisconnect(cdm = eunomiaCdm)
  copiedCdm
}
# Return 1 January of the year of the earliest value found in column `date`.
#
# omopTable: table to summarise; the minimum is computed before collecting.
# date:      name of the date column, as a string.
getOmopTableStartDate <- function(omopTable, date) {
  earliest <- omopTable |>
    dplyr::summarise("start_date" = min(.data[[date]], na.rm = TRUE)) |>
    dplyr::collect()
  firstYear <- clock::get_year(earliest$start_date)
  as.Date(paste0(firstYear, "-01-01"))
}
# Return 31 December of the year of the latest value found in column `date`.
#
# omopTable: table to summarise; the maximum is computed before collecting.
# date:      name of the date column, as a string.
getOmopTableEndDate <- function(omopTable, date) {
  latest <- omopTable |>
    dplyr::summarise("end_date" = max(.data[[date]], na.rm = TRUE)) |>
    dplyr::collect()
  lastYear <- clock::get_year(latest$end_date)
  as.Date(paste0(lastYear, "-12-31"))
}
# Build a local lookup tibble that maps every calendar month covered by
# `omopTable` to the time interval it belongs to.
#
# omopTable:       table passed on to getOmopTableStartDate() /
#                  getOmopTableEndDate() to obtain the overall date range.
# start_date_name: name (string) of the column used for the range start.
# end_date_name:   name (string) of the column used for the range end.
# unit:            "year" or "month" (a trailing "s" is accepted). Any other
#                  value is rejected, because interval end dates can only be
#                  computed with year or month arithmetic.
# unitInterval:    number of `unit`s per interval.
#
# Returns a tibble with one row per distinct month and the columns
#   interval_group      - "<interval_start_date> to <interval_end_date>"
#   my                  - "<month>-<year>" key of the month (no zero padding)
#   interval_start_date - first month start falling in the interval
#   interval_end_date   - interval start + unitInterval units - 1 day
getIntervalTibble <- function(omopTable, start_date_name, end_date_name, unit, unitInterval) {
  # Accept "years"/"months" as well: seq.Date() does, and previously such
  # values silently fell through to the month branch for the end date.
  unit <- sub("s$", "", as.character(unit))
  if (length(unit) != 1L || !unit %in% c("year", "month")) {
    stop("`unit` must be \"year\" or \"month\".", call. = FALSE)
  }

  startDate <- getOmopTableStartDate(omopTable, start_date_name)
  endDate   <- getOmopTableEndDate(omopTable, end_date_name)

  # Interval break points are computed once instead of once per month row.
  intervalStarts <- seq.Date(
    from = startDate,
    to = endDate,
    by = paste(unitInterval, unit)
  )
  # `unit` is a scalar, so pick the date arithmetic up front.
  addInterval <- if (unit == "year") clock::add_years else clock::add_months

  tibble::tibble(
    "group" = seq.Date(as.Date(startDate), as.Date(endDate), "month")
  ) |>
    dplyr::mutate(
      # Index of the last break point that is <= the month start.
      "interval" = findInterval(
        as.numeric(.data$group),
        as.numeric(.env$intervalStarts)
      )
    ) |>
    dplyr::group_by(.data$interval) |>
    dplyr::mutate(
      "interval_start_date" = min(.data$group),
      "interval_end_date" = addInterval(min(.data$group), .env$unitInterval) - 1
    ) |>
    dplyr::mutate(
      "interval_start_date" = as.Date(.data$interval_start_date),
      "interval_end_date" = as.Date(.data$interval_end_date),
      "interval_group" = paste(
        .data$interval_start_date, "to", .data$interval_end_date
      )
    ) |>
    dplyr::ungroup() |>
    dplyr::mutate(
      "my" = paste0(
        clock::get_month(.data$group), "-", clock::get_year(.data$group)
      )
    ) |>
    dplyr::select(
      "interval_group", "my", "interval_start_date", "interval_end_date"
    ) |>
    dplyr::distinct()
}
# Attach the interval label to every record of `omopTable`.
#
# cdm:       cdm reference; its `interval` table supplies the month-to-interval
#            lookup (columns "my", "interval_group", "interval_start_date" and
#            "interval_end_date" are referenced below).
# omopTable: table holding the records; must contain `date` and "person_id".
# date:      name (string) of the date column used to place each record.
# strata:    not used anywhere in the body.
#
# Returns the joined table without "my", the interval start/end dates,
# "incidence_date" and "person_id"; nothing is collected here.
splitIncidenceBetweenIntervals <- function(cdm, omopTable, date, strata){
  cdm$interval |>
    dplyr::inner_join(
      # Build the same "<month>-<year>" key on the record side.
      omopTable |>
        dplyr::rename("incidence_date" = dplyr::all_of(.env$date)) |>
        dplyr::mutate("my" = paste0(clock::get_month(.data$incidence_date),"-",clock::get_year(.data$incidence_date))),
      by = "my"
    ) |>
    dplyr::select(-c("my")) |>
    # person_id is moved to the front here and dropped again by the next step.
    dplyr::relocate("person_id") |>
    dplyr::select(-c("interval_start_date", "interval_end_date", "incidence_date", "person_id"))
}

# Bespoke code -----
# Reprex driver: `result` is passed to summariseResult() without collecting it.
cdm <- cdmEunomia()
#> Note: method with signature 'DBIConnection#Id' chosen for function 'dbExistsTable',
#>  target signature 'duckdb_connection#Id'.
#>  "duckdb_connection#ANY" would also be valid
# Keep only person and condition start date, then add age at that date with
# three age bands.
omopTable <- cdm[["condition_occurrence"]] |>
  dplyr::ungroup() |>
  dplyr::select("person_id", "condition_start_date") |>
  PatientProfiles::addAgeQuery(indexDate = "condition_start_date",
                               ageGroup = list("<=20" = c(0,20), "21 to 40" = c(21,40), "41 to 60" = c(41,60)))

# One-year intervals spanning the observed condition start dates, stored as a
# table of the cdm so it can be joined against `omopTable`.
interval <- getIntervalTibble(omopTable, "condition_start_date", "condition_start_date", "year", 1)
cdm <- cdm |>
  omopgenerics::insertTable(name = "interval", table = interval)

# Not collected: `result` is whatever the join on cdm$interval returns.
result <- splitIncidenceBetweenIntervals(cdm, omopTable, "condition_start_date", "age_group")

# Run the identical summary five times on the unchanged `result` and print the
# estimate for one interval / age band. The recorded output shows "50" four
# times and character(0) once.
for(i in 1:5){
  # NOTE(review): `t` shadows base::t() for the rest of the script.
  t <- PatientProfiles::summariseResult(result,
                                        includeOverallStrata = TRUE,
                                        strata = list("age_group"),
                                        estimates = "count",
                                        counts = FALSE) |>
    suppressMessages()
  print(t |> dplyr::filter(variable_level == "1920-01-01 to 1920-12-31", strata_level == "<=20") |> dplyr::pull("estimate_value"))
}
#> [1] "50"
#> character(0)
#> [1] "50"
#> [1] "50"
#> [1] "50"

Created on 2024-10-09 with reprex v2.1.1

martaalcalde commented 1 week ago

I think it may only happen when "result" is not collected locally: after adding dplyr::collect() in line 84, the issue no longer occurs:

# Functions ----
# Build a cdm reference backed by an in-memory DuckDB database.
#
# Opens an in-memory DuckDB connection, opens a second DuckDB connection on
# the path returned by CDMConnector::eunomia_dir(), wraps the latter as a cdm
# reference, copies that cdm into schema "main" of the in-memory connection
# and finally disconnects the source cdm.
#
# Returns the cdm reference created by CDMConnector::copyCdmTo().
cdmEunomia <- function() {
  memoryCon <- DBI::dbConnect(duckdb::duckdb(), ":memory:")
  targetSchema <- c(schema = "main")
  eunomiaCon <- DBI::dbConnect(duckdb::duckdb(), CDMConnector::eunomia_dir())
  eunomiaCdm <- CDMConnector::cdmFromCon(
    con = eunomiaCon,
    cdmSchema = "main",
    writeSchema = "main"
  )
  copiedCdm <- CDMConnector::copyCdmTo(
    con = memoryCon,
    cdm = eunomiaCdm,
    schema = targetSchema
  )
  # The source cdm is only needed while copying.
  CDMConnector::cdmDisconnect(cdm = eunomiaCdm)
  copiedCdm
}
# Return 1 January of the year of the earliest value found in column `date`.
#
# omopTable: table to summarise; the minimum is computed before collecting.
# date:      name of the date column, as a string.
getOmopTableStartDate <- function(omopTable, date) {
  earliest <- omopTable |>
    dplyr::summarise("start_date" = min(.data[[date]], na.rm = TRUE)) |>
    dplyr::collect()
  firstYear <- clock::get_year(earliest$start_date)
  as.Date(paste0(firstYear, "-01-01"))
}
# Return 31 December of the year of the latest value found in column `date`.
#
# omopTable: table to summarise; the maximum is computed before collecting.
# date:      name of the date column, as a string.
getOmopTableEndDate <- function(omopTable, date) {
  latest <- omopTable |>
    dplyr::summarise("end_date" = max(.data[[date]], na.rm = TRUE)) |>
    dplyr::collect()
  lastYear <- clock::get_year(latest$end_date)
  as.Date(paste0(lastYear, "-12-31"))
}
# Build a local lookup tibble that maps every calendar month covered by
# `omopTable` to the time interval it belongs to.
#
# omopTable:       table passed on to getOmopTableStartDate() /
#                  getOmopTableEndDate() to obtain the overall date range.
# start_date_name: name (string) of the column used for the range start.
# end_date_name:   name (string) of the column used for the range end.
# unit:            "year" or "month" (a trailing "s" is accepted). Any other
#                  value is rejected, because interval end dates can only be
#                  computed with year or month arithmetic.
# unitInterval:    number of `unit`s per interval.
#
# Returns a tibble with one row per distinct month and the columns
#   interval_group      - "<interval_start_date> to <interval_end_date>"
#   my                  - "<month>-<year>" key of the month (no zero padding)
#   interval_start_date - first month start falling in the interval
#   interval_end_date   - interval start + unitInterval units - 1 day
getIntervalTibble <- function(omopTable, start_date_name, end_date_name, unit, unitInterval) {
  # Accept "years"/"months" as well: seq.Date() does, and previously such
  # values silently fell through to the month branch for the end date.
  unit <- sub("s$", "", as.character(unit))
  if (length(unit) != 1L || !unit %in% c("year", "month")) {
    stop("`unit` must be \"year\" or \"month\".", call. = FALSE)
  }

  startDate <- getOmopTableStartDate(omopTable, start_date_name)
  endDate   <- getOmopTableEndDate(omopTable, end_date_name)

  # Interval break points are computed once instead of once per month row.
  intervalStarts <- seq.Date(
    from = startDate,
    to = endDate,
    by = paste(unitInterval, unit)
  )
  # `unit` is a scalar, so pick the date arithmetic up front.
  addInterval <- if (unit == "year") clock::add_years else clock::add_months

  tibble::tibble(
    "group" = seq.Date(as.Date(startDate), as.Date(endDate), "month")
  ) |>
    dplyr::mutate(
      # Index of the last break point that is <= the month start.
      "interval" = findInterval(
        as.numeric(.data$group),
        as.numeric(.env$intervalStarts)
      )
    ) |>
    dplyr::group_by(.data$interval) |>
    dplyr::mutate(
      "interval_start_date" = min(.data$group),
      "interval_end_date" = addInterval(min(.data$group), .env$unitInterval) - 1
    ) |>
    dplyr::mutate(
      "interval_start_date" = as.Date(.data$interval_start_date),
      "interval_end_date" = as.Date(.data$interval_end_date),
      "interval_group" = paste(
        .data$interval_start_date, "to", .data$interval_end_date
      )
    ) |>
    dplyr::ungroup() |>
    dplyr::mutate(
      "my" = paste0(
        clock::get_month(.data$group), "-", clock::get_year(.data$group)
      )
    ) |>
    dplyr::select(
      "interval_group", "my", "interval_start_date", "interval_end_date"
    ) |>
    dplyr::distinct()
}
# Attach the interval label to every record of `omopTable`.
#
# cdm:       cdm reference; its `interval` table supplies the month-to-interval
#            lookup (columns "my", "interval_group", "interval_start_date" and
#            "interval_end_date" are referenced below).
# omopTable: table holding the records; must contain `date` and "person_id".
# date:      name (string) of the date column used to place each record.
# strata:    not used anywhere in the body.
#
# Returns the joined table without "my", the interval start/end dates,
# "incidence_date" and "person_id"; nothing is collected here.
splitIncidenceBetweenIntervals <- function(cdm, omopTable, date, strata){
  cdm$interval |>
    dplyr::inner_join(
      # Build the same "<month>-<year>" key on the record side.
      omopTable |>
        dplyr::rename("incidence_date" = dplyr::all_of(.env$date)) |>
        dplyr::mutate("my" = paste0(clock::get_month(.data$incidence_date),"-",clock::get_year(.data$incidence_date))),
      by = "my"
    ) |>
    dplyr::select(-c("my")) |>
    # person_id is moved to the front here and dropped again by the next step.
    dplyr::relocate("person_id") |>
    dplyr::select(-c("interval_start_date", "interval_end_date", "incidence_date", "person_id"))
}

# Bespoke code -----
# Reprex driver: `result` is collected before it is passed to
# summariseResult().
cdm <- cdmEunomia()
#> Note: method with signature 'DBIConnection#Id' chosen for function 'dbExistsTable',
#>  target signature 'duckdb_connection#Id'.
#>  "duckdb_connection#ANY" would also be valid
# Keep only person and condition start date, then add age at that date with
# three age bands.
omopTable <- cdm[["condition_occurrence"]] |>
  dplyr::ungroup() |>
  dplyr::select("person_id", "condition_start_date") |>
  PatientProfiles::addAgeQuery(indexDate = "condition_start_date",
                               ageGroup = list("<=20" = c(0,20), "21 to 40" = c(21,40), "41 to 60" = c(41,60)))

# One-year intervals spanning the observed condition start dates, stored as a
# table of the cdm so it can be joined against `omopTable`.
interval <- getIntervalTibble(omopTable, "condition_start_date", "condition_start_date", "year", 1)
cdm <- cdm |>
  omopgenerics::insertTable(name = "interval", table = interval)

# The joined table is collected once here, so every iteration below works on
# the same local data.
result <- splitIncidenceBetweenIntervals(cdm, omopTable, "condition_start_date", "age_group") |> dplyr::collect()

# Run the identical summary fifty times and print the estimate for one
# interval / age band. The recorded output shows "50" for every iteration.
for(i in 1:50){
  # NOTE(review): `t` shadows base::t() for the rest of the script.
  t <- PatientProfiles::summariseResult(result,
                                        includeOverallStrata = TRUE,
                                        strata = list("age_group"),
                                        estimates = "count",
                                        counts = FALSE) |>
    suppressMessages()
  print(t |> dplyr::filter(variable_level == "1920-01-01 to 1920-12-31", strata_level == "<=20") |> dplyr::pull("estimate_value"))
}
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"
#> [1] "50"

Created on 2024-10-10 with reprex v2.1.1