Assertion on 'name' failed: Contains missing values (element 1). #395

Closed mvankessel-EMC closed 1 month ago

mvankessel-EMC commented 2 months ago

I have a cohort table that is derived from a generated cohort table and then adjusted with various mutate(), inner_join(), and group_by() calls (it is ungrouped at the end).


#> # Source:   table<og_031_1720533176> [5 x 4]
#> # Database: DuckDB v0.10.2 [mvankessel@Windows 10 x64:R 4.4.0/D:\R-Study-Packages\some_study\dev\test.duckdb]
#>   cohort_definition_id subject_id cohort_start_date cohort_end_date
#>                  <int>      <dbl> <date>            <date>         
#> 1                   14         13 2020-06-15        2022-12-30     
#> 2                    7          9 2019-06-11        2024-04-07     
#> 3                   14          9 2019-06-11        2024-04-07     
#> 4                   14         17 2021-06-15        2022-12-30     
#> 5                   14         11 2020-08-18        2023-02-03   

If I try to make an actual cohort table out of it - required by CohortCharacteristics::summariseLargeScaleCharacteristics() - I get the following error:

cdm$stage3_treatments_adjusted <- omopgenerics::newCohortTable(
  table = cdm$stage3_treatments_adjusted

#> Error in insertTable.db_cdm(cdm = tableSource(table), name = name, table = cohortSetRef,  : 
#>  Assertion on 'name' failed: Contains missing values (element 1).

If I collect() the table, insert it into the CDM, and make a cohort table out of it, it works. But that seems a rather tacky work-around, as I have to pull the entire cohort table into memory.

my_cohort_table <- cdm$stage3_treatments_adjusted %>% collect()

cdm <- CDMConnector::insertTable(
  cdm = cdm,
  name = "my_cohort_table",
  table = my_cohort_table

cdm$my_cohort_table <- omopgenerics::newCohortTable(table = cdm$my_cohort_table)

#> # A tibble: 2 × 2
#>  cohort_definition_id cohort_name
#>                  <int> <chr>      
#> 1                    7 cohort_7   
#> 2                   14 cohort_14  

The classes of the table that I want to make a cohort table out of:

#> [1] "cdm_table"             "GeneratedCohortSet"    "tbl_duckdb_connection"
#> [4] "tbl_dbi"               "tbl_sql"               "tbl_lazy"             
#> [7] "tbl"  

Am I just missing something?

catalamarti commented 2 months ago

That's weird. Can you reproduce it in your environment? Maybe we can set up a call to investigate where the error is, @mvankessel-EMC.

mvankessel-EMC commented 2 months ago

This is a full reprex. I will send you the files that I'm using in this example.

dbPath <- "./test.duckdb"
cohortPath <- "./cohorts-treatment_patterns/"
tnmDir <- "./TNM_concepts/"

con <- DBI::dbConnect(
  drv = duckdb::duckdb(),
  server = dbPath,
  dbdir = dbPath

cdm <- CDMConnector::cdmFromCon(
  con = con,
  cdmSchema = "main",
  writeSchema = "main"
cohortSet <- readCohortSet(path = cohortPath)
cdm <- generateCohortSet(
  cdm = cdm,
  cohortSet = cohortSet,
  name = "dummy_cohort_table"
getEventCohorts <- function(cohortSet) {
  cohortSet %>%
    dplyr::filter(!startsWith(.data$cohort_name, "stage_")) %>%
    dplyr::select("cohort_definition_id", "cohort_name") %>%
    dplyr::rename(cohortId = "cohort_definition_id", cohortName = "cohort_name") %>%
    dplyr::mutate(type = "event")

getTargetCohorts <- function(events, cohortSet) {
  cohortSet %>%
    dplyr::filter(!.data$cohort_definition_id %in% events$cohortId) %>%
    dplyr::select("cohort_definition_id", "cohort_name") %>%
    dplyr::rename(cohortId = "cohort_definition_id", cohortName = "cohort_name") %>%
    dplyr::mutate(type = "target")

eventCohorts <- cohortSet %>%

targetCohorts <- cohortSet %>%
  getTargetCohorts(events = eventCohorts)

cohortSet <- dplyr::bind_rows(
) %>%

names(cohortSet) <- tolower(names(cohortSet))

cdm <- CDMConnector::insertTable(
  cdm = cdm,
  name = "cohort_set",
  table = cohortSet

tnmConceptTable <- lapply(list.files(tnmDir, full.names = TRUE), function(file) {
  tbl <- read.csv(file)
  tbl <- tbl[, c("Id", "Code")]
  tbl$tnm_type <- strsplit(basename(file), "\\.")[[1]][1]
}) |>
  dplyr::bind_rows() |>
  dplyr::rename(concept_id = "Id", code = "Code")

cdm <- CDMConnector::insertTable(
  cdm = cdm,
  name = "tnm_concept_table",
  table = tnmConceptTable

cdm$dummy_cohort_table <- cdm$dummy_cohort_table %>%
  dplyr::inner_join(cdm$cohort_set, dplyr::join_by(cohort_definition_id == cohortid)) %>%

cdm$nsclc_cohort_table <- cdm$dummy_cohort_table %>%
  dplyr::filter(.data$type == "target") %>%

cdm$treatment_cohort_table <- cdm$dummy_cohort_table %>%
  dplyr::filter(.data$type == "event") %>%

updateTreatmentDates <- function(
    TNMs = c("TNM-M0", "TNM-M1", "TNM-N2", "TNM-N3", "TNM-T3_t4")) {
  cdm[[treatmentCohortTableName]] %>%
    dplyr::filter(.data$cohort_definition_id == cohortId) %>%
    dplyr::inner_join(cdm$treatment_cohort_table, dplyr::join_by(subject_id == subject_id)) %>%
    dplyr::select("cohort_definition_id.y", "subject_id", "cohort_start_date.y", "cohort_end_date.y") %>%
      cohort_definition_id = "cohort_definition_id.y",
      cohort_start_date = "cohort_start_date.y",
      cohort_end_date = "cohort_end_date.y"
    ) %>%
    dplyr::inner_join(cdm$measurement, dplyr::join_by(subject_id == person_id)) %>%
    dplyr::inner_join(cdm$tnm_concept_table, dplyr::join_by(measurement_concept_id == concept_id)) %>%
    dplyr::filter(.data$tnm_type %in% TNMs) %>%
    dplyr::mutate(date_diff = !!CDMConnector::datediff(end = "measurement_date", "cohort_start_date")) %>%
    dplyr::group_by(.data$cohort_definition_id, .data$subject_id) %>%
      .data$date_diff == min(.data$date_diff, na.rm = TRUE),
      row_number() == 1
    ) %>%
    dplyr::mutate(new_cohort_start_date = dplyr::case_when(
      .data$date_diff <= 0 ~ as.Date(.data$measurement_date)
    )) %>%
    dplyr::select("cohort_definition_id", "subject_id", "new_cohort_start_date", "cohort_end_date") %>%
    dplyr::rename(cohort_start_date = "new_cohort_start_date") %>%

cdm$stage3_treatments_adjusted <- cdm %>%
    cohortId = 19,
    treatmentCohortTableName = "nsclc_cohort_table",
    TNMs = c("TNM-M0", "TNM-N2", "TNM-N3", "TNM-T3_t4")
  ) %>%

    cohort = cdm$stage3_treatments_adjusted,
    eventInWindow = c("drug_exposure")
}, error = function(e) {
#> <simpleError in UseMethod("settings"): no applicable method for 'settings' applied to an object of class "c('cdm_table', 'GeneratedCohortSet', 'tbl_duckdb_connection', 'tbl_dbi', 'tbl_sql', 'tbl_lazy', 'tbl')">

  cdm$stage3_treatments_adjusted <- omopgenerics::newCohortTable(
    table = cdm$stage3_treatments_adjusted
}, error = function(e) {
#> <simpleError in insertTable.db_cdm(cdm = tableSource(table), name = name, table = cohortSetRef,     overwrite = TRUE): Assertion on 'name' failed: Contains missing values (element 1).>

#> [1] "cdm_table"             "GeneratedCohortSet"    "tbl_duckdb_connection"
#> [4] "tbl_dbi"               "tbl_sql"               "tbl_lazy"             
#> [7] "tbl"

Created on 2024-07-12 with reprex v2.1.0

Session info

``` r
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.4.0 (2024-04-24 ucrt)
#>  os       Windows 11 x64 (build 22631)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  Dutch_Netherlands.utf8
#>  ctype    Dutch_Netherlands.utf8
#>  tz       Europe/Amsterdam
#>  date     2024-07-12
#>  pandoc   3.1.11 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
```

ablack3 commented 2 months ago

I think this bit of code in omopgenerics needs a second look:

populateCohortSet <- function(table, cohortSetRef) {
  if (is.null(cohortSetRef)) {
    cohortSetRef <- defaultCohortSet(table)
  } else {
    cohortSetRef <- cohortSetRef |> dplyr::collect()
  cohortName <- tableName(table)
  assertClass(cohortSetRef, "data.frame", null = TRUE)
  cohortSetRef <- dplyr::as_tibble(cohortSetRef)
  name <- ifelse(is.na(cohortName), cohortName, paste0(cohortName, "_set"))
  cohortSetRef <- insertTable(
    cdm = tableSource(table), name = name, table = cohortSetRef,
    overwrite = TRUE

If cohortName is NA, the ifelse() returns it unchanged, so `name` is also NA and is still passed to insertTable.

What is the cohortName (the tbl_name attribute) when someone calls compute() with temporary = TRUE on a cdm table? It is NA_character_.

Maybe we give an error if the table name is NA (indicating a temp table).

con <- DBI::dbConnect(duckdb::duckdb(), eunomia_dir())
cdm <- cdm_from_con(con, "main", "main")

cs <- read_cohort_set(system.file("cohorts2", package = "CDMConnector"))

cdm <- generate_cohort_set(cdm, cs)

tbl <- cdm$cohort %>% 
  dplyr::filter(subject_id %in% c(951L, 2164L)) %>% 
  compute(temporary = T) 

attr(tbl, "tbl_name")
#> [1] NA

class(attr(tbl, "tbl_name"))
#> [1] "character"

is.na(attr(tbl, "tbl_name"))
#> [1] TRUE

tbl %>% 
#> Error in insertTable.db_cdm(cdm = tableSource(table), name = name, table = cohortSetRef, : Assertion on 'name' failed: Contains missing values (element 1).


Created on 2024-07-16 with reprex v2.1.0

The error occurs because the tbl_name attribute is NA.

mvankessel-EMC commented 2 months ago

A workaround for this would be:

# Strip "GeneratedCohortSet" from class attribute
class(cdm$my_cohort_table) <- c("cdm_table", "tbl_duckdb_connection", "tbl_dbi", "tbl_sql", "tbl_lazy", "tbl")

# Make new cohort table
cdm$my_cohort_table<- omopgenerics::newCohortTable(table = cdm$my_cohort_table)
catalamarti commented 2 months ago
