Closed mvankessel-EMC closed 1 month ago
that's weird, can you reproduce it in your environment? maybe we can setup a call to investigate where is the error @mvankessel-EMC
This is a full reprex. I will send you the files that I'm using in this example.
library(CDMConnector)
#> Warning: package 'CDMConnector' was built under R version 4.4.1
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(omopgenerics)
#> Warning: package 'omopgenerics' was built under R version 4.4.1
#>
#> Attaching package: 'omopgenerics'
#> The following objects are masked from 'package:CDMConnector':
#>
#> cdmName, recordCohortAttrition, uniqueTableName
#> The following object is masked from 'package:stats':
#>
#> filter
library(CohortCharacteristics)
#> Warning: package 'CohortCharacteristics' was built under R version 4.4.1
library(PatientProfiles)
#> Warning: package 'PatientProfiles' was built under R version 4.4.1
dbPath <- "./test.duckdb"
cohortPath <- "./cohorts-treatment_patterns/"
tnmDir <- "./TNM_concepts/"
con <- DBI::dbConnect(
drv = duckdb::duckdb(),
server = dbPath,
dbdir = dbPath
)
cdm <- CDMConnector::cdmFromCon(
con = con,
cdmSchema = "main",
writeSchema = "main"
)
#> Note: method with signature 'DBIConnection#Id' chosen for function 'dbExistsTable',
#> target signature 'duckdb_connection#Id'.
#> "duckdb_connection#ANY" would also be valid
#> ! cdm name not specified and could not be inferred from the cdm source table
cohortSet <- readCohortSet(path = cohortPath)
cdm <- generateCohortSet(
cdm = cdm,
cohortSet = cohortSet,
name = "dummy_cohort_table"
)
#> ℹ Generating 15 cohorts
#> ℹ Generating cohort (1/15) - atezolizumab✔ Generating cohort (1/15) - atezolizumab [1.4s]
#> ℹ Generating cohort (2/15) - carboplatin✔ Generating cohort (2/15) - carboplatin [1.3s]
#> ℹ Generating cohort (3/15) - cemiplimab✔ Generating cohort (3/15) - cemiplimab [235ms]
#> ℹ Generating cohort (4/15) - cisplatin✔ Generating cohort (4/15) - cisplatin [2s]
#> ℹ Generating cohort (5/15) - docetaxel✔ Generating cohort (5/15) - docetaxel [1.4s]
#> ℹ Generating cohort (6/15) - durvalumab✔ Generating cohort (6/15) - durvalumab [1s]
#> ℹ Generating cohort (7/15) - gemcitabine✔ Generating cohort (7/15) - gemcitabine [1s]
#> ℹ Generating cohort (8/15) - ipilimumab✔ Generating cohort (8/15) - ipilimumab [1.5s]
#> ℹ Generating cohort (9/15) - nivolumab✔ Generating cohort (9/15) - nivolumab [1.6s]
#> ℹ Generating cohort (10/15) - paclitaxel✔ Generating cohort (10/15) - paclitaxel [1.6s]
#> ℹ Generating cohort (11/15) - pembrolizumab✔ Generating cohort (11/15) - pembrolizumab [1.5s]
#> ℹ Generating cohort (12/15) - pemetrexed✔ Generating cohort (12/15) - pemetrexed [1s]
#> ℹ Generating cohort (13/15) - stage_3b_4_2m✔ Generating cohort (13/15) - stage_3b_4_2m [1.2s]
#> ℹ Generating cohort (14/15) - stage_3b_4_2m_prior_lung_cancer_allowed✔ Generating cohort (14/15) - stage_3b_4_2m_prior_lung_cancer_allowed [3.2s]
#> ℹ Generating cohort (15/15) - vinorelbine✔ Generating cohort (15/15) - vinorelbine [700ms]
getEventCohorts <- function(cohortSet) {
cohortSet %>%
dplyr::filter(!startsWith(.data$cohort_name, "stage_")) %>%
dplyr::select("cohort_definition_id", "cohort_name") %>%
dplyr::rename(cohortId = "cohort_definition_id", cohortName = "cohort_name") %>%
dplyr::mutate(type = "event")
}
getTargetCohorts <- function(events, cohortSet) {
cohortSet %>%
dplyr::filter(!.data$cohort_definition_id %in% events$cohortId) %>%
dplyr::select("cohort_definition_id", "cohort_name") %>%
dplyr::rename(cohortId = "cohort_definition_id", cohortName = "cohort_name") %>%
dplyr::mutate(type = "target")
}
eventCohorts <- cohortSet %>%
getEventCohorts()
targetCohorts <- cohortSet %>%
getTargetCohorts(events = eventCohorts)
cohortSet <- dplyr::bind_rows(
eventCohorts,
targetCohorts
) %>%
as.data.frame()
names(cohortSet) <- tolower(names(cohortSet))
cdm <- CDMConnector::insertTable(
cdm = cdm,
name = "cohort_set",
table = cohortSet
)
tnmConceptTable <- lapply(list.files(tnmDir, full.names = TRUE), function(file) {
tbl <- read.csv(file)
tbl <- tbl[, c("Id", "Code")]
tbl$tnm_type <- strsplit(basename(file), "\\.")[[1]][1]
return(tbl)
}) |>
dplyr::bind_rows() |>
dplyr::rename(concept_id = "Id", code = "Code")
cdm <- CDMConnector::insertTable(
cdm = cdm,
name = "tnm_concept_table",
table = tnmConceptTable
)
cdm$dummy_cohort_table <- cdm$dummy_cohort_table %>%
dplyr::inner_join(cdm$cohort_set, dplyr::join_by(cohort_definition_id == cohortid)) %>%
dplyr::compute()
cdm$nsclc_cohort_table <- cdm$dummy_cohort_table %>%
dplyr::filter(.data$type == "target") %>%
dplyr::compute()
cdm$treatment_cohort_table <- cdm$dummy_cohort_table %>%
dplyr::filter(.data$type == "event") %>%
dplyr::compute()
updateTreatmentDates <- function(
cdm,
cohortId,
treatmentCohortTableName,
TNMs = c("TNM-M0", "TNM-M1", "TNM-N2", "TNM-N3", "TNM-T3_t4")) {
cdm[[treatmentCohortTableName]] %>%
dplyr::filter(.data$cohort_definition_id == cohortId) %>%
dplyr::inner_join(cdm$treatment_cohort_table, dplyr::join_by(subject_id == subject_id)) %>%
dplyr::select("cohort_definition_id.y", "subject_id", "cohort_start_date.y", "cohort_end_date.y") %>%
dplyr::rename(
cohort_definition_id = "cohort_definition_id.y",
cohort_start_date = "cohort_start_date.y",
cohort_end_date = "cohort_end_date.y"
) %>%
dplyr::inner_join(cdm$measurement, dplyr::join_by(subject_id == person_id)) %>%
dplyr::inner_join(cdm$tnm_concept_table, dplyr::join_by(measurement_concept_id == concept_id)) %>%
dplyr::filter(.data$tnm_type %in% TNMs) %>%
dplyr::mutate(date_diff = !!CDMConnector::datediff(end = "measurement_date", "cohort_start_date")) %>%
dplyr::group_by(.data$cohort_definition_id, .data$subject_id) %>%
dplyr::filter(
.data$date_diff == min(.data$date_diff, na.rm = TRUE),
row_number() == 1
) %>%
dplyr::mutate(new_cohort_start_date = dplyr::case_when(
.data$date_diff <= 0 ~ as.Date(.data$measurement_date)
)) %>%
dplyr::select("cohort_definition_id", "subject_id", "new_cohort_start_date", "cohort_end_date") %>%
dplyr::rename(cohort_start_date = "new_cohort_start_date") %>%
dplyr::ungroup()
}
cdm$stage3_treatments_adjusted <- cdm %>%
updateTreatmentDates(
cohortId = 19,
treatmentCohortTableName = "nsclc_cohort_table",
TNMs = c("TNM-M0", "TNM-N2", "TNM-N3", "TNM-T3_t4")
) %>%
dplyr::compute()
tryCatch({
CohortCharacteristics::summariseLargeScaleCharacteristics(
cohort = cdm$stage3_treatments_adjusted,
eventInWindow = c("drug_exposure")
)
}, error = function(e) {
print(e)
})
#> ℹ Summarising large scale characteristics
#> - getting characteristics from table drug_exposure (1 of 1)
#> <simpleError in UseMethod("settings"): no applicable method for 'settings' applied to an object of class "c('cdm_table', 'GeneratedCohortSet', 'tbl_duckdb_connection', 'tbl_dbi', 'tbl_sql', 'tbl_lazy', 'tbl')">
tryCatch({
cdm$stage3_treatments_adjusted <- omopgenerics::newCohortTable(
table = cdm$stage3_treatments_adjusted
)
}, error = function(e) {
print(e)
})
#> <simpleError in insertTable.db_cdm(cdm = tableSource(table), name = name, table = cohortSetRef, overwrite = TRUE): Assertion on 'name' failed: Contains missing values (element 1).>
class(cdm$stage3_treatments_adjusted)
#> [1] "cdm_table" "GeneratedCohortSet" "tbl_duckdb_connection"
#> [4] "tbl_dbi" "tbl_sql" "tbl_lazy"
#> [7] "tbl"
Created on 2024-07-12 with reprex v2.1.0
I think this bit of code in omopgenerics needs a second look:
populateCohortSet <- function(table, cohortSetRef) {
if (is.null(cohortSetRef)) {
cohortSetRef <- defaultCohortSet(table)
} else {
cohortSetRef <- cohortSetRef |> dplyr::collect()
}
cohortName <- tableName(table)
assertClass(cohortSetRef, "data.frame", null = TRUE)
cohortSetRef <- dplyr::as_tibble(cohortSetRef)
name <- ifelse(is.na(cohortName), cohortName, paste0(cohortName, "_set"))
cohortSetRef <- insertTable(
cdm = tableSource(table), name = name, table = cohortSetRef,
overwrite = TRUE
)
return(cohortSetRef)
}
If the cohortName is NA the it is still being passed to insertTable
What is the cohortName attribute when someone calls compute with temporary=T on a cdm table? NA_character
Maybe we give an error if the table name is NA (indicating a temp table).
library(CDMConnector)
con <- DBI::dbConnect(duckdb::duckdb(), eunomia_dir())
cdm <- cdm_from_con(con, "main", "main")
cs <- read_cohort_set(system.file("cohorts2", package = "CDMConnector"))
cdm <- generate_cohort_set(cdm, cs)
tbl <- cdm$cohort %>%
dplyr::filter(subject_id %in% c(951L, 2164L)) %>%
compute(temporary = T)
attr(tbl, "tbl_name")
#> [1] NA
class(attr(tbl, "tbl_name"))
#> [1] "character"
is.na(attr(tbl, "tbl_name"))
#> [1] TRUE
tbl %>%
record_cohort_attrition("reason")
#> Error in insertTable.db_cdm(cdm = tableSource(table), name = name, table = cohortSetRef, : Assertion on 'name' failed: Contains missing values (element 1).
cdmDisconnect(cdm)
Created on 2024-07-16 with reprex v2.1.0
The error is because the tbl_name attribute is NA
A work around for this would be:
# Strip "GeneratedCohortSet" from class attribute
class(cdm$my_cohort_table) <- c("cdm_table", "tbl_duckdb_connection", "tbl_dbi", "tbl_sql", "tbl_lazy", "tbl")
# Make new cohort table
cdm$my_cohort_table<- omopgenerics::newCohortTable(table = cdm$my_cohort_table)
I have a cohort table that is a product of a generated cohort table. And adjusted with various
mutate()
,inner_join()
, andgroup_by()
calls (it is ungrouped at the end).If I try to make an actual cohort table out of it - required by
CohortCharacteristics::summariseLargeScaleCharacteristics()
- I get the following error:If I
collect()
the table, insert it into the CDM, and make a cohort table out of it, it works. But that seems a rather tacky work-around, as I have to pull the entire cohort table into memory.The classes of the table that I want to make a cohort table out of:
Am I just missing something?