darwin-eu / PatientProfiles

https://darwin-eu.github.io/PatientProfiles/
Apache License 2.0
8 stars 5 forks source link

Idea: addCharlsonScore function #320

Closed ablack3 closed 11 months ago

ablack3 commented 1 year ago

Might be helpful...

add_charlson_score <- function(x) {
  require(CDMConnector)

  checkmate::assert_class(x, "GeneratedCohortSet")
  checkmate::assert_class(x, "tbl")
  cdm <- attr(x, "cdm_reference")
  checkmate::assert_class(cdm, "cdm_reference")
  con <- attr(cdm, "dbcon")
  checkmate::assert_true(DBI::dbIsValid(con))
  assert_write_schema(cdm)
  write_schema <- attr(cdm, "write_schema")

  assert_tables(cdm, c("concept", "concept_ancestor", "condition_era"))

  charlson_scoring <- tibble::tribble(
    ~diag_category_id, ~diag_category_name, ~weight, ~concept_id,
     1, 'Myocardial infarction',       1, 4329847,
     2, 'Congestive heart failure',    1, 316139,
     3, 'Peripheral vascular disease', 1, 321052,
     4, 'Cerebrovascular disease',     1, c(381591, 434056),
     5, 'Dementia',                    1, 4182210,
     6, 'Chronic pulmonary disease',   1, 4063381,
     7, 'Rheumatologic disease',       1, c(257628, 134442, 80800, 80809, 256197, 255348),
     8, 'Peptic ulcer disease',        1, 4247120,
     9, 'Mild liver disease',          1, c(4064161, 4212540),
    10, 'Diabetes (mild to moderate)', 1, 201820,
    11, 'Diabetes with chronic complications', 2, c(443767, 442793),
    12, 'Hemoplegia or paralegia', 2, c(192606, 374022),
    13, 'Renal disease', 2, 4030518,
    14, 'Any malignancy', 2, 443392,
    15, 'Moderate to severe liver disease', 3, c(4245975, 4029488, 192680, 24966),
    16, 'Metastatic solid tumor', 6, 432851,
    17, 'AIDS', 6, 439727) %>%
    tidyr::unnest(concept_id) %>%
    dplyr::mutate(concept_id = as.integer(concept_id))

  tempname <- paste0("temp", floor(as.numeric(Sys.time())*10) %% 1e6, "_charlson")

  DBI::dbWriteTable(con,
                    inSchema(write_schema, tempname, dbms = dbms(con)),
                    charlson_scoring)

  charlson <- dplyr::tbl(con, inSchema(write_schema, tempname, dbms = dbms(con))) %>%
    dplyr::inner_join(cdm$concept_ancestor, by = c("concept_id" = "ancestor_concept_id")) %>%
    dplyr::select(diag_category_name, concept_id = "descendant_concept_id", "weight")

  conditions <- cdm$condition_era %>%
    dplyr::select(subject_id = "person_id",
                  concept_id = "condition_concept_id",
                  start_date = "condition_era_start_date")

  x %>%
    dplyr::inner_join(conditions, by = "subject_id") %>%
    dplyr::inner_join(charlson, by = "concept_id") %>%
    dplyr::filter(.data$start_date <= .data$cohort_start_date) %>%
    dplyr::select("cohort_definition_id",
                    "cohort_start_date",
                    "subject_id",
                    "diag_category_name",
                    "weight") %>%
    dplyr::distinct() %>%
    dplyr::group_by(.data$cohort_definition_id,
                    .data$cohort_start_date,
                    .data$subject_id) %>%
    dplyr::summarise(charlson_score = sum(weight, na.rm = TRUE), .groups = "drop") %>%
    dplyr::left_join(x, ., by = c("cohort_definition_id",
                                  "cohort_start_date",
                                  "subject_id")) %>%
    dplyr::mutate(charlson_score = coalesce(charlson_score, 0L)) %>%
    computeQuery()
}

library(CDMConnector) # using v1.1 (dev version)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

example_datasets()
#>  [1] "GiBleed"                             "synthea-allergies-10k"              
#>  [3] "synthea-anemia-10k"                  "synthea-breast_cancer-10k"          
#>  [5] "synthea-contraceptives-10k"          "synthea-covid19-10k"                
#>  [7] "synthea-covid19-200k"                "synthea-dermatitis-10k"             
#>  [9] "synthea-heart-10k"                   "synthea-hiv-10k"                    
#> [11] "synthea-lung_cancer-10k"             "synthea-medications-10k"            
#> [13] "synthea-metabolic_syndrome-10k"      "synthea-opioid_addiction-10k"       
#> [15] "synthea-rheumatoid_arthritis-10k"    "synthea-snf-10k"                    
#> [17] "synthea-surgery-10k"                 "synthea-total_joint_replacement-10k"
#> [19] "synthea-veteran_prostate_cancer-10k" "synthea-veterans-10k"               
#> [21] "synthea-weight_loss-10k"

con <- DBI::dbConnect(duckdb::duckdb(), eunomia_dir("synthea-lung_cancer-10k"))
cdm <- cdm_from_con(con, cdm_schema = "main", write_schema = "main")

# look at common conditions to find lung cancer codes
cdm %>%
  cdm_flatten(domain = "condition") %>%
  count(observation_concept_id, observation_concept_name, sort = TRUE)
#> # Source:     SQL [10 x 3]
#> # Database:   DuckDB 0.8.1 [root@Darwin 21.6.0:R 4.2.2//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpM0xqMs/file1ad18c4b5a0.duckdb]
#> # Ordered by: desc(n)
#>    observation_concept_id observation_concept_name                             n
#>                     <int> <chr>                                            <dbl>
#>  1                 317576 Coronary arteriosclerosis                         1270
#>  2                 381316 Cerebrovascular accident                           958
#>  3                 313217 Atrial fibrillation                                792
#>  4                 321042 Cardiac arrest                                     515
#>  5                4329847 Myocardial infarction                              504
#>  6                 439777 Anemia                                              76
#>  7                4115276 Non-small cell lung cancer                          74
#>  8                4310703 Non-small cell carcinoma of lung, TNM stage 1       74
#>  9                4110591 Small cell carcinoma of lung                        18
#> 10               37395648 Primary small cell malignant neoplasm of lung, …    18

cdm <- generate_concept_cohort_set(cdm,
                                   concept_set = list(lung_cancer = c(4310703,4115276,3739564,4110591)),
                                   name = "cohort",
                                   overwrite = TRUE)
cdm$cohort
#> # Source:   SQL [?? x 4]
#> # Database: DuckDB 0.8.1 [root@Darwin 21.6.0:R 4.2.2//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpM0xqMs/file1ad18c4b5a0.duckdb]
#>    cohort_definition_id subject_id cohort_start_date cohort_end_date
#>                   <int>      <int> <date>            <date>         
#>  1                    1       3138 2019-10-26        2023-06-09     
#>  2                    1       3262 1996-01-09        2001-01-20     
#>  3                    1       4062 2015-06-14        2018-09-04     
#>  4                    1       5047 2008-11-24        2012-01-04     
#>  5                    1       6468 1989-12-29        1994-06-02     
#>  6                    1       7230 2006-04-14        2011-03-05     
#>  7                    1       8075 2004-04-26        2010-03-23     
#>  8                    1       8267 2015-05-20        2019-03-26     
#>  9                    1      10416 2014-09-03        2019-01-20     
#> 10                    1        269 2007-08-14        2012-09-06     
#> # ℹ more rows

cdm$cohort <- add_charlson_score(cdm$cohort)

cdm$cohort %>%
  select(cohort_definition_id, subject_id, charlson_score)
#> # Source:   SQL [?? x 3]
#> # Database: DuckDB 0.8.1 [root@Darwin 21.6.0:R 4.2.2//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//RtmpM0xqMs/file1ad18c4b5a0.duckdb]
#>    cohort_definition_id subject_id charlson_score
#>                   <int>      <int>          <dbl>
#>  1                    1      10754              2
#>  2                    1      10416              2
#>  3                    1      10031              2
#>  4                    1       1034              2
#>  5                    1      10313              2
#>  6                    1      10193              2
#>  7                    1      10092              2
#>  8                    1       9478              2
#>  9                    1       9365              2
#> 10                    1       9183              2
#> # ℹ more rows

cdm$cohort %>% 
  pull(charlson_score) %>% 
  summary
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>   2.000   2.000   2.000   2.054   2.000   3.000

DBI::dbDisconnect(con, shutdown = TRUE)

Created on 2023-08-16 with reprex v2.0.2

catalamarti commented 1 year ago

Thank you @ablack3 definitely worth implementing it! I dont know if we should use condition_era, because not always is populated. But we can implement the same via condition_occurrence. What do you think @edward-burn ?

edward-burn commented 1 year ago

I think having this would be very nice, but I'd also have a preference to use the condition occurrence table.

A little like addDemographics and addAge, it could be nice to have addIndex and addCharlson approach so that we could add elixhauser etc (and share calculation using the condition occurrence table for efficiency)

catalamarti commented 1 year ago

Happy with this approach