imperialCHEPI / healthgpsrvis

A set of scripts for plotting and visualising data related to Health-GPS
https://imperialchepi.github.io/healthgpsrvis/
Other
0 stars 0 forks source link

Different data process for including confidence interval #40

Closed jzhu20 closed 2 months ago

jzhu20 commented 3 months ago

Confidence interval is obtained from variations in simulations. Therefore, different simulations need to be kept before taking mean, minimum or maximum values. A new data-process R script is as follows.

# Renaming variables if necessary, to remove prefix ----
names(data) <- sub("^mean_", "", names(data))

# Mean of simulations ----
## List in summarise() needs to be in line with data ##
## Excluding missing values ##

gen_data_weighted <- function(data) {

  data_weighted <- data |>
    group_by(source,time,simID) |>
    mutate(prevalence_stroke = prevalence_intracerebralhemorrhage+ prevalence_ischemicstroke + prevalence_subarachnoidhemorrhage,
           incidence_stroke = incidence_intracerebralhemorrhage + incidence_ischemicstroke + incidence_subarachnoidhemorrhage) |>
    summarise(weighted_income = weighted.mean(income,count),
              weighted_sector = weighted.mean(sector,count),
              weighted_sodium = weighted.mean(sodium,count,na.rm=TRUE),
              weighted_carbohydarte = weighted.mean(carbohydrate,count,na.rm=TRUE),
              weighted_fat = weighted.mean(fat,count,na.rm=TRUE),
              weighted_protein = weighted.mean(protein,count,na.rm=TRUE),
              weighted_energyintake = weighted.mean(energyintake,count,na.rm=TRUE),
              weighted_physicalactivity = weighted.mean(physicalactivity,count),
              weighted_bmi = weighted.mean(bmi,count,na.rm=TRUE),
              weighted_height = weighted.mean(height,count),
              weighted_weight = weighted.mean(weight,count,na.rm=TRUE),
              weighted_overweight = weighted.mean(over_weight,count),
              weighted_obesity = weighted.mean(obese_weight,count),
              wprev_ihd = weighted.mean(prevalence_ischemicheartdisease,count,na.rm=TRUE),
              wprev_diabetes = weighted.mean(prevalence_diabetes,count,na.rm=TRUE),
              wprev_stroke = weighted.mean(prevalence_stroke,count,na.rm=TRUE),
              wprev_asthma = weighted.mean(prevalence_asthma,count,na.rm=TRUE),
              wprev_ckd = weighted.mean(prevalence_chronickidneydisease , count,na.rm=TRUE),
              prevcase_ihd = sum(prevalence_ischemicheartdisease * count,na.rm=TRUE),
              prevcase_diabetes = sum(prevalence_diabetes * count,na.rm=TRUE),
              prevcase_stroke = sum(prevalence_stroke * count,na.rm=TRUE),
              prevcase_asthma = sum(prevalence_asthma * count,na.rm=TRUE),
              prevcase_ckd = sum(prevalence_chronickidneydisease * count,na.rm=TRUE),
              totalcase_ihd = sum(incidence_ischemicheartdisease * count,na.rm=TRUE),
              totalcase_diabetes = sum(incidence_diabetes * count,na.rm=TRUE),
              totalcase_stroke = sum(incidence_stroke * count,na.rm=TRUE),
              totalcase_asthma = sum(incidence_asthma * count,na.rm=TRUE),
              totalcase_ckd = sum(incidence_chronickidneydisease * count,na.rm=TRUE),
              weighted_disabilityweight = weighted.mean(disability_weight,count),
              weighted_death = weighted.mean(deaths,count),
              weighted_migrations = weighted.mean(migrations,count),
              total_yll = sum(yll*count,na.rm=TRUE),
              total_yld = sum(yld*count,na.rm=TRUE),
              total_daly = sum(daly*count,na.rm=TRUE))

return(data_weighted) 
}

## Diff ----
gen_data_weighted_rf <- function(data_weighted) {

  data_weighted_rf <- select(data_weighted, source, time, simID, weighted_sodium, 
                             weighted_energyintake, weighted_bmi,weighted_obesity)
  data_weighted_rf_wide <- pivot_wider(data_weighted_rf, 
                                       names_from=source,
                                       id_cols = c(time,simID),
                                       values_from = c(weighted_sodium, weighted_energyintake, 
                                                       weighted_bmi,weighted_obesity))
  data_weighted_rf_wide <- data_weighted_rf_wide |>
    mutate(diff_sodium = weighted_sodium_intervention - weighted_sodium_baseline,
           diff_ei = weighted_energyintake_intervention - weighted_energyintake_baseline,
           diff_bmi = weighted_bmi_intervention - weighted_bmi_baseline,
           diff_obesity = weighted_obesity_intervention - weighted_obesity_baseline)
  data_weighted_rf_wide_collapse <- data_weighted_rf_wide |>
    group_by(time) |>
    summarise(diff_sodium_mean = mean(diff_sodium),
              diff_sodium_min = min(diff_sodium),
              diff_sodium_max = max(diff_sodium),
              diff_ei_mean = mean(diff_ei),
              diff_ei_min = min(diff_ei),
              diff_ei_max = max(diff_ei),
              diff_bmi_mean = mean(diff_bmi),
              diff_bmi_min = min(diff_bmi),
              diff_bmi_max = max(diff_bmi),
              diff_obesity_mean = mean(diff_obesity),
              diff_obesity_min = min(diff_obesity),
              diff_obesity_max = max(diff_obesity))

return(data_weighted_rf_wide_collapse)
}
# Incidence diff ----

gen_data_weighted_ds <- function(data_weighted) {

  data_weighted_ds <- select(data_weighted, source, time, simID, totalcase_ihd, 
                             totalcase_diabetes,totalcase_stroke,totalcase_asthma,
                             totalcase_ckd)
  data_weighted_ds_wide <- pivot_wider(data_weighted_ds, 
                                       names_from=source,
                                       id_cols = c(time,simID),
                                       values_from = c(totalcase_ihd, 
                                                       totalcase_diabetes,
                                                       totalcase_stroke,
                                                       totalcase_asthma,
                                                       totalcase_ckd))
  data_weighted_ds_wide <- data_weighted_ds_wide |>
    mutate(diff_inc_ihd = 100*(totalcase_ihd_intervention - totalcase_ihd_baseline),
           diff_inc_db = 100*(totalcase_diabetes_intervention - totalcase_diabetes_baseline),
           diff_inc_stroke = 100*(totalcase_stroke_intervention - totalcase_stroke_baseline),
           diff_inc_asthma = 100*(totalcase_asthma_intervention - totalcase_asthma_baseline),
           diff_inc_ckd = 100*(totalcase_ckd_intervention - totalcase_ckd_baseline))
  data_weighted_ds_wide <- data_weighted_ds_wide |>
    group_by(simID) |>
    mutate(cumdiff_inc_ihd = cumsum(diff_inc_ihd),
           cumdiff_inc_db = cumsum(diff_inc_db),
           cumdiff_inc_stroke = cumsum(diff_inc_stroke),
           cumdiff_inc_asthma = cumsum(diff_inc_asthma),
           cumdiff_inc_ckd = cumsum(diff_inc_ckd))
  data_weighted_ds_wide_collapse <- data_weighted_ds_wide |>
    group_by(time) |>
    summarise(diff_inc_ihd_mean = mean(cumdiff_inc_ihd),
              diff_inc_ihd_min = min(cumdiff_inc_ihd),
              diff_inc_ihd_max = max(cumdiff_inc_ihd),
              diff_inc_db_mean = mean(cumdiff_inc_db),
              diff_inc_db_min = min(cumdiff_inc_db),
              diff_inc_db_max = max(cumdiff_inc_db),
              diff_inc_stroke_mean = mean(cumdiff_inc_stroke),
              diff_inc_stroke_min = min(cumdiff_inc_stroke),
              diff_inc_stroke_max = max(cumdiff_inc_stroke),
              diff_inc_asthma_mean = mean(cumdiff_inc_asthma),
              diff_inc_asthma_min = min(cumdiff_inc_asthma),
              diff_inc_asthma_max = max(cumdiff_inc_asthma),
              diff_inc_ckd_mean = mean(cumdiff_inc_ckd),
              diff_inc_ckd_min = min(cumdiff_inc_ckd),
              diff_inc_ckd_max = max(cumdiff_inc_ckd)
    )

return(data_weighted_ds_wide_collapse)
}

# Burden of disease ----

gen_data_weighted_burden <- function(data_weighted) {

  data_weighted_burden <- select(data_weighted, source, time, simID, total_yll, 
                                 total_yld,total_daly)
  data_weighted_burden_wide <- pivot_wider(data_weighted_burden, 
                                           names_from=source,
                                           id_cols = c(time,simID),
                                           values_from = c(total_yll,
                                                           total_yld,
                                                           total_daly))

  data_weighted_burden_wide <- data_weighted_burden_wide |>
    mutate(diff_yll = (total_yll_intervention - total_yll_baseline)/1000,
           diff_yld = (total_yld_intervention - total_yld_baseline)/1000,
           diff_daly = (total_daly_intervention - total_daly_baseline)/1000)

  data_weighted_burden_wide <- data_weighted_burden_wide |>
    group_by(simID) |>
    mutate(cumdiff_daly = cumsum(diff_daly),
           cumdiff_yll = cumsum(diff_yll),
           cumdiff_yld = cumsum(diff_yld))

  data_weighted_burden_wide_collapse <- data_weighted_burden_wide |>
    group_by(time) |>
    summarise(diff_daly_mean = mean(diff_daly),
              diff_daly_min = min(diff_daly),
              diff_daly_max = max(diff_daly),
              diff_yll_mean = mean(diff_yll),
              diff_yll_min = min(diff_yll),
              diff_yll_max = max(diff_yll),
              diff_yld_mean = mean(diff_yld),
              diff_yld_min = min(diff_yld),
              diff_yld_max = max(diff_yld),
              cumdiff_daly_mean = mean(cumdiff_daly),
              cumdiff_daly_min = min(cumdiff_daly),
              cumdiff_daly_max = max(cumdiff_daly),
              cumdiff_yll_mean = mean(cumdiff_yll),
              cumdiff_yll_min = min(cumdiff_yll),
              cumdiff_yll_max = max(cumdiff_yll),
              cumdiff_yld_mean = mean(cumdiff_yld),
              cumdiff_yld_min = min(cumdiff_yld),
              cumdiff_yld_max = max(cumdiff_yld))

return(data_weighted_burden_wide_collapse)
}

############
## This last function is data smoothing
## It is applied manually now in India project due to abnormal positive values in diff_daly or cumdiff_daly

gen_data_weighted_burden_spline <- function(data_weighted_burden) {

  ## Only keep those 0 or negative values

  ## Notes: Delete years 27,30,32-33 for ps3-low; Delete years 2028 for ps4-low

  data_weighted_burden_mean <- data_weighted_burden |>
    filter(cumdiff_daly_mean <= 0)

  data_weighted_burden_min <- data_weighted_burden |>
    filter(cumdiff_daly_min <= 0)

  ## Notes: Delete years 29, 31 for ps2-high; Delete 37-38 for ps3-low; Delete 33-34 for ps4-middle; Delete 36-38 for ps4-low

 data_weighted_burden_max <- data_weighted_burden |>
    filter(cumdiff_daly_max <= 0)

  ## New data frame
  data_weighted_burden_spline <- data.frame(time = seq(min(data_weighted_burden$time), max(data_weighted_burden$time), length.out = 34))

  ## Fit spline and predict
  spline_fit <- interpSpline(data_weighted_burden_mean$time, data_weighted_burden_mean$cumdiff_daly_mean)
  data_weighted_burden_spline$cumdiff_daly_mean <- predict(spline_fit, data_weighted_burden_spline$time)$y

  spline_fit_min <- interpSpline(data_weighted_burden_min$time, data_weighted_burden_min$cumdiff_daly_min)
  data_weighted_burden_spline$cumdiff_daly_min <- predict(spline_fit_min, data_weighted_burden_spline$time)$y

  ## Use smooth.spline for ps4-low
  spline_fit_max <- interpSpline(data_weighted_burden_max$time, data_weighted_burden_max$cumdiff_daly_max)
  data_weighted_burden_spline$cumdiff_daly_max <- predict(spline_fit_max, data_weighted_burden_spline$time)$y

  ## Keep 0 values in the first two years
  data_weighted_burden_spline$cumdiff_daly_mean <- ifelse(data_weighted_burden_spline$time<2024, 0, data_weighted_burden_spline$cumdiff_daly_mean)
  data_weighted_burden_spline$cumdiff_daly_min <- ifelse(data_weighted_burden_spline$time<2024, 0, data_weighted_burden_spline$cumdiff_daly_min)
  data_weighted_burden_spline$cumdiff_daly_max <- ifelse(data_weighted_burden_spline$time<2024, 0, data_weighted_burden_spline$cumdiff_daly_max)

  return(data_weighted_burden_spline)
}