dkesada / dbnR

Gaussian dynamic Bayesian networks structure learning and inference based on the bnlearn package
GNU General Public License v3.0
44 stars 10 forks source link

Standard format of f_dt #11

Closed zh-zhang1984 closed 2 years ago

zh-zhang1984 commented 2 years ago

I formatted my dataframe with the following names :

> names(phenoData_wide)
  [1] "age"                "sex"                "height"            
  [4] "weight"             "diabete"            "hyperten"          
  [7] "myoinfarc"          "cardiofailure"      "cerebrovasc"       
 [10] "dementia"           "copd"               "paralysis"         
 [13] "renafailure"        "mort_flg"           "MV_days"           
 [16] "CRRT_days"          "VASO_days"          "Hospital_days"     
 [19] "Batch"              "SampleLocation"     "hrmax_t_1"         
 [22] "hrmax_t_3"          "hrmax_t_5"          "hrmin_t_1"         
 [25] "hrmin_t_3"          "hrmin_t_5"          "mapmax_t_1"        
 [28] "mapmax_t_3"         "mapmax_t_5"         "mapmin_t_1"        
 [31] "mapmin_t_3"         "mapmin_t_5"         "sapmax_t_1"        
 [34] "sapmax_t_3"         "sapmax_t_5"         "sapmin_t_1"        
 [37] "sapmin_t_3"         "sapmin_t_5"         "rrmax_t_1"         
 [40] "rrmax_t_3"          "rrmax_t_5"          "rrmin_t_1"         
 [43] "rrmin_t_3"          "rrmin_t_5"          "tmax_t_1"          
 [46] "tmax_t_3"           "tmax_t_5"           "tmin_t_1"          
 [49] "tmin_t_3"           "tmin_t_5"           "mv_t_1"            
 [52] "mv_t_3"             "mv_t_5"             "crrt_t_1"          
 [55] "crrt_t_3"           "crrt_t_5"           "gcs_t_1"           
 [58] "gcs_t_3"            "gcs_t_5"            "lac_t_1"           
 [61] "lac_t_3"            "lac_t_5"            "k_t_1"             
 [64] "k_t_3"              "k_t_5"              "na_t_1"            
 [67] "na_t_3"             "na_t_5"             "cl_t_1"            
 [70] "cl_t_3"             "cl_t_5"             "ca_t_1"            
 [73] "ca_t_3"             "ca_t_5"             "pha_t_1"           
 [76] "pha_t_3"            "pha_t_5"            "paco_t_1"          
 [79] "paco_t_3"           "paco_t_5"           "pao_t_1"           
 [82] "pao_t_3"            "pao_t_5"            "abe_t_1"           
 [85] "abe_t_3"            "abe_t_5"            "fio_t_1"           
 [88] "fio_t_3"            "fio_t_5"            "SaO2_t_1"          
 [91] "SaO2_t_3"           "SaO2_t_5"           "procal_t_1"        
 [94] "procal_t_3"         "procal_t_5"         "phcv_t_1"          
 [97] "phcv_t_3"           "phcv_t_5"           "pcvco_t_1"         
[100] "pcvco_t_3"          "pcvco_t_5"          "pcvo_t_1"          
[103] "pcvo_t_3"           "pcvo_t_5"           "scvo_t_1"          
[106] "scvo_t_3"           "scvo_t_5"           "bun_t_1"           
[109] "bun_t_3"            "bun_t_5"            "alb_t_1"           
[112] "alb_t_3"            "alb_t_5"            "cr_t_1"            
[115] "cr_t_3"             "cr_t_5"             "bilirubin_t_1"     
[118] "bilirubin_t_3"      "bilirubin_t_5"      "crp_t_1"           
[121] "crp_t_3"            "crp_t_5"            "wbc_t_1"           
[124] "wbc_t_3"            "wbc_t_5"            "hct_t_1"           
[127] "hct_t_3"            "hct_t_5"            "plt_t_1"           
[130] "plt_t_3"            "plt_t_5"            "inr_t_1"           
[133] "inr_t_3"            "inr_t_5"            "aptt_t_1"          
[136] "aptt_t_3"           "aptt_t_5"           "tt_t_1"            
[139] "tt_t_3"             "tt_t_5"             "ddimer_t_1"        
[142] "ddimer_t_3"         "ddimer_t_5"         "urine_t_1"         
[145] "urine_t_3"          "urine_t_5"          "sofa_pf_t_1"       
[148] "sofa_pf_t_3"        "sofa_pf_t_5"        "sofa_plat_t_1"     
[151] "sofa_plat_t_3"      "sofa_plat_t_5"      "sofa_GCS_t_1"      
[154] "sofa_GCS_t_3"       "sofa_GCS_t_5"       "sofa_bilirubin_t_1"
[157] "sofa_bilirubin_t_3" "sofa_bilirubin_t_5" "sofa_vaso_t_1"     
[160] "sofa_vaso_t_3"      "sofa_vaso_t_5"      "sofa_cr_t_1"       
[163] "sofa_cr_t_3"        "sofa_cr_t_5"        "sofa_uo_t_1"       
[166] "sofa_uo_t_3"        "sofa_uo_t_5"        "SOFA_t_1"          
[169] "SOFA_t_3"           "SOFA_t_5"           "UTI_flg_t_1"       
[172] "UTI_flg_t_3"        "UTI_flg_t_5"        "UTI_dose_t_1"      
[175] "UTI_dose_t_3"       "UTI_dose_t_5"       "fluidin_t_1"       
[178] "fluidin_t_3"        "fluidin_t_5"        "fluidout_t_1"      
[181] "fluidout_t_3"       "fluidout_t_5"       "pf_t_1"            
[184] "pf_t_3"             "pf_t_5"   

I think this is not the standard f_dt format, and the function reported an error:

> net <- dbnR::learn_dbn_struc(
+   phenoData_wide, 
+   f_dt = phenoData_wide, method = "dmmhc", 
+   #blacklist = blacklist,
+   #blacklist_tr = blacklist_tr, 
+   restrict = "mmpc", maximize = "hc",
+   restrict.args = list(test = "cor"),
+   maximize.args = list(score = "bic-g", maxp = 10))
Error in initial_folded_dt_check(f_dt) : 
  the data.frame is not properly time formatted.

How may I model data that contains both time-varying and time-fixed variables?

dkesada commented 2 years ago

Hi @zh-zhang1984

The dataset used for learning the network structure needs to have all variables with the proper names from t_0 to t_n, without skipping time slices in between. You can use the fold_dt function to generate the dataset automatically, learn the structure of a DBN with that data using learn_dbn_struc and then afterwards drop the arcs you don't want from that network with bnlearn::drop.arc. You can drop the arcs of all the nodes you do not want in the network; for example, to remove all the arcs from the nodes in t_2 we could do:

# Remove every arc that starts from or points to a node in time slice t_2.
# Expects `f_dt` (the folded dataset) and `net` (the learned structure, with
# an `arcs` two-column from/to matrix) to already exist in the session.

# Names of the nodes in t_2; the pattern is anchored so that slices such as
# t_20 or t_21 are not matched by accident
names_t2 <- names(f_dt)[grepl("_t_2$", names(f_dt))]

# drop = FALSE keeps the two-column matrix shape even if only one arc matches
from_t2 <- net$arcs[net$arcs[, 1] %in% names_t2, , drop = FALSE] # Arcs coming out from t_2
to_t2 <- net$arcs[net$arcs[, 2] %in% names_t2, , drop = FALSE] # Arcs pointing to t_2

# An arc between two t_2 nodes shows up in both subsets; keep it only once
arcs_t2 <- unique(rbind(from_t2, to_t2))

# seq_len() visits every row and does nothing when there are no arcs to drop
for (i in seq_len(nrow(arcs_t2))) {
  net <- bnlearn::drop.arc(net, from = arcs_t2[i, 1], to = arcs_t2[i, 2]) # Delete the i-th arc
}

After that, you can fit the parameters as usual with fit_dbn_params. With a similar procedure, you can have time-fixed variables appear in all time-slices in the network but only allow arcs from a single time-slice afterwards. Their value can also be fixed during forecasting with the prov_ev argument of the forecast_ts function. The thing is that all the nodes from t_0 to t_n need to be present in the network for the moving window of the forecasting, even if the nodes do not have any arcs.

Also, you seem to have some categorical variables in your data (sex, diabete, ... ). Keep in mind that dbnR only works for continuous data and it does not allow creating hybrid networks.

zh-zhang1984 commented 2 years ago

Thank you for your detailed instructions, that's really helpful.