njtierney / mmcc

Fast, tidy functions for mcmc diagnostics and summaries, built with data.table
http://mmcc.njtierney.com/
Other
24 stars 5 forks source link

Separating parameters and their indices #44

Open bquilty25 opened 5 years ago

bquilty25 commented 5 years ago

Feature request: Ability to separate parameters and their indices into multiple columns, e.g beta[3,1] becomes beta, 3, 1.

samclifford commented 5 years ago

Thanks Billy.

I've had a look over this and I think something like the following will work. I've made a MWE to show how it can be done but haven't tested it on the output from mmcc::tidy(). You end up with a lot of NAs as there'll be uneven indexing levels.

## MWE for dealing with indices
## we want to be able to separate out the indexing levels from the base variable

library(tidyverse)
library(data.table)

x <- expand.grid(parameter = "beta",
                 i = 1:2,
                 j = 1:10) %>%
    dplyr::transmute(parameter = sprintf("%s[%s,%s]", parameter, i, j)) %>%
    dplyr::bind_rows(data.frame(parameter = sprintf("gamma[%s]", 1:2))) %>%
    dplyr::bind_rows(data.frame(parameter = "tau.y")) %>%
    data.table::data.table(.)

label_to_df <- function(x){

    x <- unlist(strsplit(x, "(\\[|\\]|,)"))
    names(x) <- paste0("level", 0:(length(x)-1))
    return(data.frame(as.list(x)))

}

parameter_to_index <- function(x){

    parameters <- as.list(unlist(x$parameter))

    parameters_levelled <- lapply(FUN = label_to_df, X = parameters)
    names(parameters_levelled) <- parameters

    parameters_df <- dplyr::bind_rows(parameters_levelled, 
                                      .id="parameter") %>%
        data.table::data.table(.) %>%
        data.table:::merge.data.table(x, .)

    return(parameters_df)

}

parameter_to_index(x)
samclifford commented 5 years ago

The function names here are probably not the greatest, but thanks for the suggestion. Should work with tidy and mcmc_to_dt functions.

njtierney commented 5 years ago

Looks good to me, I've made a few small changes to your code @samclifford and tried some different function names - let me know what you think.

## MWE for dealing with indices
## we want to be able to separate out the indexing levels from the base variable
## Nick minor edit: 
    # * use `glue` instead of `sprintf`, 
    # * use `tibble::add_row`
    # * use `purrr::map` and `purrr::set_names`
    # * try different function names

library(data.table)
library(tidyverse)

dat_param <- expand.grid(parameter = "beta",
                 i = 1:2,
                 j = 1:4) %>%
    dplyr::transmute(parameter = glue::glue("{parameter}[{i},{j}]")) %>%
    tibble::add_row(parameter = glue::glue("gamma[{1:2}]")) %>%
    tibble::add_row(parameter = "tau.y") %>%
    data.table::data.table(.)

dat_param
#>     parameter
#>  1: beta[1,1]
#>  2: beta[2,1]
#>  3: beta[1,2]
#>  4: beta[2,2]
#>  5: beta[1,3]
#>  6: beta[2,3]
#>  7: beta[1,4]
#>  8: beta[2,4]
#>  9:  gamma[1]
#> 10:  gamma[2]
#> 11:     tau.y

flatten_params <- function(x){

    x <- unlist(strsplit(x, "(\\[|\\]|,)"))
    names(x) <- paste0("level", 0:(length(x)-1))
    return(data.frame(as.list(x)))

}

flatten_params("beta[1,1]")
#>   level0 level1 level2
#> 1   beta      1      1
flatten_params("beta[1,2]")
#>   level0 level1 level2
#> 1   beta      1      2

flatten_params_df <- function(x){

    parameters_levelled <- purrr::map(x$parameter, flatten_params) %>%
    purrr::set_names(nm = x$parameter)

    parameters_df <- dplyr::bind_rows(parameters_levelled, 
                                      .id = "parameter") %>%
        data.table::data.table(.) %>%
        data.table:::merge.data.table(x, .)

    return(parameters_df)

}

dat_param
#>     parameter
#>  1: beta[1,1]
#>  2: beta[2,1]
#>  3: beta[1,2]
#>  4: beta[2,2]
#>  5: beta[1,3]
#>  6: beta[2,3]
#>  7: beta[1,4]
#>  8: beta[2,4]
#>  9:  gamma[1]
#> 10:  gamma[2]
#> 11:     tau.y

flatten_params_df(dat_param)
#> Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector

#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#>     parameter level0 level1 level2
#>  1: beta[1,1]   beta      1      1
#>  2: beta[1,2]   beta      1      2
#>  3: beta[1,3]   beta      1      3
#>  4: beta[1,4]   beta      1      4
#>  5: beta[2,1]   beta      2      1
#>  6: beta[2,2]   beta      2      2
#>  7: beta[2,3]   beta      2      3
#>  8: beta[2,4]   beta      2      4
#>  9:  gamma[1]  gamma      1   <NA>
#> 10:  gamma[2]  gamma      2   <NA>
#> 11:     tau.y  tau.y   <NA>   <NA>

Created on 2019-03-25 by the reprex package (v0.2.1)

What do you think about wrapping this up in an option in tidy and mcmc_to_dt as flatten_params = TRUE/FALSE?

samclifford commented 5 years ago

Thanks @njtierney. I think flatten_params as a function name and an argument to tidy works well. Happy with the edited code, I was just trying to reduce dependencies.