idem-lab / map-ir-pipeline

Prototype demonstration of stacked generalisation method used in https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3000633#sec010
3 stars 1 forks source link

Ensure workflow works or flags error when phenotypic/genotypic data ratio is off #43

Open njtierney opened 9 months ago

njtierney commented 9 months ago

Currently, if only about 5% of the data is phenotypic, then later on, when the data are split into test/training sets, some held-out folds contain no phenotypic rows at all, so there is nothing left to subset down to.

njtierney commented 9 months ago

Related / demonstration of this:

library(tidyverse)
library(tidymodels)

# Simulate `n` rows of toy mortality data for a single country.
#
# n: number of rows to generate.
#
# Returns a tibble with columns `country` (always "Australia"), `month`
# (1, 2, ..., n) and `percent_mortality` (uniform random draws from runif()).
create_data <- function(n) {
  month <- seq_len(n)
  # One uniform draw per row; values change between calls unless a seed is set.
  percent_mortality <- runif(n)

  tibble(
    country = "Australia",
    month = month,
    percent_mortality = percent_mortality
  )
}

dat_ph <- create_data(5)

dat_geno <- create_data(95)

dat_ph
#> # A tibble: 5 × 3
#>   country   month percent_mortality
#>   <chr>     <int>             <dbl>
#> 1 Australia     1             0.328
#> 2 Australia     2             0.238
#> 3 Australia     3             0.587
#> 4 Australia     4             0.105
#> 5 Australia     5             0.804

dat_geno
#> # A tibble: 95 × 3
#>    country   month percent_mortality
#>    <chr>     <int>             <dbl>
#>  1 Australia     1            0.0681
#>  2 Australia     2            0.689 
#>  3 Australia     3            0.665 
#>  4 Australia     4            0.940 
#>  5 Australia     5            0.969 
#>  6 Australia     6            0.923 
#>  7 Australia     7            0.471 
#>  8 Australia     8            0.775 
#>  9 Australia     9            0.274 
#> 10 Australia    10            0.525 
#> # ℹ 85 more rows

# Stack the phenotype and genotype tables into a single tibble. The argument
# names ("phenotype", "genotype") are written into a new `type` column via
# `.id`, so each row records which input it came from.
ir_data_mn_star <- bind_rows(
  phenotype = dat_ph,
  genotype = dat_geno,
  .id = "type"
)

ir_data_mn_star
#> # A tibble: 100 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     2            0.689 
#>  8 genotype  Australia     3            0.665 
#>  9 genotype  Australia     4            0.940 
#> 10 genotype  Australia     5            0.969 
#> # ℹ 90 more rows

# 10-fold cross-validation over the combined data. No `strata` is given, so
# nothing guarantees that each held-out fold contains a phenotype row.
train_predict <- vfold_cv(
  data = ir_data_mn_star,
  v = 10
)

train_predict
#> #  10-fold cross-validation 
#> # A tibble: 10 × 2
#>    splits          id    
#>    <list>          <chr> 
#>  1 <split [90/10]> Fold01
#>  2 <split [90/10]> Fold02
#>  3 <split [90/10]> Fold03
#>  4 <split [90/10]> Fold04
#>  5 <split [90/10]> Fold05
#>  6 <split [90/10]> Fold06
#>  7 <split [90/10]> Fold07
#>  8 <split [90/10]> Fold08
#>  9 <split [90/10]> Fold09
#> 10 <split [90/10]> Fold10

# Pull the training (analysis) portion out of every resample.
#
# data: an object with a `splits` list-column, e.g. the result of vfold_cv().
#
# Returns a list with one training data set per split.
extract_training <- function(data) {
  folds <- data$splits
  map(folds, training)
}

# Pull the held-out (testing) portion out of every resample and keep only the
# phenotype rows.
#
# data: an object with a `splits` list-column, e.g. the result of vfold_cv().
#
# Returns a list with one data set per split. An element has zero rows when
# that split's held-out fold contains no rows with type == "phenotype".
extract_predict <- function(data) {
  keep_phenotype <- function(fold) filter(fold, type == "phenotype")

  held_out <- map(data$splits, testing)
  map(held_out, keep_phenotype)
}

# Count the phenotype rows in each data set of a list.
#
# data: a list of data frames, each with a `type` column.
#
# Returns a numeric (double) vector with one count per list element: the
# number of rows whose `type` equals "phenotype".
n_pheno <- function(data) {
  count_phenotype <- function(fold) {
    is_phenotype <- fold$type == "phenotype"
    sum(is_phenotype)
  }

  map_dbl(data, count_phenotype)
}

# this should be N* + M*
train_data_mn_star <- extract_training(train_predict)
train_data_mn_star
#> [[1]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     2            0.689 
#>  8 genotype  Australia     3            0.665 
#>  9 genotype  Australia     4            0.940 
#> 10 genotype  Australia     5            0.969 
#> # ℹ 80 more rows
#> 
#> [[2]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     2            0.689 
#>  8 genotype  Australia     4            0.940 
#>  9 genotype  Australia     5            0.969 
#> 10 genotype  Australia     6            0.923 
#> # ℹ 80 more rows
#> 
#> [[3]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 genotype  Australia     1            0.0681
#>  6 genotype  Australia     3            0.665 
#>  7 genotype  Australia     4            0.940 
#>  8 genotype  Australia     5            0.969 
#>  9 genotype  Australia     6            0.923 
#> 10 genotype  Australia     7            0.471 
#> # ℹ 80 more rows
#> 
#> [[4]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     3            0.587 
#>  3 phenotype Australia     4            0.105 
#>  4 phenotype Australia     5            0.804 
#>  5 genotype  Australia     1            0.0681
#>  6 genotype  Australia     2            0.689 
#>  7 genotype  Australia     3            0.665 
#>  8 genotype  Australia     4            0.940 
#>  9 genotype  Australia     5            0.969 
#> 10 genotype  Australia     6            0.923 
#> # ℹ 80 more rows
#> 
#> [[5]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     2            0.689 
#>  8 genotype  Australia     3            0.665 
#>  9 genotype  Australia     4            0.940 
#> 10 genotype  Australia     5            0.969 
#> # ℹ 80 more rows
#> 
#> [[6]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     2            0.238 
#>  2 phenotype Australia     3            0.587 
#>  3 phenotype Australia     4            0.105 
#>  4 phenotype Australia     5            0.804 
#>  5 genotype  Australia     1            0.0681
#>  6 genotype  Australia     2            0.689 
#>  7 genotype  Australia     3            0.665 
#>  8 genotype  Australia     4            0.940 
#>  9 genotype  Australia     5            0.969 
#> 10 genotype  Australia     6            0.923 
#> # ℹ 80 more rows
#> 
#> [[7]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     5            0.804 
#>  5 genotype  Australia     1            0.0681
#>  6 genotype  Australia     2            0.689 
#>  7 genotype  Australia     3            0.665 
#>  8 genotype  Australia     4            0.940 
#>  9 genotype  Australia     5            0.969 
#> 10 genotype  Australia     6            0.923 
#> # ℹ 80 more rows
#> 
#> [[8]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1             0.328
#>  2 phenotype Australia     2             0.238
#>  3 phenotype Australia     3             0.587
#>  4 phenotype Australia     4             0.105
#>  5 phenotype Australia     5             0.804
#>  6 genotype  Australia     2             0.689
#>  7 genotype  Australia     3             0.665
#>  8 genotype  Australia     4             0.940
#>  9 genotype  Australia     5             0.969
#> 10 genotype  Australia     7             0.471
#> # ℹ 80 more rows
#> 
#> [[9]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     4            0.105 
#>  4 phenotype Australia     5            0.804 
#>  5 genotype  Australia     1            0.0681
#>  6 genotype  Australia     2            0.689 
#>  7 genotype  Australia     3            0.665 
#>  8 genotype  Australia     5            0.969 
#>  9 genotype  Australia     6            0.923 
#> 10 genotype  Australia     7            0.471 
#> # ℹ 80 more rows
#> 
#> [[10]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     2            0.689 
#>  8 genotype  Australia     3            0.665 
#>  9 genotype  Australia     4            0.940 
#> 10 genotype  Australia     6            0.923 
#> # ℹ 80 more rows
n_pheno(train_data_mn_star)
#>  [1] 5 5 4 4 5 4 4 5 4 5
# this should just be N*
predict_data_nstar <- extract_predict(train_predict)
predict_data_nstar
#> [[1]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[2]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[3]]
#> # A tibble: 1 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     5             0.804
#> 
#> [[4]]
#> # A tibble: 1 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     2             0.238
#> 
#> [[5]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[6]]
#> # A tibble: 1 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     1             0.328
#> 
#> [[7]]
#> # A tibble: 1 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     4             0.105
#> 
#> [[8]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[9]]
#> # A tibble: 1 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     3             0.587
#> 
#> [[10]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
n_pheno(predict_data_nstar)
#>  [1] 0 0 1 1 0 1 1 0 1 0

# Try stratifying the folds on `type`, hoping each held-out fold gets a share
# of the phenotype rows. In this run it does not help: the phenotype counts
# per held-out fold printed further down are 0 0 2 0 0 0 0 0 2 1.
# NOTE(review): rsample pools strata that fall below `pool` (default 0.1) of
# the data; phenotype is 5% here, so stratification on `type` is probably not
# being applied to those rows at all. Confirm against the vfold_cv() docs.
train_predict_strata <- vfold_cv(
  data = ir_data_mn_star,
  v = 10,
  strata = type
)

# this should be N* + M*
train_data_mn_star_strata <- extract_training(train_predict_strata)
train_data_mn_star_strata
#> [[1]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     3            0.665 
#>  8 genotype  Australia     4            0.940 
#>  9 genotype  Australia     5            0.969 
#> 10 genotype  Australia     6            0.923 
#> # ℹ 80 more rows
#> 
#> [[2]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     2            0.689 
#>  8 genotype  Australia     3            0.665 
#>  9 genotype  Australia     4            0.940 
#> 10 genotype  Australia     5            0.969 
#> # ℹ 80 more rows
#> 
#> [[3]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     5            0.804 
#>  4 genotype  Australia     1            0.0681
#>  5 genotype  Australia     2            0.689 
#>  6 genotype  Australia     3            0.665 
#>  7 genotype  Australia     4            0.940 
#>  8 genotype  Australia     5            0.969 
#>  9 genotype  Australia     6            0.923 
#> 10 genotype  Australia     7            0.471 
#> # ℹ 80 more rows
#> 
#> [[4]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     2            0.689 
#>  8 genotype  Australia     3            0.665 
#>  9 genotype  Australia     4            0.940 
#> 10 genotype  Australia     5            0.969 
#> # ℹ 80 more rows
#> 
#> [[5]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     2            0.689 
#>  8 genotype  Australia     3            0.665 
#>  9 genotype  Australia     4            0.940 
#> 10 genotype  Australia     5            0.969 
#> # ℹ 80 more rows
#> 
#> [[6]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     2            0.689 
#>  8 genotype  Australia     3            0.665 
#>  9 genotype  Australia     4            0.940 
#> 10 genotype  Australia     5            0.969 
#> # ℹ 80 more rows
#> 
#> [[7]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1             0.328
#>  2 phenotype Australia     2             0.238
#>  3 phenotype Australia     3             0.587
#>  4 phenotype Australia     4             0.105
#>  5 phenotype Australia     5             0.804
#>  6 genotype  Australia     2             0.689
#>  7 genotype  Australia     3             0.665
#>  8 genotype  Australia     5             0.969
#>  9 genotype  Australia     6             0.923
#> 10 genotype  Australia     8             0.775
#> # ℹ 80 more rows
#> 
#> [[8]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     2            0.238 
#>  3 phenotype Australia     3            0.587 
#>  4 phenotype Australia     4            0.105 
#>  5 phenotype Australia     5            0.804 
#>  6 genotype  Australia     1            0.0681
#>  7 genotype  Australia     2            0.689 
#>  8 genotype  Australia     4            0.940 
#>  9 genotype  Australia     5            0.969 
#> 10 genotype  Australia     6            0.923 
#> # ℹ 80 more rows
#> 
#> [[9]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     2            0.238 
#>  2 phenotype Australia     3            0.587 
#>  3 phenotype Australia     4            0.105 
#>  4 genotype  Australia     1            0.0681
#>  5 genotype  Australia     2            0.689 
#>  6 genotype  Australia     3            0.665 
#>  7 genotype  Australia     4            0.940 
#>  8 genotype  Australia     6            0.923 
#>  9 genotype  Australia     7            0.471 
#> 10 genotype  Australia     8            0.775 
#> # ℹ 80 more rows
#> 
#> [[10]]
#> # A tibble: 90 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.328 
#>  2 phenotype Australia     3            0.587 
#>  3 phenotype Australia     4            0.105 
#>  4 phenotype Australia     5            0.804 
#>  5 genotype  Australia     1            0.0681
#>  6 genotype  Australia     2            0.689 
#>  7 genotype  Australia     3            0.665 
#>  8 genotype  Australia     4            0.940 
#>  9 genotype  Australia     5            0.969 
#> 10 genotype  Australia     6            0.923 
#> # ℹ 80 more rows
n_pheno(train_data_mn_star_strata)
#>  [1] 5 5 3 5 5 5 5 5 3 4
# this should just be N*
predict_data_nstar_strata <- extract_predict(train_predict_strata)
predict_data_nstar_strata
#> [[1]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[2]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[3]]
#> # A tibble: 2 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     3             0.587
#> 2 phenotype Australia     4             0.105
#> 
#> [[4]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[5]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[6]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[7]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[8]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> #   percent_mortality <dbl>
#> 
#> [[9]]
#> # A tibble: 2 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     1             0.328
#> 2 phenotype Australia     5             0.804
#> 
#> [[10]]
#> # A tibble: 1 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     2             0.238
n_pheno(predict_data_nstar_strata)
#>  [1] 0 0 2 0 0 0 0 0 2 1

## OK but what if we increase the data size?
dat_ph_larger <- create_data(50)
dat_ph_larger
#> # A tibble: 50 × 3
#>    country   month percent_mortality
#>    <chr>     <int>             <dbl>
#>  1 Australia     1            0.893 
#>  2 Australia     2            0.962 
#>  3 Australia     3            0.158 
#>  4 Australia     4            0.380 
#>  5 Australia     5            0.778 
#>  6 Australia     6            0.805 
#>  7 Australia     7            0.0621
#>  8 Australia     8            0.172 
#>  9 Australia     9            0.540 
#> 10 Australia    10            0.412 
#> # ℹ 40 more rows

dat_geno_larger <- create_data(950)
dat_geno_larger
#> # A tibble: 950 × 3
#>    country   month percent_mortality
#>    <chr>     <int>             <dbl>
#>  1 Australia     1           0.227  
#>  2 Australia     2           0.582  
#>  3 Australia     3           0.108  
#>  4 Australia     4           0.0219 
#>  5 Australia     5           1.00   
#>  6 Australia     6           0.645  
#>  7 Australia     7           0.0702 
#>  8 Australia     8           0.533  
#>  9 Australia     9           0.00756
#> 10 Australia    10           0.0439 
#> # ℹ 940 more rows

ir_data_mn_star_larger <- bind_rows(
  phenotype = dat_ph_larger,
  genotype = dat_geno_larger,
  .id = "type"
)

ir_data_mn_star_larger
#> # A tibble: 1,000 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.893 
#>  2 phenotype Australia     2            0.962 
#>  3 phenotype Australia     3            0.158 
#>  4 phenotype Australia     4            0.380 
#>  5 phenotype Australia     5            0.778 
#>  6 phenotype Australia     6            0.805 
#>  7 phenotype Australia     7            0.0621
#>  8 phenotype Australia     8            0.172 
#>  9 phenotype Australia     9            0.540 
#> 10 phenotype Australia    10            0.412 
#> # ℹ 990 more rows

# Same unstratified 10-fold split as before, now on the 1,000-row data
# (50 phenotype + 950 genotype rows), i.e. the same 5% ratio at 10x the size.
train_predict_larger <- vfold_cv(
  data = ir_data_mn_star_larger,
  v = 10
)

train_predict_larger
#> #  10-fold cross-validation 
#> # A tibble: 10 × 2
#>    splits            id    
#>    <list>            <chr> 
#>  1 <split [900/100]> Fold01
#>  2 <split [900/100]> Fold02
#>  3 <split [900/100]> Fold03
#>  4 <split [900/100]> Fold04
#>  5 <split [900/100]> Fold05
#>  6 <split [900/100]> Fold06
#>  7 <split [900/100]> Fold07
#>  8 <split [900/100]> Fold08
#>  9 <split [900/100]> Fold09
#> 10 <split [900/100]> Fold10

# this should be N* + M*
train_data_mn_star_larger <- extract_training(train_predict_larger)
train_data_mn_star_larger
#> [[1]]
#> # A tibble: 900 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.893 
#>  2 phenotype Australia     2            0.962 
#>  3 phenotype Australia     3            0.158 
#>  4 phenotype Australia     4            0.380 
#>  5 phenotype Australia     5            0.778 
#>  6 phenotype Australia     6            0.805 
#>  7 phenotype Australia     7            0.0621
#>  8 phenotype Australia     8            0.172 
#>  9 phenotype Australia    10            0.412 
#> 10 phenotype Australia    11            0.188 
#> # ℹ 890 more rows
#> 
#> [[2]]
#> # A tibble: 900 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.893 
#>  2 phenotype Australia     2            0.962 
#>  3 phenotype Australia     3            0.158 
#>  4 phenotype Australia     4            0.380 
#>  5 phenotype Australia     5            0.778 
#>  6 phenotype Australia     6            0.805 
#>  7 phenotype Australia     7            0.0621
#>  8 phenotype Australia     8            0.172 
#>  9 phenotype Australia     9            0.540 
#> 10 phenotype Australia    10            0.412 
#> # ℹ 890 more rows
#> 
#> [[3]]
#> # A tibble: 900 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.893 
#>  2 phenotype Australia     2            0.962 
#>  3 phenotype Australia     3            0.158 
#>  4 phenotype Australia     4            0.380 
#>  5 phenotype Australia     5            0.778 
#>  6 phenotype Australia     6            0.805 
#>  7 phenotype Australia     7            0.0621
#>  8 phenotype Australia     8            0.172 
#>  9 phenotype Australia     9            0.540 
#> 10 phenotype Australia    10            0.412 
#> # ℹ 890 more rows
#> 
#> [[4]]
#> # A tibble: 900 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.893 
#>  2 phenotype Australia     2            0.962 
#>  3 phenotype Australia     3            0.158 
#>  4 phenotype Australia     4            0.380 
#>  5 phenotype Australia     5            0.778 
#>  6 phenotype Australia     7            0.0621
#>  7 phenotype Australia     8            0.172 
#>  8 phenotype Australia     9            0.540 
#>  9 phenotype Australia    10            0.412 
#> 10 phenotype Australia    11            0.188 
#> # ℹ 890 more rows
#> 
#> [[5]]
#> # A tibble: 900 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1             0.893
#>  2 phenotype Australia     3             0.158
#>  3 phenotype Australia     5             0.778
#>  4 phenotype Australia     6             0.805
#>  5 phenotype Australia     8             0.172
#>  6 phenotype Australia     9             0.540
#>  7 phenotype Australia    11             0.188
#>  8 phenotype Australia    12             0.796
#>  9 phenotype Australia    13             0.124
#> 10 phenotype Australia    14             0.705
#> # ℹ 890 more rows
#> 
#> [[6]]
#> # A tibble: 900 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.893 
#>  2 phenotype Australia     2            0.962 
#>  3 phenotype Australia     3            0.158 
#>  4 phenotype Australia     4            0.380 
#>  5 phenotype Australia     6            0.805 
#>  6 phenotype Australia     7            0.0621
#>  7 phenotype Australia     8            0.172 
#>  8 phenotype Australia     9            0.540 
#>  9 phenotype Australia    10            0.412 
#> 10 phenotype Australia    11            0.188 
#> # ℹ 890 more rows
#> 
#> [[7]]
#> # A tibble: 900 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.893 
#>  2 phenotype Australia     2            0.962 
#>  3 phenotype Australia     3            0.158 
#>  4 phenotype Australia     4            0.380 
#>  5 phenotype Australia     5            0.778 
#>  6 phenotype Australia     6            0.805 
#>  7 phenotype Australia     7            0.0621
#>  8 phenotype Australia     8            0.172 
#>  9 phenotype Australia     9            0.540 
#> 10 phenotype Australia    10            0.412 
#> # ℹ 890 more rows
#> 
#> [[8]]
#> # A tibble: 900 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.893 
#>  2 phenotype Australia     2            0.962 
#>  3 phenotype Australia     3            0.158 
#>  4 phenotype Australia     4            0.380 
#>  5 phenotype Australia     5            0.778 
#>  6 phenotype Australia     6            0.805 
#>  7 phenotype Australia     7            0.0621
#>  8 phenotype Australia     8            0.172 
#>  9 phenotype Australia     9            0.540 
#> 10 phenotype Australia    10            0.412 
#> # ℹ 890 more rows
#> 
#> [[9]]
#> # A tibble: 900 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     2            0.962 
#>  2 phenotype Australia     3            0.158 
#>  3 phenotype Australia     4            0.380 
#>  4 phenotype Australia     5            0.778 
#>  5 phenotype Australia     6            0.805 
#>  6 phenotype Australia     7            0.0621
#>  7 phenotype Australia     9            0.540 
#>  8 phenotype Australia    10            0.412 
#>  9 phenotype Australia    11            0.188 
#> 10 phenotype Australia    12            0.796 
#> # ℹ 890 more rows
#> 
#> [[10]]
#> # A tibble: 900 × 4
#>    type      country   month percent_mortality
#>    <chr>     <chr>     <int>             <dbl>
#>  1 phenotype Australia     1            0.893 
#>  2 phenotype Australia     2            0.962 
#>  3 phenotype Australia     4            0.380 
#>  4 phenotype Australia     5            0.778 
#>  5 phenotype Australia     6            0.805 
#>  6 phenotype Australia     7            0.0621
#>  7 phenotype Australia     8            0.172 
#>  8 phenotype Australia     9            0.540 
#>  9 phenotype Australia    10            0.412 
#> 10 phenotype Australia    11            0.188 
#> # ℹ 890 more rows
n_pheno(train_data_mn_star_larger)
#>  [1] 43 45 48 45 45 44 45 46 44 45
# this should just be N*
predict_data_nstar_larger <- extract_predict(train_predict_larger)
predict_data_nstar_larger
#> [[1]]
#> # A tibble: 7 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     9             0.540
#> 2 phenotype Australia    18             0.882
#> 3 phenotype Australia    22             0.768
#> 4 phenotype Australia    30             0.205
#> 5 phenotype Australia    34             0.545
#> 6 phenotype Australia    36             0.120
#> 7 phenotype Australia    37             0.668
#> 
#> [[2]]
#> # A tibble: 5 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia    14            0.705 
#> 2 phenotype Australia    20            0.771 
#> 3 phenotype Australia    29            0.0224
#> 4 phenotype Australia    39            0.347 
#> 5 phenotype Australia    45            0.368 
#> 
#> [[3]]
#> # A tibble: 2 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia    11             0.188
#> 2 phenotype Australia    15             0.435
#> 
#> [[4]]
#> # A tibble: 5 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     6             0.805
#> 2 phenotype Australia    21             0.356
#> 3 phenotype Australia    26             0.387
#> 4 phenotype Australia    33             0.936
#> 5 phenotype Australia    43             0.623
#> 
#> [[5]]
#> # A tibble: 5 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     2            0.962 
#> 2 phenotype Australia     4            0.380 
#> 3 phenotype Australia     7            0.0621
#> 4 phenotype Australia    10            0.412 
#> 5 phenotype Australia    19            0.122 
#> 
#> [[6]]
#> # A tibble: 6 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     5            0.778 
#> 2 phenotype Australia    25            0.892 
#> 3 phenotype Australia    35            0.612 
#> 4 phenotype Australia    38            0.0343
#> 5 phenotype Australia    40            0.0784
#> 6 phenotype Australia    49            0.226 
#> 
#> [[7]]
#> # A tibble: 5 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia    13             0.124
#> 2 phenotype Australia    17             0.960
#> 3 phenotype Australia    31             0.788
#> 4 phenotype Australia    44             0.288
#> 5 phenotype Australia    46             0.216
#> 
#> [[8]]
#> # A tibble: 4 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia    16             0.179
#> 2 phenotype Australia    41             0.270
#> 3 phenotype Australia    42             0.960
#> 4 phenotype Australia    48             0.164
#> 
#> [[9]]
#> # A tibble: 6 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     1             0.893
#> 2 phenotype Australia     8             0.172
#> 3 phenotype Australia    23             0.316
#> 4 phenotype Australia    24             0.372
#> 5 phenotype Australia    47             0.851
#> 6 phenotype Australia    50             0.711
#> 
#> [[10]]
#> # A tibble: 5 × 4
#>   type      country   month percent_mortality
#>   <chr>     <chr>     <int>             <dbl>
#> 1 phenotype Australia     3             0.158
#> 2 phenotype Australia    12             0.796
#> 3 phenotype Australia    27             0.254
#> 4 phenotype Australia    28             0.754
#> 5 phenotype Australia    32             0.350
n_pheno(predict_data_nstar_larger)
#>  [1] 7 5 2 5 5 6 5 4 6 5

Potential solutions

Created on 2024-02-29 with reprex v2.1.0

Session info

``` r
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.2 (2023-10-31)
#>  os       macOS Sonoma 14.3.1
#>  system   aarch64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Australia/Hobart
#>  date     2024-02-29
#>  pandoc   3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package      * version    date (UTC) lib source
#>  backports      1.4.1      2021-12-13 [1] CRAN (R 4.3.0)
#>  broom        * 1.0.5      2023-06-09 [1] CRAN (R 4.3.0)
#>  class          7.3-22     2023-05-03 [2] CRAN (R 4.3.2)
#>  cli            3.6.2      2023-12-11 [1] CRAN (R 4.3.1)
#>  codetools      0.2-19     2023-02-01 [2] CRAN (R 4.3.2)
#>  colorspace     2.1-0      2023-01-23 [1] CRAN (R 4.3.0)
#>  data.table     1.15.0     2024-01-30 [1] CRAN (R 4.3.1)
#>  dials        * 1.2.0      2023-04-03 [2] CRAN (R 4.3.0)
#>  DiceDesign     1.10       2023-12-07 [2] CRAN (R 4.3.1)
#>  digest         0.6.34     2024-01-11 [1] CRAN (R 4.3.1)
#>  dplyr        * 1.1.4      2023-11-17 [1] CRAN (R 4.3.1)
#>  evaluate       0.23       2023-11-01 [1] CRAN (R 4.3.1)
#>  fansi          1.0.6      2023-12-08 [1] CRAN (R 4.3.1)
#>  fastmap        1.1.1      2023-02-24 [1] CRAN (R 4.3.0)
#>  forcats      * 1.0.0      2023-01-29 [1] CRAN (R 4.3.0)
#>  foreach        1.5.2      2022-02-02 [1] CRAN (R 4.3.0)
#>  fs             1.6.3      2023-07-20 [1] CRAN (R 4.3.0)
#>  furrr          0.3.1      2022-08-15 [2] CRAN (R 4.3.0)
#>  future         1.33.1     2023-12-22 [2] CRAN (R 4.3.1)
#>  future.apply   1.11.1     2023-12-21 [2] CRAN (R 4.3.1)
#>  generics       0.1.3      2022-07-05 [1] CRAN (R 4.3.0)
#>  ggplot2      * 3.4.4      2023-10-12 [1] CRAN (R 4.3.1)
#>  globals        0.16.2     2022-11-21 [2] CRAN (R 4.3.0)
#>  glue           1.7.0      2024-01-09 [1] CRAN (R 4.3.1)
#>  gower          1.0.1      2022-12-22 [2] CRAN (R 4.3.0)
#>  GPfit          1.0-8      2019-02-08 [2] CRAN (R 4.3.0)
#>  gtable         0.3.4      2023-08-21 [1] CRAN (R 4.3.0)
#>  hardhat        1.3.1      2024-02-02 [1] CRAN (R 4.3.1)
#>  hms            1.1.3      2023-03-21 [1] CRAN (R 4.3.0)
#>  htmltools      0.5.7      2023-11-03 [1] CRAN (R 4.3.1)
#>  infer        * 1.0.6      2024-01-31 [2] CRAN (R 4.3.1)
#>  ipred          0.9-14     2023-03-09 [2] CRAN (R 4.3.0)
#>  iterators      1.0.14     2022-02-05 [1] CRAN (R 4.3.0)
#>  knitr          1.45       2023-10-30 [1] CRAN (R 4.3.1)
#>  lattice        0.22-5     2023-10-24 [1] CRAN (R 4.3.1)
#>  lava           1.7.3      2023-11-04 [2] CRAN (R 4.3.1)
#>  lhs            1.1.6      2022-12-17 [2] CRAN (R 4.3.0)
#>  lifecycle      1.0.4      2023-11-07 [1] CRAN (R 4.3.1)
#>  listenv        0.9.1      2024-01-29 [2] CRAN (R 4.3.1)
#>  lubridate    * 1.9.3      2023-09-27 [1] CRAN (R 4.3.1)
#>  magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.3.0)
#>  MASS           7.3-60.0.1 2024-01-13 [1] CRAN (R 4.3.1)
#>  Matrix         1.6-5      2024-01-11 [1] CRAN (R 4.3.1)
#>  modeldata    * 1.3.0      2024-01-21 [2] CRAN (R 4.3.1)
#>  munsell        0.5.0      2018-06-12 [1] CRAN (R 4.3.0)
#>  nnet           7.3-19     2023-05-03 [2] CRAN (R 4.3.2)
#>  parallelly     1.37.0     2024-02-14 [1] CRAN (R 4.3.1)
#>  parsnip      * 1.2.0      2024-02-16 [1] CRAN (R 4.3.1)
#>  pillar         1.9.0      2023-03-22 [1] CRAN (R 4.3.0)
#>  pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.3.0)
#>  prodlim        2023.08.28 2023-08-28 [2] CRAN (R 4.3.0)
#>  purrr        * 1.0.2      2023-08-10 [1] CRAN (R 4.3.0)
#>  R.cache        0.16.0     2022-07-21 [2] CRAN (R 4.3.0)
#>  R.methodsS3    1.8.2      2022-06-13 [2] CRAN (R 4.3.0)
#>  R.oo           1.26.0     2024-01-24 [2] CRAN (R 4.3.1)
#>  R.utils        2.12.3     2023-11-18 [2] CRAN (R 4.3.1)
#>  R6             2.5.1      2021-08-19 [1] CRAN (R 4.3.0)
#>  Rcpp           1.0.12     2024-01-09 [1] CRAN (R 4.3.1)
#>  readr        * 2.1.5      2024-01-10 [1] CRAN (R 4.3.1)
#>  recipes      * 1.0.10     2024-02-18 [1] CRAN (R 4.3.1)
#>  reprex         2.1.0      2024-01-11 [2] CRAN (R 4.3.1)
#>  rlang          1.1.3      2024-01-10 [1] CRAN (R 4.3.1)
#>  rmarkdown      2.25       2023-09-18 [1] CRAN (R 4.3.1)
#>  rpart          4.1.23     2023-12-05 [1] CRAN (R 4.3.1)
#>  rsample      * 1.2.0      2023-08-23 [2] CRAN (R 4.3.0)
#>  rstudioapi     0.15.0     2023-07-07 [1] CRAN (R 4.3.0)
#>  scales       * 1.3.0      2023-11-28 [1] CRAN (R 4.3.1)
#>  sessioninfo    1.2.2      2021-12-06 [2] CRAN (R 4.3.0)
#>  stringi        1.8.3      2023-12-11 [1] CRAN (R 4.3.1)
#>  stringr      * 1.5.1      2023-11-14 [1] CRAN (R 4.3.1)
#>  styler         1.10.2     2023-08-29 [2] CRAN (R 4.3.0)
#>  survival       3.5-8      2024-02-14 [1] CRAN (R 4.3.1)
#>  tibble       * 3.2.1      2023-03-20 [1] CRAN (R 4.3.0)
#>  tidymodels   * 1.1.1      2023-08-24 [2] CRAN (R 4.3.0)
#>  tidyr        * 1.3.1      2024-01-24 [1] CRAN (R 4.3.1)
#>  tidyselect     1.2.0      2022-10-10 [1] CRAN (R 4.3.0)
#>  tidyverse    * 2.0.0      2023-02-22 [2] CRAN (R 4.3.0)
#>  timechange     0.3.0      2024-01-18 [1] CRAN (R 4.3.1)
#>  timeDate       4032.109   2023-12-14 [1] CRAN (R 4.3.1)
#>  tune         * 1.1.2      2023-08-23 [2] CRAN (R 4.3.0)
#>  tzdb           0.4.0      2023-05-12 [1] CRAN (R 4.3.0)
#>  utf8           1.2.4      2023-10-22 [1] CRAN (R 4.3.1)
#>  vctrs          0.6.5      2023-12-01 [1] CRAN (R 4.3.1)
#>  withr          3.0.0      2024-01-16 [1] CRAN (R 4.3.1)
#>  workflows    * 1.1.4      2024-02-19 [1] CRAN (R 4.3.1)
#>  workflowsets * 1.0.1      2023-04-06 [2] CRAN (R 4.3.0)
#>  xfun           0.42       2024-02-08 [1] CRAN (R 4.3.1)
#>  yaml           2.3.8      2023-12-11 [1] CRAN (R 4.3.1)
#>  yardstick    * 1.3.0      2024-01-19 [1] CRAN (R 4.3.1)
#>
#>  [1] /Users/nick/Library/R/arm64/4.3/library
#>  [2] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
#>
#> ──────────────────────────────────────────────────────────────────────────────
```

njtierney commented 8 months ago

This should probably just be a checking function that runs before the inner loop step and flags an error when there is too little phenotypic data to split.