Open njtierney opened 9 months ago
Related / demonstration of this:
library(tidyverse)
library(tidymodels)
# Simulate `n` monthly records for a single country.
#
# Returns a tibble with `n` rows and three columns: `country` (always
# "Australia"), `month` (1, 2, ..., n) and `percent_mortality` (one
# `runif()` draw per row, so results differ between calls unless a seed is set).
create_data <- function(n) {
  months <- seq_len(n)
  mortality <- runif(n)
  tibble(
    country = "Australia",
    month = months,
    percent_mortality = mortality
  )
}
# Unbalanced sizes: 5 "ph" rows against 95 "geno" rows (5% of the 100 total).
dat_ph <- create_data(5)
dat_geno <- create_data(95)
dat_ph
#> # A tibble: 5 × 3
#> country month percent_mortality
#> <chr> <int> <dbl>
#> 1 Australia 1 0.328
#> 2 Australia 2 0.238
#> 3 Australia 3 0.587
#> 4 Australia 4 0.105
#> 5 Australia 5 0.804
dat_geno
#> # A tibble: 95 × 3
#> country month percent_mortality
#> <chr> <int> <dbl>
#> 1 Australia 1 0.0681
#> 2 Australia 2 0.689
#> 3 Australia 3 0.665
#> 4 Australia 4 0.940
#> 5 Australia 5 0.969
#> 6 Australia 6 0.923
#> 7 Australia 7 0.471
#> 8 Australia 8 0.775
#> 9 Australia 9 0.274
#> 10 Australia 10 0.525
#> # ℹ 85 more rows
# Stack both tables into one; `.id` adds a `type` column holding the name
# ("phenotype" or "genotype") of the table each row came from.
ir_data_mn_star <- list(phenotype = dat_ph, genotype = dat_geno) |>
  bind_rows(.id = "type")
ir_data_mn_star
#> # A tibble: 100 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 2 0.689
#> 8 genotype Australia 3 0.665
#> 9 genotype Australia 4 0.940
#> 10 genotype Australia 5 0.969
#> # ℹ 90 more rows
# 10-fold cross-validation over every row; no `strata` is supplied here.
train_predict <- ir_data_mn_star |>
  vfold_cv(v = 10)
train_predict
#> # 10-fold cross-validation
#> # A tibble: 10 × 2
#> splits id
#> <list> <chr>
#> 1 <split [90/10]> Fold01
#> 2 <split [90/10]> Fold02
#> 3 <split [90/10]> Fold03
#> 4 <split [90/10]> Fold04
#> 5 <split [90/10]> Fold05
#> 6 <split [90/10]> Fold06
#> 7 <split [90/10]> Fold07
#> 8 <split [90/10]> Fold08
#> 9 <split [90/10]> Fold09
#> 10 <split [90/10]> Fold10
# Collect the training portion of every resample in `data`.
#
# `data` must carry a `splits` element (the list-column printed for the
# `vfold_cv()` results in this script). Returns a list with `training()`
# applied to each split, in the same order.
extract_training <- function(data) {
  data$splits |>
    map(\(split) training(split))
}
# Collect the phenotype rows held out by each resample in `data`.
#
# For every element of `data$splits`, takes `testing()` of the split and keeps
# only the rows whose `type` is "phenotype". Returns one table per split; a
# table has zero rows when that split held out no phenotype rows.
extract_predict <- function(data) {
  map(
    data$splits,
    \(split) {
      held_out <- testing(split)
      filter(held_out, type == "phenotype")
    }
  )
}
# Count the phenotype rows in each table of `data`.
#
# `data` is a list of tables that each have a `type` column. Returns a double
# vector with one entry per table: the number of rows whose `type` equals
# "phenotype".
n_pheno <- function(data) {
  count_phenotype <- \(tbl) sum(tbl$type == "phenotype")
  map_dbl(data, count_phenotype)
}
# this should be N* + M*
train_data_mn_star <- extract_training(train_predict)
train_data_mn_star
#> [[1]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 2 0.689
#> 8 genotype Australia 3 0.665
#> 9 genotype Australia 4 0.940
#> 10 genotype Australia 5 0.969
#> # ℹ 80 more rows
#>
#> [[2]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 2 0.689
#> 8 genotype Australia 4 0.940
#> 9 genotype Australia 5 0.969
#> 10 genotype Australia 6 0.923
#> # ℹ 80 more rows
#>
#> [[3]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 genotype Australia 1 0.0681
#> 6 genotype Australia 3 0.665
#> 7 genotype Australia 4 0.940
#> 8 genotype Australia 5 0.969
#> 9 genotype Australia 6 0.923
#> 10 genotype Australia 7 0.471
#> # ℹ 80 more rows
#>
#> [[4]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 3 0.587
#> 3 phenotype Australia 4 0.105
#> 4 phenotype Australia 5 0.804
#> 5 genotype Australia 1 0.0681
#> 6 genotype Australia 2 0.689
#> 7 genotype Australia 3 0.665
#> 8 genotype Australia 4 0.940
#> 9 genotype Australia 5 0.969
#> 10 genotype Australia 6 0.923
#> # ℹ 80 more rows
#>
#> [[5]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 2 0.689
#> 8 genotype Australia 3 0.665
#> 9 genotype Australia 4 0.940
#> 10 genotype Australia 5 0.969
#> # ℹ 80 more rows
#>
#> [[6]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 2 0.238
#> 2 phenotype Australia 3 0.587
#> 3 phenotype Australia 4 0.105
#> 4 phenotype Australia 5 0.804
#> 5 genotype Australia 1 0.0681
#> 6 genotype Australia 2 0.689
#> 7 genotype Australia 3 0.665
#> 8 genotype Australia 4 0.940
#> 9 genotype Australia 5 0.969
#> 10 genotype Australia 6 0.923
#> # ℹ 80 more rows
#>
#> [[7]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 5 0.804
#> 5 genotype Australia 1 0.0681
#> 6 genotype Australia 2 0.689
#> 7 genotype Australia 3 0.665
#> 8 genotype Australia 4 0.940
#> 9 genotype Australia 5 0.969
#> 10 genotype Australia 6 0.923
#> # ℹ 80 more rows
#>
#> [[8]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 2 0.689
#> 7 genotype Australia 3 0.665
#> 8 genotype Australia 4 0.940
#> 9 genotype Australia 5 0.969
#> 10 genotype Australia 7 0.471
#> # ℹ 80 more rows
#>
#> [[9]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 4 0.105
#> 4 phenotype Australia 5 0.804
#> 5 genotype Australia 1 0.0681
#> 6 genotype Australia 2 0.689
#> 7 genotype Australia 3 0.665
#> 8 genotype Australia 5 0.969
#> 9 genotype Australia 6 0.923
#> 10 genotype Australia 7 0.471
#> # ℹ 80 more rows
#>
#> [[10]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 2 0.689
#> 8 genotype Australia 3 0.665
#> 9 genotype Australia 4 0.940
#> 10 genotype Australia 6 0.923
#> # ℹ 80 more rows
n_pheno(train_data_mn_star)
#> [1] 5 5 4 4 5 4 4 5 4 5
# this should just be N*
predict_data_nstar <- extract_predict(train_predict)
predict_data_nstar
#> [[1]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[2]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[3]]
#> # A tibble: 1 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 5 0.804
#>
#> [[4]]
#> # A tibble: 1 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 2 0.238
#>
#> [[5]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[6]]
#> # A tibble: 1 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#>
#> [[7]]
#> # A tibble: 1 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 4 0.105
#>
#> [[8]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[9]]
#> # A tibble: 1 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 3 0.587
#>
#> [[10]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
n_pheno(predict_data_nstar)
#> [1] 0 0 1 1 0 1 1 0 1 0
# maybe if we try using strata? --- slightly better?
# Same 10-fold split as before, but asking for stratification on `type`.
# NOTE(review): rsample appears to pool strata that make up less than `pool`
# (default 0.1) of the data; phenotype is only 5% here, so this split may end
# up effectively unstratified — TODO confirm against the vfold_cv() docs.
train_predict_strata <- ir_data_mn_star |>
  vfold_cv(v = 10, strata = type)
# this should be N* + M*
train_data_mn_star_strata <- extract_training(train_predict_strata)
train_data_mn_star_strata
#> [[1]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 3 0.665
#> 8 genotype Australia 4 0.940
#> 9 genotype Australia 5 0.969
#> 10 genotype Australia 6 0.923
#> # ℹ 80 more rows
#>
#> [[2]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 2 0.689
#> 8 genotype Australia 3 0.665
#> 9 genotype Australia 4 0.940
#> 10 genotype Australia 5 0.969
#> # ℹ 80 more rows
#>
#> [[3]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 5 0.804
#> 4 genotype Australia 1 0.0681
#> 5 genotype Australia 2 0.689
#> 6 genotype Australia 3 0.665
#> 7 genotype Australia 4 0.940
#> 8 genotype Australia 5 0.969
#> 9 genotype Australia 6 0.923
#> 10 genotype Australia 7 0.471
#> # ℹ 80 more rows
#>
#> [[4]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 2 0.689
#> 8 genotype Australia 3 0.665
#> 9 genotype Australia 4 0.940
#> 10 genotype Australia 5 0.969
#> # ℹ 80 more rows
#>
#> [[5]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 2 0.689
#> 8 genotype Australia 3 0.665
#> 9 genotype Australia 4 0.940
#> 10 genotype Australia 5 0.969
#> # ℹ 80 more rows
#>
#> [[6]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 2 0.689
#> 8 genotype Australia 3 0.665
#> 9 genotype Australia 4 0.940
#> 10 genotype Australia 5 0.969
#> # ℹ 80 more rows
#>
#> [[7]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 2 0.689
#> 7 genotype Australia 3 0.665
#> 8 genotype Australia 5 0.969
#> 9 genotype Australia 6 0.923
#> 10 genotype Australia 8 0.775
#> # ℹ 80 more rows
#>
#> [[8]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 2 0.238
#> 3 phenotype Australia 3 0.587
#> 4 phenotype Australia 4 0.105
#> 5 phenotype Australia 5 0.804
#> 6 genotype Australia 1 0.0681
#> 7 genotype Australia 2 0.689
#> 8 genotype Australia 4 0.940
#> 9 genotype Australia 5 0.969
#> 10 genotype Australia 6 0.923
#> # ℹ 80 more rows
#>
#> [[9]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 2 0.238
#> 2 phenotype Australia 3 0.587
#> 3 phenotype Australia 4 0.105
#> 4 genotype Australia 1 0.0681
#> 5 genotype Australia 2 0.689
#> 6 genotype Australia 3 0.665
#> 7 genotype Australia 4 0.940
#> 8 genotype Australia 6 0.923
#> 9 genotype Australia 7 0.471
#> 10 genotype Australia 8 0.775
#> # ℹ 80 more rows
#>
#> [[10]]
#> # A tibble: 90 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 3 0.587
#> 3 phenotype Australia 4 0.105
#> 4 phenotype Australia 5 0.804
#> 5 genotype Australia 1 0.0681
#> 6 genotype Australia 2 0.689
#> 7 genotype Australia 3 0.665
#> 8 genotype Australia 4 0.940
#> 9 genotype Australia 5 0.969
#> 10 genotype Australia 6 0.923
#> # ℹ 80 more rows
n_pheno(train_data_mn_star_strata)
#> [1] 5 5 3 5 5 5 5 5 3 4
# this should just be N*
predict_data_nstar_strata <- extract_predict(train_predict_strata)
predict_data_nstar_strata
#> [[1]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[2]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[3]]
#> # A tibble: 2 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 3 0.587
#> 2 phenotype Australia 4 0.105
#>
#> [[4]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[5]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[6]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[7]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[8]]
#> # A tibble: 0 × 4
#> # ℹ 4 variables: type <chr>, country <chr>, month <int>,
#> # percent_mortality <dbl>
#>
#> [[9]]
#> # A tibble: 2 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.328
#> 2 phenotype Australia 5 0.804
#>
#> [[10]]
#> # A tibble: 1 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 2 0.238
n_pheno(predict_data_nstar_strata)
#> [1] 0 0 2 0 0 0 0 0 2 1
## OK but what if we increase the data size?
## 50 rows here, ten times the 5 rows used for `dat_ph`.
dat_ph_larger <- create_data(50)
dat_ph_larger
#> # A tibble: 50 × 3
#> country month percent_mortality
#> <chr> <int> <dbl>
#> 1 Australia 1 0.893
#> 2 Australia 2 0.962
#> 3 Australia 3 0.158
#> 4 Australia 4 0.380
#> 5 Australia 5 0.778
#> 6 Australia 6 0.805
#> 7 Australia 7 0.0621
#> 8 Australia 8 0.172
#> 9 Australia 9 0.540
#> 10 Australia 10 0.412
#> # ℹ 40 more rows
dat_geno_larger <- create_data(950)
dat_geno_larger
#> # A tibble: 950 × 3
#> country month percent_mortality
#> <chr> <int> <dbl>
#> 1 Australia 1 0.227
#> 2 Australia 2 0.582
#> 3 Australia 3 0.108
#> 4 Australia 4 0.0219
#> 5 Australia 5 1.00
#> 6 Australia 6 0.645
#> 7 Australia 7 0.0702
#> 8 Australia 8 0.533
#> 9 Australia 9 0.00756
#> 10 Australia 10 0.0439
#> # ℹ 940 more rows
# Stack the larger tables into one; `.id` adds a `type` column holding the
# name ("phenotype" or "genotype") of the table each row came from.
ir_data_mn_star_larger <- list(
  phenotype = dat_ph_larger,
  genotype = dat_geno_larger
) |>
  bind_rows(.id = "type")
ir_data_mn_star_larger
#> # A tibble: 1,000 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 2 0.962
#> 3 phenotype Australia 3 0.158
#> 4 phenotype Australia 4 0.380
#> 5 phenotype Australia 5 0.778
#> 6 phenotype Australia 6 0.805
#> 7 phenotype Australia 7 0.0621
#> 8 phenotype Australia 8 0.172
#> 9 phenotype Australia 9 0.540
#> 10 phenotype Australia 10 0.412
#> # ℹ 990 more rows
# 10-fold cross-validation over the larger combined table; no `strata` given.
train_predict_larger <- ir_data_mn_star_larger |>
  vfold_cv(v = 10)
train_predict_larger
#> # 10-fold cross-validation
#> # A tibble: 10 × 2
#> splits id
#> <list> <chr>
#> 1 <split [900/100]> Fold01
#> 2 <split [900/100]> Fold02
#> 3 <split [900/100]> Fold03
#> 4 <split [900/100]> Fold04
#> 5 <split [900/100]> Fold05
#> 6 <split [900/100]> Fold06
#> 7 <split [900/100]> Fold07
#> 8 <split [900/100]> Fold08
#> 9 <split [900/100]> Fold09
#> 10 <split [900/100]> Fold10
# this should be N* + M*
train_data_mn_star_larger <- extract_training(train_predict_larger)
train_data_mn_star_larger
#> [[1]]
#> # A tibble: 900 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 2 0.962
#> 3 phenotype Australia 3 0.158
#> 4 phenotype Australia 4 0.380
#> 5 phenotype Australia 5 0.778
#> 6 phenotype Australia 6 0.805
#> 7 phenotype Australia 7 0.0621
#> 8 phenotype Australia 8 0.172
#> 9 phenotype Australia 10 0.412
#> 10 phenotype Australia 11 0.188
#> # ℹ 890 more rows
#>
#> [[2]]
#> # A tibble: 900 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 2 0.962
#> 3 phenotype Australia 3 0.158
#> 4 phenotype Australia 4 0.380
#> 5 phenotype Australia 5 0.778
#> 6 phenotype Australia 6 0.805
#> 7 phenotype Australia 7 0.0621
#> 8 phenotype Australia 8 0.172
#> 9 phenotype Australia 9 0.540
#> 10 phenotype Australia 10 0.412
#> # ℹ 890 more rows
#>
#> [[3]]
#> # A tibble: 900 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 2 0.962
#> 3 phenotype Australia 3 0.158
#> 4 phenotype Australia 4 0.380
#> 5 phenotype Australia 5 0.778
#> 6 phenotype Australia 6 0.805
#> 7 phenotype Australia 7 0.0621
#> 8 phenotype Australia 8 0.172
#> 9 phenotype Australia 9 0.540
#> 10 phenotype Australia 10 0.412
#> # ℹ 890 more rows
#>
#> [[4]]
#> # A tibble: 900 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 2 0.962
#> 3 phenotype Australia 3 0.158
#> 4 phenotype Australia 4 0.380
#> 5 phenotype Australia 5 0.778
#> 6 phenotype Australia 7 0.0621
#> 7 phenotype Australia 8 0.172
#> 8 phenotype Australia 9 0.540
#> 9 phenotype Australia 10 0.412
#> 10 phenotype Australia 11 0.188
#> # ℹ 890 more rows
#>
#> [[5]]
#> # A tibble: 900 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 3 0.158
#> 3 phenotype Australia 5 0.778
#> 4 phenotype Australia 6 0.805
#> 5 phenotype Australia 8 0.172
#> 6 phenotype Australia 9 0.540
#> 7 phenotype Australia 11 0.188
#> 8 phenotype Australia 12 0.796
#> 9 phenotype Australia 13 0.124
#> 10 phenotype Australia 14 0.705
#> # ℹ 890 more rows
#>
#> [[6]]
#> # A tibble: 900 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 2 0.962
#> 3 phenotype Australia 3 0.158
#> 4 phenotype Australia 4 0.380
#> 5 phenotype Australia 6 0.805
#> 6 phenotype Australia 7 0.0621
#> 7 phenotype Australia 8 0.172
#> 8 phenotype Australia 9 0.540
#> 9 phenotype Australia 10 0.412
#> 10 phenotype Australia 11 0.188
#> # ℹ 890 more rows
#>
#> [[7]]
#> # A tibble: 900 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 2 0.962
#> 3 phenotype Australia 3 0.158
#> 4 phenotype Australia 4 0.380
#> 5 phenotype Australia 5 0.778
#> 6 phenotype Australia 6 0.805
#> 7 phenotype Australia 7 0.0621
#> 8 phenotype Australia 8 0.172
#> 9 phenotype Australia 9 0.540
#> 10 phenotype Australia 10 0.412
#> # ℹ 890 more rows
#>
#> [[8]]
#> # A tibble: 900 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 2 0.962
#> 3 phenotype Australia 3 0.158
#> 4 phenotype Australia 4 0.380
#> 5 phenotype Australia 5 0.778
#> 6 phenotype Australia 6 0.805
#> 7 phenotype Australia 7 0.0621
#> 8 phenotype Australia 8 0.172
#> 9 phenotype Australia 9 0.540
#> 10 phenotype Australia 10 0.412
#> # ℹ 890 more rows
#>
#> [[9]]
#> # A tibble: 900 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 2 0.962
#> 2 phenotype Australia 3 0.158
#> 3 phenotype Australia 4 0.380
#> 4 phenotype Australia 5 0.778
#> 5 phenotype Australia 6 0.805
#> 6 phenotype Australia 7 0.0621
#> 7 phenotype Australia 9 0.540
#> 8 phenotype Australia 10 0.412
#> 9 phenotype Australia 11 0.188
#> 10 phenotype Australia 12 0.796
#> # ℹ 890 more rows
#>
#> [[10]]
#> # A tibble: 900 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 2 0.962
#> 3 phenotype Australia 4 0.380
#> 4 phenotype Australia 5 0.778
#> 5 phenotype Australia 6 0.805
#> 6 phenotype Australia 7 0.0621
#> 7 phenotype Australia 8 0.172
#> 8 phenotype Australia 9 0.540
#> 9 phenotype Australia 10 0.412
#> 10 phenotype Australia 11 0.188
#> # ℹ 890 more rows
n_pheno(train_data_mn_star_larger)
#> [1] 43 45 48 45 45 44 45 46 44 45
# this should just be N*
predict_data_nstar_larger <- extract_predict(train_predict_larger)
predict_data_nstar_larger
#> [[1]]
#> # A tibble: 7 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 9 0.540
#> 2 phenotype Australia 18 0.882
#> 3 phenotype Australia 22 0.768
#> 4 phenotype Australia 30 0.205
#> 5 phenotype Australia 34 0.545
#> 6 phenotype Australia 36 0.120
#> 7 phenotype Australia 37 0.668
#>
#> [[2]]
#> # A tibble: 5 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 14 0.705
#> 2 phenotype Australia 20 0.771
#> 3 phenotype Australia 29 0.0224
#> 4 phenotype Australia 39 0.347
#> 5 phenotype Australia 45 0.368
#>
#> [[3]]
#> # A tibble: 2 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 11 0.188
#> 2 phenotype Australia 15 0.435
#>
#> [[4]]
#> # A tibble: 5 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 6 0.805
#> 2 phenotype Australia 21 0.356
#> 3 phenotype Australia 26 0.387
#> 4 phenotype Australia 33 0.936
#> 5 phenotype Australia 43 0.623
#>
#> [[5]]
#> # A tibble: 5 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 2 0.962
#> 2 phenotype Australia 4 0.380
#> 3 phenotype Australia 7 0.0621
#> 4 phenotype Australia 10 0.412
#> 5 phenotype Australia 19 0.122
#>
#> [[6]]
#> # A tibble: 6 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 5 0.778
#> 2 phenotype Australia 25 0.892
#> 3 phenotype Australia 35 0.612
#> 4 phenotype Australia 38 0.0343
#> 5 phenotype Australia 40 0.0784
#> 6 phenotype Australia 49 0.226
#>
#> [[7]]
#> # A tibble: 5 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 13 0.124
#> 2 phenotype Australia 17 0.960
#> 3 phenotype Australia 31 0.788
#> 4 phenotype Australia 44 0.288
#> 5 phenotype Australia 46 0.216
#>
#> [[8]]
#> # A tibble: 4 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 16 0.179
#> 2 phenotype Australia 41 0.270
#> 3 phenotype Australia 42 0.960
#> 4 phenotype Australia 48 0.164
#>
#> [[9]]
#> # A tibble: 6 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 1 0.893
#> 2 phenotype Australia 8 0.172
#> 3 phenotype Australia 23 0.316
#> 4 phenotype Australia 24 0.372
#> 5 phenotype Australia 47 0.851
#> 6 phenotype Australia 50 0.711
#>
#> [[10]]
#> # A tibble: 5 × 4
#> type country month percent_mortality
#> <chr> <chr> <int> <dbl>
#> 1 phenotype Australia 3 0.158
#> 2 phenotype Australia 12 0.796
#> 3 phenotype Australia 27 0.254
#> 4 phenotype Australia 28 0.754
#> 5 phenotype Australia 32 0.350
n_pheno(predict_data_nstar_larger)
#> [1] 7 5 2 5 5 6 5 4 6 5
Created on 2024-02-29 with reprex v2.1.0
This should probably be handled by a checking function that runs before the inner loop step.
Currently, if only about 5% of the data is phenotypic, then once the data is split into training/test sets some test folds contain no phenotypic rows at all (see the zeros in the `n_pheno()` output above), so you run out of rows to subset down to.