bodkan / slendr

Population genetic simulations in R 🌍
https://bodkan.net/slendr
Other
54 stars 5 forks source link

Fix two reported sampling issues #70

Closed bodkan closed 2 years ago

bodkan commented 2 years ago

This fixes two issues reported in #69:

Here is a small test case provided by @moicoll which first reported these issues. The R package can now correctly deal with those problems.

These issues are very complex, and even more so because slendr allows flexible time specification in forward and in backward direction which complicates the sanity checks quite a lot. However, I'll close the issue for now and wait for other bugs to pop up.

# fetch the latest slendr version with the bugfixes implemented
devtools::install_github("bodkan/slendr")

library(slendr)

set.seed(1234)

# this links to my Python setup -- you might do something else
reticulate::use_condaenv("retipy", required = TRUE)
reticulate::py_config()

map <- world(xrange = c(-15, 60), yrange = c(20, 65),crs = "EPSG:3035")

africa   <- region("Africa",   map, polygon = list(c(-18, 20), c(40, 20), c(30, 33),c(20, 32), c(10, 35), c(-8, 35)))
europe   <- region("Europe",   map, polygon = list(c(-8, 35), c(-5, 36), c(10, 38), c(20, 35), c(25, 35),c(33, 45), c(20, 58), c(-5, 60), c(-15, 50)))
anatolia <- region("Anatolia", map, polygon = list(c(28, 35), c(40, 35), c(42, 40), c(30, 43), c(27, 40), c(25, 38)))

afr <- population("AFR", parent = "ancestor", time = 52000, N = 3000, map = map, polygon = africa)
ooa <- population("OOA", parent = afr, time = 51000, N = 500, remove = 25000, center = c(33, 30), radius = 400e3) %>%
  move(trajectory = list(c(40, 30), c(50, 30), c(60, 40)), start = 50000, end = 40000, snapshots = 20)
ehg <- population("EHG", parent = ooa, time = 28000, N = 1000, remove = 6000, polygon = list(c(26, 55), c(38, 53), c(48, 53), c(60, 53), c(60, 60), c(48, 63), c(38, 63), c(26, 60)))
eur <- population( name = "EUR", parent = ehg, time = 25000, N = 2000, polygon = europe)
ana <- population( name = "ANA", time = 28000, N = 3000, parent = ooa, remove = 4000, center = c(34, 38), radius = 500e3, polygon = anatolia) %>%
  expand(by = 2500e3, start = 10000, end = 7000, polygon = join(europe, anatolia), snapshots = 20)
yam <- population( name = "YAM", time = 7000, N = 500, parent = ehg, remove = 2500, polygon = list(c(26, 50), c(38, 49), c(48, 50), c(48, 56), c(38, 59), c(26, 56))) %>%
  move(trajectory = list(c(15, 50)), start = 5000, end = 3000, snapshots = 10)

gf <- list(
  geneflow(from = ana, to = yam, rate = 0.5,  start = 6500, end = 6400, overlap = FALSE),
  geneflow(from = ana, to = eur, rate = 0.5,  start = 8000, end = 6000),
  geneflow(from = yam, to = eur, rate = 0.75, start = 4000, end = 3000))

model <- compile(
  populations      = list(afr, ooa, ehg, eur, ana, yam),
  geneflow         = gf,
  generation_time  = 30,
  resolution       = 10e3,
  competition_dist = 130e3, mate_dist = 100e3,
  dispersal_dist   = 70e3,
  dir              = "~/Desktop/moi-model", overwrite = TRUE)

samples1 <- slendr::sampling(
  model, times = runif(n = 1000, min = 2500, max = 52000),
  list(afr, 1), list(ooa, 1), list(ehg, 1), list(eur, 1), list(ana, 1)
)

print(paste("Nº of samples : ", sum(samples1$n), sep = ""))
samples1 %>% summary() %>% print()

# this no longer crashes because the sampling events inconsistent with
# population split times are now handled by `sampling()` internally
slendr::slim(model,
             sequence_length    = 1,
             recombination_rate = 0,
             sampling           = samples1,
             verbose            = TRUE,
             save_locations     = TRUE,
             method             = "batch",
             seed               = 1234,
             save_sampling      = TRUE)

# we no longer have the issue by some individuals gone missing, breaking
# the loading of tables of individuals
ts <- ts_load(model, recapitate = TRUE, simplify = TRUE,
              Ne = 10000, recombination_rate = 0)

# verify we get the same number of samples from the tree sequence
ts_samples(ts) %>% nrow()