caravagnalab / rRACES

R wrapper for the RACES package
GNU General Public License v3.0
2 stars 1 forks source link

Re-building from scratch the mutation engine is required to observe CN events #87

Closed jovoni closed 9 months ago

jovoni commented 9 months ago

I was looking at CN events and I encountered an issue. The setting is quite simple, I have a population that have a 2:1 event in a portion of the 22nd chromosome and therefore I expect the clonal mutations to be distributed around .33 and .66 of the VAF range. The problem is that if I do not rebuild the mutation engine from scratch (i.e. if I do not rebuild the Test folder) the VAF spectrum is wrong.

I report here two VAF spectra examples. The first one is perfect while the second one is wrong.

VAF_spectra_new_Test vaf_spectra_not_rebuilt

Here is the code to reproduce the error. To observe the bad behaviour is necessary to run the code at least once (in order to build the Test folder) and then re run it with the CLEAN_FOLDER flag set to False (at the top of the file)

library(rRACES)
library(dplyr)
library(ggplot2)
rm(list = ls())

set.seed(12345)
CLEAN_FOLDER <- FALSE

sim <- new(Simulation, "Border Growth")

# Set the "border" growth model
sim$duplicate_internal_cells <- FALSE

# Set the death activation level to avoid drift
sim$death_activation_level <- 50

# Add mutants
sim$add_mutant(name = "A", growth_rates = 0.1, death_rates = 0.01)
sim$place_cell("A", 500, 500)
sim$run_up_to_size("A", 4000)
sim$get_clock()

# Add "B" mutant
sim$add_mutant(name="B", growth_rates = 0.1, death_rates=.01)
sim$mutate_progeny(sim$choose_cell_in("A"), "B")
sim$run_up_to_size("B", 4000)

plot_tissue(sim, num_of_bins = 500)

# Sample cells ####
n_w <- n_h <- 20
ncells <- 0.8*n_w*n_h

bbox <- sim$search_sample(c("A" = ncells), n_w, n_h)
sim$sample_cells("S_A", bbox$lower_corner, bbox$upper_corner)

bbox <- sim$search_sample(c("B" = ncells), n_w, n_h)
sim$sample_cells("S_B", bbox$lower_corner, bbox$upper_corner)

plot_tissue(sim, num_of_bins = 500)

forest <- sim$get_samples_forest()

# Get genomics urls ####
reference_url <- paste0("https://ftp.ensembl.org/pub/grch37/current/",
                        "fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.",
                        "dna.chromosome.22.fa.gz")

SBS_url <- paste0("https://cancer.sanger.ac.uk/signatures/documents/2123/",
                  "COSMIC_v3.4_SBS_GRCh37.txt")

drivers_url <- paste0("https://raw.githubusercontent.com/",
                      "caravagnalab/rRACES/main/inst/extdata/",
                      "driver_mutations_hg19.csv")

passenger_CNAs_url <- paste0("https://raw.githubusercontent.com/",
                             "caravagnalab/rRACES/main/inst/extdata/",
                             "passenger_CNAs_hg19.csv")

germline_url <- paste0("https://www.dropbox.com/scl/fi/g9oloxkip18tr1r",
                       "m6wjve/germline_data_demo.tar.gz?rlkey=15jshul",
                       "d3bqgyfcs7fa0bzqeo&dl=1")

# build a mutation engine and place all the files in the directory "Test" ####
if (CLEAN_FOLDER) {
  unlink("Test/", recursive = TRUE)
}

m_engine <- build_mutation_engine(directory = "Test",
                                  reference_src = reference_url,
                                  SBS_src = SBS_url,
                                  drivers_src = drivers_url,
                                  passenger_CNAs_src = passenger_CNAs_url,
                                  germline_src = germline_url)

# Define Mutants ####
# B, son of A, will have a copy number event
# We remove passenger CNA
m_engine$add_mutant(
  mutant_name = "A",
  passenger_rates = c(SNV = 1e-7, CNA = 0),
  driver_SNVs = c(SNV("22", 10510210, "C"))
  #driver_CNAs = c(CNA(type = "A", "22", pos_in_chr = 10303470,len = 1e7))
)

m_engine$add_mutant("B",
                    passenger_rates = c(SNV=1e-7, CNA=1e-9),
                    driver_SNVs = c(SNV("22", 10510210, "C")),
                    driver_CNAs = c(CNA(type = "A", "22", pos_in_chr = 20303470,len = 1e7))
)

m_engine$add_exposure(coefficients = c(SBS13 = 0.4, SBS1 = 0.6))
phylo_forest <- m_engine$place_mutations(forest, 0)

# Simulate sequencing ####
seq_results <- simulate_seq(phylo_forest, coverage = 80)

seq_to_long <- function(seq_results) {
  # get names of samples
  sample_names <- strsplit(colnames(seq_results)[grepl("VAF", colnames(seq_results), fixed = TRUE)], ".VAF") %>% unlist()

  sn <- sample_names[1]
  seq_df <- lapply(sample_names, function(sn) {
    cc <- c("chromosome", "chr_pos", "ref", "alt", colnames(seq_results)[grepl(sn, colnames(seq_results), fixed = T)])

    seq_results[, cc] %>%
      `colnames<-`(c("chromosome", "chr_pos", "ref", "alt", "occurences", "coverage", "VAF")) %>%
      dplyr::mutate(sample_name = sn)
  }) %>% do.call("bind_rows", .)

  seq_df %>%
    dplyr::rename(chr = chromosome, from = chr_pos, DP = coverage, NV = occurences, ALT = alt) %>%
    dplyr::mutate(to = from) %>%
    dplyr::select(chr, from, to, ALT, NV, DP, VAF, sample_name)
}

extract_cna <- function(phylo_forest) {
  phylo_forest$get_sampled_cell_CNAs() %>%
    dplyr::distinct(type, chromosome, begin, end, allele, src.allele) %>%
    dplyr::rename(chr=chromosome, from=begin, to=end) %>%
    dplyr::mutate(Major = ifelse(type=="A", allele, 1), minor=ifelse(type=="A", 1, allele)) %>%
    dplyr::select(chr, from, to, Major, minor, type)
}

seq_df <- seq_to_long(seq_results %>% dplyr::filter(classes != "germinal"))

cnas <- extract_cna(phylo_forest)

cn <- cnas[1,]

if (CLEAN_FOLDER) {
  t <- "New Test folder from scratch"
} else {
  t <- "Test folder not been rebuilt"
}

seq_df %>%
  dplyr::filter(sample_name == "S_B") %>%
  dplyr::filter(from >= cn$from, to<=cn$to) %>%
  dplyr::filter(VAF >= .1) %>%
  ggplot(mapping = aes(x=VAF)) +
  geom_histogram(binwidth = .01) +
  xlim(c(0,1)) +
  facet_wrap(~ sample_name) +
  theme_bw() +
  geom_vline(xintercept = c(.33, .66), color="indianred") +
  ggtitle(t, "Two peaks are expected in the B population")

# sessionInfo()
# 
# R version 4.3.2 (2023-10-31)
# Platform: aarch64-apple-darwin20 (64-bit)
# Running under: macOS Sonoma 14.2.1
# 
# Matrix products: default
# BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
# LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
# 
# locale:
#   [1] en_US.UTF-8
# 
# time zone: Europe/Rome
# tzcode source: internal
# 
# attached base packages:
#   [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
#   [1] patchwork_1.1.3 lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1   purrr_1.0.2     readr_2.1.5    
# [7] tidyr_1.3.1     tibble_3.2.1    tidyverse_2.0.0 ggplot2_3.5.0   dplyr_1.1.4     rRACES_0.4.2   
# [13] devil_0.1.0    
# 
# loaded via a namespace (and not attached):
#   [1] RColorBrewer_1.1-3        tensorA_0.36.2.1          sys_3.4.2                 RcppArmadillo_0.12.8.1.0 
# [5] rstudioapi_0.15.0         jsonlite_1.8.8            magrittr_2.0.3            farver_2.1.1             
# [9] rmarkdown_2.25            fs_1.6.3                  zlibbioc_1.48.0           vctrs_0.6.5              
# [13] memoise_2.0.1             DelayedMatrixStats_1.24.0 askpass_1.2.0             gh_1.4.0                 
# [17] htmltools_0.5.7           S4Arrays_1.2.0            usethis_2.2.2             distributional_0.3.2     
# [21] curl_5.2.0                SparseArray_1.2.3         htmlwidgets_1.6.4         desc_1.4.3               
# [25] testthat_3.2.1            httr2_1.0.0               cachem_1.0.8              commonmark_1.9.0         
# [29] whisker_0.4.1             igraph_2.0.2              mime_0.12                 lifecycle_1.0.4          
# [33] pkgconfig_2.0.3           Matrix_1.6-4              R6_2.5.1                  fastmap_1.1.1            
# [37] rcmdcheck_1.4.0           MatrixGenerics_1.14.0     shiny_1.8.0               digest_0.6.34            
# [41] colorspace_2.1-0          S4Vectors_0.40.2          ps_1.7.6                  rprojroot_2.0.4          
# [45] pkgload_1.3.4             labeling_0.4.3            timechange_0.2.0          fansi_1.0.6              
# [49] polyclip_1.10-6           abind_1.4-5               compiler_4.3.2            remotes_2.4.2.1          
# [53] withr_3.0.0               backports_1.4.1           viridis_0.6.5             hexbin_1.28.3            
# [57] pkgbuild_1.4.3            R.utils_2.12.3            ggforce_0.4.2             MASS_7.3-60              
# [61] openssl_2.1.1             rappdirs_0.3.3            DelayedArray_0.28.0       sessioninfo_1.2.2        
# [65] tools_4.3.2               ape_5.7-1                 httpuv_1.6.13             clipr_0.8.0              
# [69] R.oo_1.26.0               glue_1.7.0                callr_3.7.3               nlme_3.1-164             
# [73] promises_1.2.1            cmdstanr_0.7.1            grid_4.3.2                checkmate_2.3.1          
# [77] generics_0.1.3            gtable_0.3.4              tzdb_0.4.0                R.methodsS3_1.8.2        
# [81] data.table_1.14.10        hms_1.1.3                 tidygraph_1.3.1           xml2_1.3.6               
# [85] utf8_1.2.4                XVector_0.42.0            BiocGenerics_0.48.1       ggrepel_0.9.5            
# [89] pillar_1.9.0              posterior_1.5.0           later_1.3.2               ggmuller_0.5.6           
# [93] tweenr_2.0.3              lattice_0.22-5            tidyselect_1.2.0          miniUI_0.1.1.1           
# [97] knitr_1.45                gridExtra_2.3             gitcreds_0.1.2            IRanges_2.36.0           
# [101] stats4_4.3.2              xfun_0.42                 graphlayouts_1.1.0        devtools_2.4.5           
# [105] credentials_2.0.1         brio_1.1.4                matrixStats_1.2.0         stringi_1.8.3            
# [109] xopen_1.0.0               yaml_2.3.8                evaluate_0.23             codetools_0.2-19         
# [113] RcppEigen_0.3.4.0.0       ggraph_2.2.0              cli_3.6.2                 xtable_1.8-4             
# [117] munsell_0.5.0             processx_3.8.3            roxygen2_7.2.3            Rcpp_1.0.12              
# [121] gert_2.0.1                parallel_4.3.2            ellipsis_0.3.2            pkgdown_2.0.7            
# [125] prettyunits_1.2.0         profvis_0.3.8             urlchecker_1.0.1          sparseMatrixStats_1.14.0 
# [129] viridisLite_0.4.2         scales_1.3.0              crayon_1.5.2              rlang_1.1.3