Closed GuyReeves closed 2 years ago
This should be possible, see the below test example, where we first have bam files in folderA and folderB, then make a new folderC with the combined desired input
library("STITCH")
n_snps <- 10
reads_span_n_snps <- 6
chr <- 1
n_reads <- 5 / (reads_span_n_snps / n_snps) ## want about 5X / sample
extension <- c("bgvcf" = ".vcf.gz", "bgen" = ".bgen")
phasemaster <- matrix(c(rep(0, n_snps), rep(1, n_snps)), ncol = 2)
phasemaster[2, ] <- c(1, 0)
phasemaster[3, ] <- c(0, 0)
phasemaster[7, ] <- c(1, 0)
data_package <- make_acceptance_test_data_package(
n_samples = 10,
n_snps = n_snps,
n_reads = n_reads,
seed = 77,
chr = "chr5",
K = 2,
reads_span_n_snps = reads_span_n_snps,
phasemaster = phasemaster
)
outputdir <- tempdir()
bamfile1 <- tempfile()
bamfile2 <- tempfile()
system(paste0("head -n9 ", data_package$bamlist, " > ", bamfile1))
system(paste0("tail -n1 ", data_package$bamlist, " > ", bamfile2))
STITCH(
chr = data_package$chr,
bamlist = bamfile1,
posfile = data_package$posfile,
outputdir = file.path(outputdir, "folderA"),
generateInputOnly = TRUE,
K = 2,
S = 2,
nGen = 100,
nCores = 1
)
STITCH(
chr = data_package$chr,
bamlist = bamfile2,
posfile = data_package$posfile,
outputdir = file.path(outputdir, "folderB"),
generateInputOnly = TRUE,
K = 2,
S = 2,
nGen = 100,
nCores = 1
)
## set up input files
dir.create(file.path(outputdir, "folderC", "input"), recursive = TRUE)
dir.create(file.path(outputdir, "folderC", "RData"), recursive = TRUE)
## copy bams
system(paste0(
"cp ",
file.path(outputdir, "folderA", "input"), "/* ",
file.path(outputdir, "folderC", "input"), "/"
))
system(paste0(
"cp ",
file.path(outputdir, "folderB", "input"), "/sample.1.input.chr5.RData", " ",
file.path(outputdir, "folderC", "input"), "/sample.10.input.chr5.RData"
))
## sample names fixing
load(file.path(outputdir, "folderB", "RData", "sampleNames.chr5.RData"))
extra_sampleNames <- sampleNames
load(file.path(outputdir, "folderA", "RData", "sampleNames.chr5.RData"))
sampleNames <- c(sampleNames, extra_sampleNames)
save(sampleNames, file = file.path(outputdir, "folderC", "RData", "sampleNames.chr5.RData"))
STITCH(
chr = data_package$chr,
posfile = data_package$posfile,
genfile = data_package$genfile,
outputdir = file.path(outputdir, "folderC"),
regenerateInput = FALSE,
originalRegionName = data_package$chr,
regenerateInputWithDefaultValues = TRUE,
K = 2,
S = 2,
nGen = 100,
nCores = 1
)
Thanks that is very very useful.
HI
If I have used "--generateInputOnly" to generate input for a large number of .BAM files (n=8000), if I wanted to add a single BAM file later that I forgot. Is there an easy way to be able to use "--generateInputOnly" on the single file and then some how manually merge it into the existing big input folders.If it is possible
If it is very complicated, forget it I will just wait the 24hours to regenerate the +1 dataset.
Thanks
Guy
You appeared to mention something like this might be possible, but I could not see how https://github.com/rwdavies/STITCH/issues/42 --generateInputOnly, to just process the BAM and quit, over each BAM?