Wedge-lab / battenberg

Battenberg R package for subclonal copynumber estimation
GNU Affero General Public License v3.0
78 stars 47 forks source link

Query about Battenberg Runtime #114

Closed lincj1994 closed 6 months ago

lincj1994 commented 11 months ago

Hi. @davidwedge @jcesar101 @sdentro

I have recently tested Battenberg on a paired sample, and it has taken approximately 14 hours. The log indicated that it was stuck at "Multi pos start: [1] "minCount=4"" for the entire duration. I would like to inquire about the expected runtime for Battenberg to better understand if this duration is within the expected range.

Could it be attributed to the configuration settings, specifically

--cpu beaglemaxmem beaglenthreads beaglewindow beagleoverlap

Thank you for your assistance in clarifying this matter.

Lin.

(batternberg) root@ubuntu:/usr/local# R CMD BATCH '--no-restore-data --no-save --args -t CBCGA406684.T -n CBCGA406684.N --nb /mnt/battenberg/CBCGA406684.N.sorted.deduped.bam --tb /mnt/battenberg/CBCGA406684.T.sorted.deduped.bam --sex female -o /mnt/battenberg/' /mnt/battenberg/battenberg_wgs.R /mnt/battenberg/battenberg_COME.Rout

.Rout

> library(Battenberg)
> library(optparse)
> 
> option_list = list(
+   make_option(c("-t", "--tumourname"), type="character", default=NULL, help="Samplename of the tumour", metavar="character"),
+   make_option(c("-n", "--normalname"), type="character", default=NULL, help="Samplename of the normal", metavar="character"),
+   make_option(c("--tb"), type="character", default=NULL, help="Tumour BAM file", metavar="character"),
+   make_option(c("--nb"), type="character", default=NULL, help="Normal BAM file", metavar="character"),
+   make_option(c("--sex"), type="character", default=NULL, help="Sex of the sample", metavar="character"),
+   make_option(c("-o", "--output"), type="character", default=NULL, help="Directory where output will be written", metavar="character"),
+   make_option(c("--skip_allelecount"), type="logical", default=FALSE, action="store_true", help="Provide when alleles don't have to be counted. This expects allelecount files on disk", metavar="character"),
+   make_option(c("--skip_preprocessing"), type="logical", default=FALSE, action="store_true", help="Provide when pre-processing has previously completed. This expects the files on disk", metavar="character"),
+   make_option(c("--skip_phasing"), type="logical", default=FALSE, action="store_true", help="Provide when phasing has previously completed. This expects the files on disk", metavar="character"),
+   make_option(c("--cpu"), type="numeric", default=8, help="The number of CPU cores to be used by the pipeline (Default: 8)", metavar="character"),
+   make_option(c("--bp"), type="character", default=NULL, help="Optional two column file (chromosome and position) specifying prior breakpoints to be used during segmentation", metavar="character")
+ )
> 
> opt_parser = OptionParser(option_list=option_list)
> opt = parse_args(opt_parser)
> 
> TUMOURNAME = opt$tumourname
> NORMALNAME = opt$normalname
> NORMALBAM = opt$nb
> TUMOURBAM = opt$tb
> IS.MALE = opt$sex=="female" | opt$sex=="Female"
> RUN_DIR = opt$output
> SKIP_ALLELECOUNTING = opt$skip_allelecount
> SKIP_PREPROCESSING = opt$skip_preprocessing
> SKIP_PHASING = opt$skip_phasing
> NTHREADS = opt$cpu
> PRIOR_BREAKPOINTS_FILE = opt$bp
> 
> analysis = "paired"
> 
> ###############################################################################
> # 2018-11-01
> # A pure R Battenberg v2.2.9 WGS pipeline implementation.
> # sd11 [at] sanger.ac.uk
> ###############################################################################
> 
> JAVAJRE = "java"
> ALLELECOUNTER = "alleleCounter"
> IMPUTE_EXE = "impute2"
> 
> GENOMEBUILD = "hg38"
> USEBEAGLE = T
> 
> # General static
> if (GENOMEBUILD=="hg19") {
+   impute_basedir = "/hps/research/gerstung/sdentro/reference/human/battenberg/"
+   IMPUTEINFOFILE = file.path(impute_basedir, "battenberg_impute_v3/impute_info.txt")
+   G1000ALLELESPREFIX = file.path(impute_basedir, "battenberg_1000genomesloci2012_v3/1000genomesAlleles2012_chr")
+   G1000LOCIPREFIX = file.path(impute_basedir, "battenberg_1000genomesloci2012_v3/1000genomesloci2012_chr")
+   GCCORRECTPREFIX = file.path(impute_basedir, "battenberg_wgs_gc_correction_1000g_v3_noNA/1000_genomes_GC_corr_chr_")
+   REPLICCORRECTPREFIX = file.path(impute_basedir, "battenberg_wgs_replic_correction_1000g_v3/1000_genomes_replication_timing_chr_")
+   
+   # WGS specific static
+   #PROBLEMLOCI = "/lustre/scratch117/casm/team219/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz"
+   PROBLEMLOCI = "/hps/research/gerstung/sdentro/reference/human/battenberg/battenberg_probloci/probloci_270415.txt.gz"
+   GENOME_VERSION = "b37"
+   GENOMEBUILD = "hg19"
+   #BEAGLE_BASEDIR = "/nfs/users/nfs_s/sd11/scratch17_t219/reference/GenomeFiles/battenberg_beagle"
+   BEAGLE_BASEDIR = "/hps/research/gerstung/sdentro/reference/human/battenberg/battenberg_beagle"
+   BEAGLEJAR = file.path(BEAGLE_BASEDIR, "beagle.24Aug19.3e8.jar")
+   BEAGLEREF.template = file.path(BEAGLE_BASEDIR, GENOME_VERSION, "chrCHROMNAME.1kg.phase3.v5a.b37.bref3")
+   BEAGLEPLINK.template = file.path(BEAGLE_BASEDIR, GENOME_VERSION, "plink.chrCHROMNAME.GRCh37.map")
+ 
+   CHROM_COORD_FILE = "/homes/sdentro/repo/battenberg/gcCorrect_chromosome_coordinates_hg19.txt"
+ 
+ } else if (GENOMEBUILD=="hg38") {
+   
+   BEAGLE_BASEDIR = "/opt/battenberg_reference/"
+   GENOMEBUILD = "hg38"
+   IMPUTEINFOFILE = file.path(BEAGLE_BASEDIR, "imputation/impute_info2.txt")
+   G1000ALLELESPREFIX = file.path(BEAGLE_BASEDIR, "1000G_loci_hg38/1kg.phase3.v5a_GRCh38nounref_allele_index_chr")
+   G1000LOCIPREFIX = file.path(BEAGLE_BASEDIR, "1000G_loci_hg38/1kg.phase3.v5a_GRCh38nounref_loci_chr")
+   GCCORRECTPREFIX = file.path(BEAGLE_BASEDIR, "GC_correction_hg38/1000G_GC_chr")
+   REPLICCORRECTPREFIX = file.path(BEAGLE_BASEDIR, "RT_correction_hg38/1000G_RT_chr")
+   PROBLEMLOCI = file.path(BEAGLE_BASEDIR, "probloci/probloci.txt.gz")
+   
+   BEAGLEREF.template = file.path(BEAGLE_BASEDIR, "beagle5/chrCHROMNAME.1kg.phase3.v5a_GRCh38nounref.vcf.gz")
+   BEAGLEPLINK.template = file.path(BEAGLE_BASEDIR, "beagle5/plink.chrCHROMNAME.GRCh38.map")
+   BEAGLEJAR = file.path(BEAGLE_BASEDIR, "beagle5/beagle.08Feb22.fa4.jar")
+ 
+   CHROM_COORD_FILE = "/opt/battenberg_reference/chromosome_coordinates_hg38.txt"
+ } 
> 
> PLATFORM_GAMMA = 1
> PHASING_GAMMA = 1
> SEGMENTATION_GAMMA = 10
> SEGMENTATIIN_KMIN = 3
> PHASING_KMIN = 1
> CLONALITY_DIST_METRIC = 0
> ASCAT_DIST_METRIC = 1
> MIN_PLOIDY = 1.3
> MAX_PLOIDY = 5.5
> MIN_RHO = 0.1
> MIN_GOODNESS_OF_FIT = 0.63
> BALANCED_THRESHOLD = 0.51
> MIN_NORMAL_DEPTH = 4
> MIN_BASE_QUAL = 20
> MIN_MAP_QUAL = 35
> CALC_SEG_BAF_OPTION = 1
> 
> 
> # Change to work directory and load the chromosome information
> setwd(RUN_DIR)
> 
> battenberg(analysis=analysis,
+      tumourname=TUMOURNAME, 
+            normalname=NORMALNAME, 
+            tumour_data_file=TUMOURBAM, 
+            normal_data_file=NORMALBAM, 
+            ismale=IS.MALE, 
+            imputeinfofile=IMPUTEINFOFILE, 
+            g1000prefix=G1000LOCIPREFIX, 
+            g1000allelesprefix=G1000ALLELESPREFIX, 
+            gccorrectprefix=GCCORRECTPREFIX, 
+            repliccorrectprefix=REPLICCORRECTPREFIX, 
+            problemloci=PROBLEMLOCI, 
+            data_type="wgs",
+            impute_exe=IMPUTE_EXE,
+            allelecounter_exe=ALLELECOUNTER,
+      usebeagle=USEBEAGLE,
+      beaglejar=BEAGLEJAR,
+      beagleref=BEAGLEREF.template,
+      beagleplink=BEAGLEPLINK.template,
+      beaglemaxmem=10,
+      beaglenthreads=1,
+      beaglewindow=40,
+      beagleoverlap=4,
+      javajre=JAVAJRE,
+            nthreads=NTHREADS,
+            platform_gamma=PLATFORM_GAMMA,
+            phasing_gamma=PHASING_GAMMA,
+            segmentation_gamma=SEGMENTATION_GAMMA,
+            segmentation_kmin=SEGMENTATIIN_KMIN,
+            phasing_kmin=PHASING_KMIN,
+            clonality_dist_metric=CLONALITY_DIST_METRIC,
+            ascat_dist_metric=ASCAT_DIST_METRIC,
+            min_ploidy=MIN_PLOIDY,
+            max_ploidy=MAX_PLOIDY,
+            min_rho=MIN_RHO,
+            min_goodness=MIN_GOODNESS_OF_FIT,
+            uninformative_BAF_threshold=BALANCED_THRESHOLD,
+            min_normal_depth=MIN_NORMAL_DEPTH,
+            min_base_qual=MIN_BASE_QUAL,
+            min_map_qual=MIN_MAP_QUAL,
+            calc_seg_baf_option=CALC_SEG_BAF_OPTION,
+            skip_allele_counting=SKIP_ALLELECOUNTING,
+            skip_preprocessing=SKIP_PREPROCESSING,
+            skip_phasing=SKIP_PHASING,
+            prior_breakpoints_file=PRIOR_BREAKPOINTS_FILE,
+      GENOMEBUILD=GENOMEBUILD,
+      chrom_coord_file=CHROM_COORD_FILE)
 [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
[16] "16" "17" "18" "19" "20" "21" "22" "X" 
Reading locis
Reading locis
Reading locis
Reading locis
Reading locis
Reading locis
Reading locis
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Reading locis
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Reading locis
Reading locis
Reading locis
Done reading locis
Multi pos start:
Reading locis
Reading locis
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Reading locis
Reading locis
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Done reading locis
Multi pos start:
[1] "minCount=4"
dsampath31 commented 9 months ago

@lincj1994 Hi, any luck with this issue? my script is stuck in the same state as well for quite some time

sunnaa0423 commented 9 months ago

Me too! And all the output tab files are empty, is there someone who can help with that?

jcesar101 commented 8 months ago

Hi,

just to confirm, did the pipeline resume execution after some time or it didn't?

Could you please also specify what battenberg version, R version and parameters are you using when invoking the pipeline?

Where was the reference data, used in this executions, obtained from?

Kind regards and thank you.

dsampath31 commented 8 months ago

Hi,

The pipeline did not resume execution and was stuck in the same step for several hours until I manually killed the task.

I noticed that this happens when I’m using a large BAM (250GB) file as I was able to successfully run (150GB) sample. This would happen within an hour of running the job during the parsing step. I’m using v2.2.8 as I had that dockerized already for hg38.

On Sun, Jan 21, 2024 at 10:54 PM Julio Cesar Cortes < @.***> wrote:

Hi,

just to confirm, did the pipeline resume execution after some time or it didn't?

Could you please also specify what battenberg version, R version and parameters are you using when invoking the pipeline?

Kind regards and thank you.

— Reply to this email directly, view it on GitHub https://github.com/Wedge-lab/battenberg/issues/114#issuecomment-1903368378, or unsubscribe https://github.com/notifications/unsubscribe-auth/AJ6IZ6ENZ4FGQ4B2XB334WDYPYEJLAVCNFSM6AAAAAA6KUJ3L6VHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMYTSMBTGM3DQMZXHA . You are receiving this because you commented.Message ID: @.***>

sunnaa0423 commented 8 months ago

Hi,

I used Battenberg v2.2.10 and R v4.3.2. I tried with BAM data of around 7G and 44G, but the pipeline got stuck at the [1] "minCount=10". The waiting time exceeded 10 hours for the 7G data and over 5 days for the 44G data.

The output files include normal_alleleFrequencies_chrn.txt (1-23), tumor_alleleFrequencies_chrn.txt (1-23), tumor_mutantBAF.tab, tumor_mutantLogR.tab, tumor_normalBAF.tab, tumor_normalLogR.tab, tumor_alleleCounts.tab. These txt files seem normal, but all tab files only have column names. Here are the input parameters and the command executed:

nb="/input/normal_hg38_sort_rmdup.bam" 
tb="/input/tumor_hg38_sort_rmdup.bam" 
outdir="/output/" 
Rscript /battenberg_pipline_test/battenberg_wgs.R -t tumor -n normal --tb ${tb} --nb ${nb} --sex Male -o ${outdir}

The battenberg_wgs.R script in the attachment was downloaded from the inst/example directory on GitHub and has been modified to update the file paths. battenberg_wgs.json

Thanks for your help.

fswirsky commented 7 months ago

Hi,

I'm similarly running into the same issue - I'm running the battenber_wgs.R pipeline on a matched tumour and normal BAM, 131GB and 77GB respectively. I'm currently on a runtime of 5 days and there haven't been any changes for almost all that time, with the most recent output being:

Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Reading locis
Done reading locis
Multi pos start:
Reading locis
Done reading locis
Multi pos start:
Reading locis
Reading locis
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
Done reading locis
Multi pos start:
[1] "minCount=10"

And this was about ~1hr into running and hasn't changed since. I'm running the most recent version of Battenbery (v2.2.9) and R (v4.3.1). Similarly, the only outputted files - like sunnaa0423 - are: normal_alleleFrequencies_chrn.txt (1-23), tumor_alleleFrequencies_chrn.txt (1-23), tumor_mutantBAF.tab, tumor_mutantLogR.tab, tumor_normalBAF.tab, tumor_normalLogR.tab, tumor_alleleCounts.tab, with the allFrequencies.text files looking normal, but all .tab files have only column names. Any help is greatly appreciated.

jcesar101 commented 7 months ago

Hi,

thank you for providing additional details about this issue. Considering the step where the execution hangs and that the tab files are mostly empty, it looks like there is not enough memory allocated to the job when creating the BAF and LogR data frames.

Could you please confirm how much memory is currently being allocated and, if possible, try to increase the memory to see if that fixes the problem?

fswirsky commented 7 months ago

Hi,

Thank you for replying promptly. At the moment I've got it running on an HPC - it is running across 34 cores with 122GB per core; I'm not sure if it automatically distributes the job across cores so maybe that is the problem. The current job is using all of my recourse allowance for the HPC so I can kill the job and retry with ~1TB of memory on just 1 core instead.

a3schiller commented 7 months ago

Hi!

I have the same issue as @fswirsky. I'm using the latest version from the development branch. I'm running in "paired" mode with both tumor and normal BAM file around 100GB. I'm running on 28 cores (--cpu 28) with 8GB per core, however it seems like only 1 core is actually used.

Thanks in advance!

jcesar101 commented 7 months ago

Hi,

the default number of worker threads (if --cpu argument is not specified) is 8, thus, is not clear why it's only using one core. Could this be something associated with the resource allocation on the running environment? For instance, in a SGE HPC we have to specify the number of cores that should be allocated to the job (-pe [ smp | mpi | openmp ] <number_of_cores>) in addition to the --cpu parameter in the package.

a3schiller commented 7 months ago

Hi,

Thanks for your fast reply! I can see that 28 cores are allocated to Battenberg but that only 1 is running on 100% and the rest is running on 0%. So it seems like the cores that are allocated to Battenberg is not used properly.

Best, Alice

jcesar101 commented 7 months ago

The environment heavily affects the way paralelisation is performed (shared memory allocation, disk access, inter-process communication), but it's also worth noting that not all the steps in Battenberg can be parallelised (for instance when consolidating results at sample-level), and even for those steps that do have some degree of parallelisation, it may not always benefit from increasing the number of threads, but quite the opposite --e.g., due to overhead in thread synchronisation.

I would start with a relatively small number of threads and then gradually increase this number to see how the overall execution performs. In our environment, I have noticed a significant improvement when moving from 8 to 16 cores, but more than that has the opposite effect, and this may be different in other environments.

a3schiller commented 7 months ago

Hi again!

After a lot of troubleshooting I believe that the problem is in the function getAlleleCounts which gives both tumor and normal alleleFrequencies_chr.txt files, but all values (Count_A, Count_C, Count_G, Count_T, Good_depth) are 0 in these files - which I assume is wrong. Also, the files alleleCounts.tab, mutantBAF.tab, mutantLogR.tab, normalBAF.tab, and normalLogR.tab consist only of headers and no other data. This results in an empty SNPpos used as input to split_genome and I get stuck in the first while-loop.

Since this problem originally comes from getAlleleCounts, my guess is that there is some problem related to alleleCounter rather than Battenberg - but I have not come further than this in my troubleshooting. Still wanted to post it here if it can be of any help to you or if someone have an solution to this problem!

Best,

Alice

sunnaa0423 commented 7 months ago

Hi there!

I found that the issue with “minCount=10” mainly arises from the problematic function battenberg() -> prepare_wgs() -> getBAFsAndLogRs() ->concatenateG1000SnpFiles(). This function(concatenateG1000SnpFiles()) is located in https://github.com/Wedge-lab/battenberg/tree/master/R/util.R, but I couldn't locate it in the source files of the R package downloaded on the HPC. Also, redefining it doesn't override the internal calls. How should I fix this? I need help!

Thanks!

Below is the content of the function, with modifications made on line 6:chrom <- paste0("chr",chrom)

concatenateG1000SnpFiles<-function(inputStart, inputEnd, chr_names) {
  data = list()
  for(chrom in chr_names) {
    filename = paste(inputStart, chrom, inputEnd, sep="")
    if(file.exists(filename) && file.info(filename)$size>0) {
      chrom <- paste0("chr",chrom)    #”chr" prefix should be added, otherwise it won't intersect with objects in the file, resulting in getting stuck at "minCount=10”.
      data[[chrom]] = cbind(chromosome=chrom, read_table_generic(filename))
    }
  }
  return(as.data.frame(do.call(rbind, data)))
}
jcesar101 commented 6 months ago

Hi again!

After a lot of troubleshooting I believe that the problem is in the function getAlleleCounts which gives both tumor and normal alleleFrequencies_chr.txt files, but all values (Count_A, Count_C, Count_G, Count_T, Good_depth) are 0 in these files - which I assume is wrong. Also, the files alleleCounts.tab, mutantBAF.tab, mutantLogR.tab, normalBAF.tab, and normalLogR.tab consist only of headers and no other data. This results in an empty SNPpos used as input to split_genome and I get stuck in the first while-loop.

Since this problem originally comes from getAlleleCounts, my guess is that there is some problem related to alleleCounter rather than Battenberg - but I have not come further than this in my troubleshooting. Still wanted to post it here if it can be of any help to you or if someone have an solution to this problem!

Best,

Alice

Hi Alice,

apologies for not replying earlier. That could very well be a problem with the alleleCounter, either to execute the command itself (e.g., executable location/privileges or environment variables) or something during the execution (e.g., resources, dependencies).

You can try executing the following command to test this tool outside of the Battenberg pipeline:

alleleCounter -b <bam_file> -l <single_chromosome_reference_file> -o <output_file> -m 20 -q 35

jcesar101 commented 6 months ago

Hi there!

I found that the issue with “minCount=10” mainly arises from the problematic function battenberg() -> prepare_wgs() -> getBAFsAndLogRs() ->concatenateG1000SnpFiles(). This function(concatenateG1000SnpFiles()) is located in https://github.com/Wedge-lab/battenberg/tree/master/R/util.R, but I couldn't locate it in the source files of the R package downloaded on the HPC. Also, redefining it doesn't override the internal calls. How should I fix this? I need help!

Thanks!

Below is the content of the function, with modifications made on line 6:chrom <- paste0("chr",chrom)

concatenateG1000SnpFiles<-function(inputStart, inputEnd, chr_names) {
  data = list()
  for(chrom in chr_names) {
    filename = paste(inputStart, chrom, inputEnd, sep="")
    if(file.exists(filename) && file.info(filename)$size>0) {
      chrom <- paste0("chr",chrom)    #”chr" prefix should be added, otherwise it won't intersect with objects in the file, resulting in getting stuck at "minCount=10”.
      data[[chrom]] = cbind(chromosome=chrom, read_table_generic(filename))
    }
  }
  return(as.data.frame(do.call(rbind, data)))
}

Hi!,

to prevent errors with or without the "chr" prefix, two set of reference files were provided in the following link. Please try this alternative before modifying the code.

sunnaa0423 commented 6 months ago

Hi there! I found that the issue with “minCount=10” mainly arises from the problematic function battenberg() -> prepare_wgs() -> getBAFsAndLogRs() ->concatenateG1000SnpFiles(). This function(concatenateG1000SnpFiles()) is located in https://github.com/Wedge-lab/battenberg/tree/master/R/util.R, but I couldn't locate it in the source files of the R package downloaded on the HPC. Also, redefining it doesn't override the internal calls. How should I fix this? I need help! Thanks! Below is the content of the function, with modifications made on line 6:chrom <- paste0("chr",chrom)

concatenateG1000SnpFiles<-function(inputStart, inputEnd, chr_names) {
  data = list()
  for(chrom in chr_names) {
    filename = paste(inputStart, chrom, inputEnd, sep="")
    if(file.exists(filename) && file.info(filename)$size>0) {
      chrom <- paste0("chr",chrom)    #”chr" prefix should be added, otherwise it won't intersect with objects in the file, resulting in getting stuck at "minCount=10”.
      data[[chrom]] = cbind(chromosome=chrom, read_table_generic(filename))
    }
  }
  return(as.data.frame(do.call(rbind, data)))
}

Hi!,

to prevent errors with or without the "chr" prefix, two set of reference files were provided in the following link. Please try this alternative before modifying the code.

Hi

Firstly, I did download the reference files from this link, where under the 1000G_loci_hg38 folder, there are only these three types of files as follows.

Pasted Graphic 2

I'm not sure if the two types of files are you mentioned two set of reference files , but both of them contain "chr". There's only one file named XXX_allele_indexXXX, which is read by the concatenateG1000SnpFiles() function. However, this file does not contain a column with "chr". image

getBAFsAndLogRs <- function (tumourAlleleCountsFile.prefix, normalAlleleCountsFile.prefix, 
    figuresFile.prefix, BAFnormalFile, BAFmutantFile, logRnormalFile, 
    logRmutantFile, combinedAlleleCountsFile, chr_names, g1000file.prefix, 
    minCounts = NA, samplename = "sample1", seed = as.integer(Sys.time())) 
{
    set.seed(seed)
    input_data = concatenateAlleleCountFiles(tumourAlleleCountsFile.prefix, 
        ".txt", chr_names)
    normal_input_data = concatenateAlleleCountFiles(normalAlleleCountsFile.prefix,".txt", chr_names)

    allele_data = concatenateG1000SnpFiles(g1000file.prefix,".txt", chr_names)

    chrpos_allele = paste(allele_data[, 1], "_", allele_data[, 
        2], sep = "")
    chrpos_normal = paste(normal_input_data[, 1], "_", normal_input_data[, 
        2], sep = "")
    chrpos_tumour = paste(input_data[, 1], "_", input_data[, 
        2], sep = "")
    matched_data = Reduce(intersect, list(chrpos_allele, chrpos_normal, 
        chrpos_tumour))
    allele_data = allele_data[chrpos_allele %in% matched_data, 
        ]
    normal_input_data = normal_input_data[chrpos_tumour %in% 
        matched_data, ]
    input_data = input_data[chrpos_tumour %in% matched_data, 
    ..............
    ..............
        ]

After the function concatenateG1000SnpFiles() reads it, the chromosome column in the “allele_data” variable obtained does not contain "chr".

head(allele_data) image

Therefore, it is not possible to intersect with other variable. I don't know how to solve this. Please give me some detailed solutions. Thank you.

jcesar101 commented 6 months ago

Hi,

the zip file downloaded from ORA includes a file named 1000G_loci_hg38.zip with files named 1kg.phase3.v5a_GRCh38nounref_loci_chr_N_.txt with these contents:

1   16103
1   51479
1   51898
1   51928
1   54490
...

And it also includes a file named 1000G_loci_hg38_chr.zip that contains the same set of files 1kg.phase3.v5a_GRCh38nounref_loci_chr_N_.txt, but with these contents:

chr1    16103
chr1    51479
chr1    51898
chr1    51928
chr1    54490
...

Therefore, in this case, try using the zip files (extracted from the downloaded large zip file) ending with "_chr.zip" as they should be already edited with the chr prefix in the chromosome names.