Index out of bounds error for FilterMutectCalls

rlancaster96 commented 4 months ago

Expected behavior: With previous test files (small, 800-line sliced fastq), behaves as expected and FilterMutectCalls completes successfully. With large fastq files or full-size files, FilterMutectCalls exits with the error below:

Command run:

Workflow:

#!/usr/bin/env nextflow

// Create queue channels (consumable)
// will need to split up into tumor channel and normal channel, use regex for this
tumor_ch = Channel.fromPath("${params.bam_files}/*_T_*.bam")
normal_ch = Channel.fromPath("${params.bam_files}/*_G_*.bam")

// Create value channels (use first operator to convert to value from queue)
tumor_val = tumor_ch.first()
normal_val = normal_ch.first()

// Sample id channel: take full filename and grab the sample id by splitting by _ and taking the first item
sample_id = tumor_ch.map { filePath -> 
                def fileName = filePath.baseName
                def baseName = fileName.split('_')[0]
                return baseName}
sample_id_ch = sample_id.first()

// Define the list of chromosomes + create a channel emitting each chromosome
chromosomes = (1..22).collect { it.toString() } + ['X']
chrom_strings = Channel.from(chromosomes)
chrom_ch = chrom_strings.map { it -> "chr" + it }

include { GETPILEUPSUMMARIES } from '../../tools/gatk/get_pileup_summaries.nf'
include { CALCULATECONTAMINATION } from '../../tools/gatk/calculate_contamination.nf'
include { MUTECT2 } from '../../tools/gatk/mutect.nf'
include { BGZIP; PREPAREVCF } from '../../tools/bcftools/prepareVCFs.nf'
include { MERGESTATS } from '../../tools/bcftools/combineMutectStats.nf'
include { LEARNORIENTATION } from '../../tools/bcftools/combineF1R2files.nf'
include { FILTERMUTECT } from '../../tools/gatk/filter_mutect.nf'
include { ANNOTATE } from '../../tools/snpeff/annotate_variants.nf'

workflow {

    GETPILEUPSUMMARIES(tumor_ch, normal_ch, params.exac)
    tumor_table = GETPILEUPSUMMARIES.out.tumor
    normal_table = GETPILEUPSUMMARIES.out.normal

    CALCULATECONTAMINATION(tumor_table, normal_table)
    contam_table = CALCULATECONTAMINATION.out.contamination
    segment_table = CALCULATECONTAMINATION.out.segment

    // Run mutect2
    MUTECT2(tumor_val, normal_val, chrom_ch, sample_id_ch)

    // Merge and prepare VCF
    BGZIP(MUTECT2.out.vcf)
    vcfs_ch = BGZIP.out.vcf.collect()
    split_vcf_index = BGZIP.out.index.collect()
    PREPAREVCF(vcfs_ch, split_vcf_index, sample_id_ch)
    unfiltered_vcf = PREPAREVCF.out.normalized
    unfiltered_vcf_index = PREPAREVCF.out.index

    // Merge stats 
    stats = MUTECT2.out.stats
    stats_ch = stats.collect()
    MERGESTATS(stats_ch, sample_id_ch)
    filter_stats = MERGESTATS.out

    // Merge f1r2 read orientation files 
    f1r2files = MUTECT2.out.f1r2
    f1r2_ch = f1r2files.collect()
    LEARNORIENTATION(f1r2_ch, sample_id_ch)
    orientationmodel = LEARNORIENTATION.out

    // Filter mutect2 calls
    FILTERMUTECT(unfiltered_vcf, unfiltered_vcf_index, params.mutect_idx, params.mutect_idx_fai, params.mutect_idx_dict, filter_stats, orientationmodel, segment_table, contam_table, sample_id_ch)
    filter_vcf = FILTERMUTECT.out

    // Annotate with snpEff
    ANNOTATE(filter_vcf, sample_id_ch)
}

Tool:

#!/usr/bin/env nextflow

// filter Mutect2 calls with GATK
process FILTERMUTECT {
    // Set maximum memory
    memory "${params.mutect_memory}"

    container "${params.container_gatk}"

    publishDir "${params.outdir}/filtered", mode: 'copy'

    input:
    path unfiltered_vcf
    path unfiltered_vcf_index
    path mutect_idx
    path mutect_idx_fai
    path mutect_dict
    path vcf_stats
    path read_orientation_model
    path segmentation_table
    path contamination_table
    val sample_id

    output:
    path("${sample_id}_filtered.vcf")

    script:
    """
    gatk FilterMutectCalls \
    -R ${params.mutect_idx} \
    -V ${unfiltered_vcf} \
    --tumor-segmentation ${segmentation_table} \
    --contamination-table ${contamination_table} \
    --read-index ${mutect_idx_fai} \
    --sequence-dictionary ${mutect_dict} \
    --ob-priors ${read_orientation_model} \
    -O ${sample_id}_filtered.vcf \
    --stats ${vcf_stats}
    """
}

Error:

java.lang.IndexOutOfBoundsException: Index 1 out of bounds for length 1
    at java.base/jdk.internal.util.Preconditions.outOfBounds(Preconditions.java:64)
    at java.base/jdk.internal.util.Preconditions.outOfBoundsCheckIndex(Preconditions.java:70)
    at java.base/jdk.internal.util.Preconditions.checkIndex(Preconditions.java:266)
    at java.base/java.util.Objects.checkIndex(Objects.java:361)
    at java.base/java.util.ArrayList.get(ArrayList.java:427)
    at org.broadinstitute.hellbender.tools.walkers.mutect.filtering.StrandArtifactFilter.lambda$calculateArtifactProbabilities$6(StrandArtifactFilter.java:74)
    at java.base/java.util.stream.IntPipeline$1$1.accept(IntPipeline.java:180)
    at java.base/java.util.stream.Streams$RangeIntSpliterator.forEachRemaining(Streams.java:104)
    at java.base/java.util.Spliterator$OfInt.forEachRemaining(Spliterator.java:711)
    at java.base/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:509)
    at java.base/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:499)
    at java.base/java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:921)
    at java.base/java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
    at java.base/java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:682)
    at org.broadinstitute.hellbender.tools.walkers.mutect.filtering.StrandArtifactFilter.calculateArtifactProbabilities(StrandArtifactFilter.java:80)
    at org.broadinstitute.hellbender.tools.walkers.mutect.filtering.StrandArtifactFilter.calculateErrorProbabilityForAlleles(StrandArtifactFilter.java:50)
    at org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2AlleleFilter.errorProbabilities(Mutect2AlleleFilter.java:86)
    at org.broadinstitute.hellbender.tools.walkers.mutect.filtering.ErrorProbabilities.lambda$new$0(ErrorProbabilities.java:27)
    at java.base/java.util.stream.Collectors.lambda$toMap$68(Collectors.java:1674)
    at java.base/java.util.stream.ReduceOps$3ReducingSink.accept(ReduceOps.java:169)
    at java.base/java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1625)
    at java.base/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:509)
    at java.base/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:499)
    at java.base/java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:921)
    at java.base/java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
    at java.base/java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:682)
    at org.broadinstitute.hellbender.tools.walkers.mutect.filtering.ErrorProbabilities.<init>(ErrorProbabilities.java:25)
    at org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine.accumulateData(Mutect2FilteringEngine.java:138)
    at org.broadinstitute.hellbender.tools.walkers.mutect.filtering.FilterMutectCalls.nthPassApply(FilterMutectCalls.java:154)
    at org.broadinstitute.hellbender.engine.MultiplePassVariantWalker.lambda$traverse$0(MultiplePassVariantWalker.java:40)
    at org.broadinstitute.hellbender.engine.MultiplePassVariantWalker.lambda$traverseVariants$1(MultiplePassVariantWalker.java:77)
    at java.base/java.util.stream.ForEachOps$ForEachOp$OfRef.accept(ForEachOps.java:183)
    at java.base/java.util.stream.ReferencePipeline$2$1.accept(ReferencePipeline.java:179)
    at java.base/java.util.Iterator.forEachRemaining(Iterator.java:133)
    at java.base/java.util.Spliterators$IteratorSpliterator.forEachRemaining(Spliterators.java:1845)
    at java.base/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:509)
    at java.base/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:499)
    at java.base/java.util.stream.ForEachOps$ForEachOp.evaluateSequential(ForEachOps.java:150)
    at java.base/java.util.stream.ForEachOps$ForEachOp$OfRef.evaluateSequential(ForEachOps.java:173)
    at java.base/java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
    at java.base/java.util.stream.ReferencePipeline.forEach(ReferencePipeline.java:596)
    at org.broadinstitute.hellbender.engine.MultiplePassVariantWalker.traverseVariants(MultiplePassVariantWalker.java:75)
    at org.broadinstitute.hellbender.engine.MultiplePassVariantWalker.traverse(MultiplePassVariantWalker.java:40)
    at org.broadinstitute.hellbender.engine.GATKTool.doWork(GATKTool.java:1098)
    at org.broadinstitute.hellbender.cmdline.CommandLineProgram.runTool(CommandLineProgram.java:149)
    at org.broadinstitute.hellbender.cmdline.CommandLineProgram.instanceMainPostParseArgs(CommandLineProgram.java:198)
    at org.broadinstitute.hellbender.cmdline.CommandLineProgram.instanceMain(CommandLineProgram.java:217)
    at org.broadinstitute.hellbender.Main.runCommandLineProgram(Main.java:160)
    at org.broadinstitute.hellbender.Main.mainEntry(Main.java:203)
    at org.broadinstitute.hellbender.Main.main(Main.java:289)

A similar but not identical issue was posted here for the related command Mutect2: https://github.com/broadinstitute/gatk/issues/4578#issuecomment-616557568

Attempts to fix the issue:

run with/without path mutect_idx_fai and path mutect_dict args
run with only 1 chromosome
run with create index argument set to false as per suggestion for similar haplotypecaller issue here https://gatk.broadinstitute.org/hc/en-us/community/posts/16527065134107-HaplotypeCaller-java-lang-ArrayIndexOutOfBoundsException-Index-32770-out-of-bounds-for-length-32770
remove ob priors argument

Environment: Nextflow version 24.04.2 singularity-ce version 3.8.0-1.el7 GATK 4.4.0.0 Branch: https://github.com/ohsu-cedar-comp-hub/WGS-nextflow-workflow/pull/58

rlancaster96 commented 4 months ago

Issue has been isolated to processing with bcftools norm. FilterMutectCalls works with files that are concatenated and sorted with bcftools prior to processing with bcftools norm, and fails after.

rlancaster96 commented 3 months ago

the broad institute tool last listed in the java stack trace is here: https://github.com/broadinstitute/gatk/blob/master/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/StrandArtifactFilter.java org.broadinstitute.hellbender.tools.walkers.mutect.filtering.StrandArtifactFilter.lambda$calculateArtifactProbabilities$6(StrandArtifactFilter.java:74)

the specific line is this:

return IntStream.range(0, altSBs.size()).mapToObj(i -> {
    final List<Integer> altSB = altSBs.get(i);
    final int altIndelSize = indelSizes.get(i); <<<<< line 74

this is the context:

public List<EStep> calculateArtifactProbabilities(final VariantContext vc, final Mutect2FilteringEngine filteringEngine) {
        // for each allele, forward and reverse count
        List<List<Integer>> sbs = StrandBiasUtils.getSBsForAlleles(vc);
        if (sbs == null || sbs.isEmpty() || sbs.size() <= 1) {
            return Collections.emptyList();
        }
        // remove symbolic alleles
        if (vc.hasSymbolicAlleles()) {
            sbs = GATKVariantContextUtils.removeDataForSymbolicAlleles(vc, sbs);
        }

        final List<Integer> indelSizes = vc.getAlternateAlleles().stream().map(alt -> Math.abs(vc.getReference().length() - alt.length())).collect(Collectors.toList());
        int totalFwd = sbs.stream().map(sb -> sb.get(0)).mapToInt(i -> i).sum();
        int totalRev = sbs.stream().map(sb -> sb.get(1)).mapToInt(i -> i).sum();
        // skip the reference
        final List<List<Integer>> altSBs = sbs.subList(1, sbs.size());

        return IntStream.range(0, altSBs.size()).mapToObj(i -> {
            final List<Integer> altSB = altSBs.get(i);
            final int altIndelSize = indelSizes.get(i);
            if (altSB.stream().mapToInt(Integer::intValue).sum() == 0 || altIndelSize > LONGEST_STRAND_ARTIFACT_INDEL_SIZE) {
                return new EStep(0, 0, totalFwd, totalRev, altSB.get(0), altSB.get(1));
            } else {
                return strandArtifactProbability(strandArtifactPrior, totalFwd, totalRev, altSB.get(0), altSB.get(1), altIndelSize);
            }
            }).collect(Collectors.toList());
    }

rlancaster96 commented 3 months ago

resolved with https://github.com/ohsu-cedar-comp-hub/WGS-nextflow-workflow/pull/58

the bcftools command split by multiallelic sites for indels and SNPs. This is advised against by GATK and is likely what caused the issue. https://github.com/broadinstitute/gatk/issues/5052](https://github.com/broadinstitute/gatk/issues/5052
invoking bcftools norm command does not change the structure of the file; likely indels are already left-aligned in the output of mutect2

ohsu-cedar-comp-hub / WGS-nextflow-workflow

Index out of bounds error for FilterMutectCalls #59