luntergroup / octopus

Bayesian haplotype-based mutation calling
MIT License
301 stars 37 forks source link

training data used for the random forest model #179

Closed msahraeian closed 3 years ago

msahraeian commented 3 years ago

Hi,

I was wondering which samples were used to train the random forest model (germline) in version 0.7.4.

Thanks

dancooke commented 3 years ago

Hi, here's the forest training config for the latest germline forest. All of the raw data is from public sources.

# Named truth sets. Each key is an identifier that entries under `examples`
# refer to through their `truth` field; each value supplies a `vcf` path and a
# `bed` path.
# NOTE(review): presumably `vcf` holds the truth variant calls and `bed` the
# regions in which those calls are trusted - confirm against the training tool.
truths:
    GRCh37.HG001:
        vcf: /data/GIAB/truth/GRCh37/HG001/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz
        bed: /data/GIAB/truth/GRCh37/HG001/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel.bed
    # The hg19 HG001 files are stored under the GRCh37 directory.
    hg19.HG001:
        vcf: /data/GIAB/truth/GRCh37/HG001/HG001_hg19_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz
        bed: /data/GIAB/truth/GRCh37/HG001/HG001_hg19_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel.bed
    # Same `vcf` as GRCh37.HG001, paired with a different `bed`.
    GRCh37.HG001.nexterarapidcapture_expandedexome:
        vcf: /data/GIAB/truth/GRCh37/HG001/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz
        bed: /rescomp/nexterarapidcapture_expandedexome_targetedregions.GIAB_highconfg.isec.bed
    GRCh38.HG001:
        vcf: /data/GIAB/truth/GRCh38/HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz
        bed: /data/GIAB/truth/GRCh38/HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed
    # NOTE(review): these file names say "v4.1_draft", while the GRCh38
    # HG002/HG003/HG004 entries say "v4.2" - confirm the difference is intended.
    GRCh37.HG002:
        vcf: /data/GIAB/truth/GRCh37/HG002/HG002_GRCh37_1_22_v4.1_draft_benchmark.vcf.gz
        bed: /data/GIAB/truth/GRCh37/HG002/HG002_GRCh37_1_22_v4.1_draft_benchmark.bed
    GRCh38.HG002:
        vcf: /data/GIAB/truth/GRCh38/HG002/HG002_GRCh38_1_22_v4.2_benchmark.vcf.gz
        bed: /data/GIAB/truth/GRCh38/HG002/HG002_GRCh38_1_22_v4.2_benchmark.bed
    GRCh38.HG003:
        vcf: /data/GIAB/truth/GRCh38/HG003/HG003_GRCh38_1_22_v4.2_benchmark.vcf.gz
        bed: /data/GIAB/truth/GRCh38/HG003/HG003_GRCh38_1_22_v4.2_benchmark.bed
    GRCh38.HG004:
        vcf: /data/GIAB/truth/GRCh38/HG004/HG004_GRCh38_1_22_v4.2_benchmark.vcf.gz
        bed: /data/GIAB/truth/GRCh38/HG004/HG004_GRCh38_1_22_v4.2_benchmark.bed
    GRCh37.HG005:
        vcf: /data/GIAB/truth/GRCh37/HG005/HG005_GRCh37_highconf_CG-IllFB-IllGATKHC-Ion-SOLID_CHROM1-22_v.3.3.2_highconf.vcf.gz
        bed: /data/GIAB/truth/GRCh37/HG005/HG005_GRCh37_highconf_CG-IllFB-IllGATKHC-Ion-SOLID_CHROM1-22_v.3.3.2_highconf_noMetaSV.bed
    GRCh38.HG005:
        vcf: /data/GIAB/truth/GRCh38/HG005/HG005_GRCh38_GIAB_highconf_CG-Illfb-IllsentieonHC-Ion-10XsentieonHC-SOLIDgatkHC_CHROM1-22_v.3.3.2_highconf.vcf.gz
        bed: /data/GIAB/truth/GRCh38/HG005/HG005_GRCh38_GIAB_highconf_CG-Illfb-IllsentieonHC-Ion-10XsentieonHC-SOLIDgatkHC_CHROM1-22_v.3.3.2_highconf.bed
    GRCh38.HG006:
        vcf: /data/GIAB/truth/GRCh38/HG006/HG006_GIAB_GRCh38_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.vcf.gz
        bed: /data/GIAB/truth/GRCh38/HG006/HG006_GIAB_GRCh38_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf_noinconsistent.bed
    GRCh38.HG007:
        vcf: /data/GIAB/truth/GRCh38/HG007/HG007_GIAB_GRCh38_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.vcf.gz
        bed: /data/GIAB/truth/GRCh38/HG007/HG007_GIAB_GRCh38_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf_noinconsistent.bed
    # Unlike the entries above, the SynDip `bed` paths carry a .gz suffix.
    GRCh37.SynDip:
        vcf: /data/CHM/truth/full.37m.vcf.gz
        bed: /data/CHM/truth/full.37m.bed.gz
    GRCh38.SynDip:
        vcf: /data/CHM/truth/full.38.vcf.gz
        bed: /data/CHM/truth/full.38.bed.gz
# Training examples. Each list entry names a `reference`, `reads`, `regions`
# and `truth`, plus an optional `options` string. `reads` is either a single
# path or a list of paths; when it is a list, `truth` is a mapping from sample
# name to a key of `truths`, otherwise `truth` is a single `truths` key.
# NOTE(review): `options` looks like extra command-line flags for the variant
# caller - confirm against the training tool.
examples:
    # Single-sample entries for the HG002, HG003 and HG004 60x BAMs.
    -
        reference: hs38DH.fa
        reads: /data/GIAB/bam/HG002.GRCh38.60x.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.HG002
        options: --ignore-unmapped-contigs
    -
        reference: hs38DH.fa
        reads: /data/GIAB/bam/HG003.GRCh38.60x.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.HG003
        options: --ignore-unmapped-contigs
    -
        reference: hs38DH.fa
        reads: /data/GIAB/bam/HG004.GRCh38.60x.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.HG004
        options: --ignore-unmapped-contigs
    # Three-sample entry combining the same three 60x BAMs.
    # NOTE(review): `reference` is an absolute path here but a bare file name
    # in the single-sample entries above - confirm both resolve to the same file.
    # NOTE(review): `-M HG004 -F HG003` presumably names the maternal and
    # paternal samples for trio calling - confirm.
    -
      reference: /data/references/hs38DH.fa
      reads:
          - /data/GIAB/bam/HG002.GRCh38.60x.bam
          - /data/GIAB/bam/HG003.GRCh38.60x.bam
          - /data/GIAB/bam/HG004.GRCh38.60x.bam
      regions: hs38DH.chr1-M.bed
      truth:
          HG002: GRCh38.HG002
          HG003: GRCh38.HG003
          HG004: GRCh38.HG004
      options: -M HG004 -F HG003 --ignore-unmapped-contigs
    # Single-sample NovaSeq entries; per the file names: HG002 at 30x, HG003 at
    # 20x and HG004 at 10x.
    -
        reference: hs38DH.fa
        # NOTE(review): this path lacks the `mapped/` directory that every
        # other NovaSeq path under /data/pFDA/bam has (the same file name is
        # listed with `mapped/` in the 30x three-sample entry) - confirm it is
        # not a typo.
        reads: /data/pFDA/bam/HG002.NovaSeq.30x.hs38DH.bwa.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.HG002
    -
        reference: hs38DH.fa
        reads: /data/pFDA/bam/mapped/HG003.NovaSeq.20x.hs38DH.bwa.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.HG003
    -
        reference: hs38DH.fa
        reads: /data/pFDA/bam/mapped/HG004.NovaSeq.10x.hs38DH.bwa.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.HG004
    # Three-sample NovaSeq entries (HG002, HG003, HG004), all with
    # `-M HG004 -F HG003`. They differ only in the per-sample coverage encoded
    # in the BAM file names.
    # All three samples at 30x.
    -
      reference: /data/references/hs38DH.fa
      reads:
          - /data/pFDA/bam/mapped/HG002.NovaSeq.30x.hs38DH.bwa.bam
          - /data/pFDA/bam/mapped/HG003.NovaSeq.30x.hs38DH.bwa.bam
          - /data/pFDA/bam/mapped/HG004.NovaSeq.30x.hs38DH.bwa.bam
      regions: hs38DH.chr1-M.bed
      truth:
          HG002: GRCh38.HG002
          HG003: GRCh38.HG003
          HG004: GRCh38.HG004
      options: -M HG004 -F HG003
    # All three samples at 20x.
    -
      reference: /data/references/hs38DH.fa
      reads:
          - /data/pFDA/bam/mapped/HG002.NovaSeq.20x.hs38DH.bwa.bam
          - /data/pFDA/bam/mapped/HG003.NovaSeq.20x.hs38DH.bwa.bam
          - /data/pFDA/bam/mapped/HG004.NovaSeq.20x.hs38DH.bwa.bam
      regions: hs38DH.chr1-M.bed
      truth:
          HG002: GRCh38.HG002
          HG003: GRCh38.HG003
          HG004: GRCh38.HG004
      options: -M HG004 -F HG003
    # All three samples at 10x.
    -
      reference: /data/references/hs38DH.fa
      reads:
          - /data/pFDA/bam/mapped/HG002.NovaSeq.10x.hs38DH.bwa.bam
          - /data/pFDA/bam/mapped/HG003.NovaSeq.10x.hs38DH.bwa.bam
          - /data/pFDA/bam/mapped/HG004.NovaSeq.10x.hs38DH.bwa.bam
      regions: hs38DH.chr1-M.bed
      truth:
          HG002: GRCh38.HG002
          HG003: GRCh38.HG003
          HG004: GRCh38.HG004
      options: -M HG004 -F HG003
    # HG002 at 10x with HG003 and HG004 at 30x.
    -
      reference: /data/references/hs38DH.fa
      reads:
          - /data/pFDA/bam/mapped/HG002.NovaSeq.10x.hs38DH.bwa.bam
          - /data/pFDA/bam/mapped/HG003.NovaSeq.30x.hs38DH.bwa.bam
          - /data/pFDA/bam/mapped/HG004.NovaSeq.30x.hs38DH.bwa.bam
      regions: hs38DH.chr1-M.bed
      truth:
          HG002: GRCh38.HG002
          HG003: GRCh38.HG003
          HG004: GRCh38.HG004
      options: -M HG004 -F HG003
    # HG002 at 30x with HG003 and HG004 at 10x.
    -
      reference: /data/references/hs38DH.fa
      reads:
          - /data/pFDA/bam/mapped/HG002.NovaSeq.30x.hs38DH.bwa.bam
          - /data/pFDA/bam/mapped/HG003.NovaSeq.10x.hs38DH.bwa.bam
          - /data/pFDA/bam/mapped/HG004.NovaSeq.10x.hs38DH.bwa.bam
      regions: hs38DH.chr1-M.bed
      truth:
          HG002: GRCh38.HG002
          HG003: GRCh38.HG003
          HG004: GRCh38.HG004
      options: -M HG004 -F HG003
    # Three-sample BGISEQ-500 entry. The sample names NA24631, NA24694 and
    # NA24695 are mapped to the HG005, HG006 and HG007 truth sets respectively.
    -
      reference: /data/references/hs38DH.fa
      reads:
          - /data/GIAB/bam/NA24631.BGISEQ500.GIAB.CL100076244.bwa.hs38DH.bam
          - /data/GIAB/bam/NA24694.BGISEQ500.GIAB.CL100076304.bwa.hs38DH.bam
          - /data/GIAB/bam/NA24695.BGISEQ500.GIAB.CL100076244.bwa.hs38DH.bam
      regions: hs38DH.chr1-M.bed
      truth:
          NA24631: GRCh38.HG005
          NA24694: GRCh38.HG006
          NA24695: GRCh38.HG007
      options: -M NA24695 -F NA24694
    # NA24631 on its own, using the same BAM as the three-sample entry above.
    -
        reference: /data/references/hs38DH.fa
        reads: /data/GIAB/bam/NA24631.BGISEQ500.GIAB.CL100076244.bwa.hs38DH.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.HG005
    # The single-sample entries below mix hs37d5 and hs38DH references:
    # hs37d5 entries use hs37d5.1-MT.bed with a GRCh37.* truth set, and hs38DH
    # entries use hs38DH.chr1-M.bed with a GRCh38.* truth set.
    # NA12878 BAMs scored against the HG001 truth sets.
    -
        reference: hs37d5.fa
        reads: /data/1000G/bam/NA12878.mapped.ILLUMINA.bwa.CEU.high_coverage_pcr_free.20130906.bam
        regions: hs37d5.1-MT.bed
        truth: GRCh37.HG001
    -
        reference: hs37d5.fa
        reads: /data/1000G/bam/NA12878.mapped.ILLUMINA.bwa.CEU.low_coverage.20121211.bam
        regions: hs37d5.1-MT.bed
        truth: GRCh37.HG001
    -
        reference: hs37d5.fa
        reads: /data/BGISEQ-500/bam/NA12878.PE150-2.BGISEQ-500.bwa-mem.b37.bam
        regions: hs37d5.1-MT.bed
        truth: GRCh37.HG001
        options: --sequence-error-model=PCR
    -
        reference: hs38DH.fa
        reads: /data/bs/project/116384274/NA12878-PCRF450-1_ds.90038050736149d285f6965bb5dffba4/NA12878-PCRF450-1.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.HG001
        options: --ignore-unmapped-contigs
    -
        reference: hs38DH.fa
        reads: /data/bs/project/116384274/NA12878-PCRF450-2_ds.bc83ce4ddeec45809bd344ddd3dbda7e/NA12878-PCRF450-2.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.HG001
        options: --ignore-unmapped-contigs
    -
        reference: hs38DH.fa
        reads: /data/novaseq/bam/NA12878.PE150.NovaSeq.bwa.hs38DH.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.HG001
        options: --sequence-error-model=PCR
    -
        reference: hs37d5.fa
        reads: /data/platinum/bam/NA12878.platinum.bwa-mem.b37.bam
        regions: hs37d5.1-MT.bed
        truth: GRCh37.HG001
    -
        reference: hs37d5.fa
        reads: /data/garvan/bam/NA12878.xTen.bwa-mem.b37.bam
        regions: hs37d5.1-MT.bed
        truth: GRCh37.HG001
        options: --sequence-error-model=PCR
    -
        reference: hs37d5.fa
        reads: /data/pFDA/bam/NA12878.precisionFDA.novoalign.hs37d5.bam
        regions: hs37d5.1-MT.bed
        truth: GRCh37.HG001
    # HG002 and HG005 on hs37d5.
    -
        reference: hs37d5.fa
        reads: /data/GIAB/bam/HG002_Sample_2D2_2F1_2F2_2L1_2L2.bam
        regions: hs37d5.1-MT.bed
        truth: GRCh37.HG002
    -
        reference: hs37d5.fa
        reads: /data/GIAB/bam/HG005.GIAB.b37.novoalign.bam
        regions: hs37d5.1-MT.bed
        truth: GRCh37.HG005
    # CHM1_CHM13 BAMs scored against the SynDip truth sets, one per reference.
    -
        reference: hs37d5.fa
        reads: /data/CHM/bam/CHM1_CHM13_2.bam
        regions: hs37d5.1-MT.bed
        truth: GRCh37.SynDip
        options: --ignore-unmapped-contigs
    -
        reference: hs38DH.fa
        reads: /data/CHM/bam/CHM1_CHM13.Broad_3.XTen.bwa.hs38DH.bam
        regions: hs38DH.chr1-M.bed
        truth: GRCh38.SynDip
    # Two-sample entry pairing the HG002 60x BAM with a CHM1_CHM13 BAM; each
    # sample is mapped to its own truth set.
    -
      reference: /data/references/hs38DH.fa
      reads: 
          - /data/GIAB/bam/HG002.GRCh38.60x.bam
          - /data/CHM/bam/CHM1_CHM13.Broad_3.XTen.bwa.hs38DH.bam
      regions: hs38DH.chr1-M.bed
      truth:
          HG002: GRCh38.HG002
          CHM1_CHM13: GRCh38.SynDip
      options: --ignore-unmapped-contigs
    # The only entry that uses the hg19 reference and the hg19.HG001 truth set.
    -
        reference: hg19.fa
        reads: /data/10X/bam/NA12878_WGS_v2_phased_possorted_bam.bam
        regions: hg19.chr1-M.bed
        truth: hg19.HG001
        options: --sequence-error-model=PCR --ignore-unmapped-contigs
    # Exome BAM (per the file name), scored against the truth set whose key
    # ends in `nexterarapidcapture_expandedexome`.
    -
        reference: hs37d5.fa
        reads: /data/GIAB/bam/NA12878.garvan.exome.NIST7035.trimmed.bwa.hs37d5.bam
        regions: hs37d5.1-MT.bed
        truth: GRCh37.HG001.nexterarapidcapture_expandedexome
        options: --sequence-error-model=PCR
    # Two BAMs are listed, but `truth` names only NA12878; no truth set is
    # given for the HG02058 BAM.
    -
        reference: hs37d5.fa
        reads:
            - /data/1000G/bam/HG02058.mapped.ILLUMINA.bwa.KHV.low_coverage.20120522.bam
            - /data/1000G/bam/NA12878.mapped.ILLUMINA.bwa.CEU.low_coverage.20121211.bam
        regions: hs37d5.1-MT.bed
        truth:
            NA12878: GRCh37.HG001
    # Two BAMs are listed (NORMAL and BRCA, per the file names). `truth` names
    # only NA12878.NORMAL, which `options` also passes as `--normal-sample`.
    -
        reference: hs37d5.fa
        reads:
            - /data/syntumour/bam/NA12878.NORMAL.30x.bwa-mem.b37.bam
            - /data/syntumour/bam/NA12878.BRCA.60x.bwa-mem.b37.bam
        regions: hs37d5.1-MT.bed
        truth:
            NA12878.NORMAL: GRCh37.HG001
        options: --normal-sample=NA12878.NORMAL
# Forest training settings.
training:
    # Per the accompanying note in the thread, only this fraction (25%) of the
    # examples is actually used to train the forest.
    training_fraction: 0.25
    # A list of hyperparameter sets; only one set is given here.
    # NOTE(review): `trees` and `min_node_size` are presumably the number of
    # trees and the minimum node size of the random forest - confirm.
    hyperparameters:
        -
            trees: 200
            min_node_size: 20

Note that only 25% of the examples (training_fraction: 0.25) are actually used to train the forest. Let me know if you'd like more details.