Closed msahraeian closed 3 years ago
Hi, here's the forest training config for the latest germline forest. All of the raw data is from public sources.
# Truth sets, keyed by <reference build>.<sample>. Each entry pairs a truth
# `vcf` with a `bed` file. Entries are referenced by name from the `truth`
# field of each item under `examples`.
truths:
  GRCh37.HG001:
    vcf: /data/GIAB/truth/GRCh37/HG001/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz
    bed: /data/GIAB/truth/GRCh37/HG001/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel.bed
  hg19.HG001:
    vcf: /data/GIAB/truth/GRCh37/HG001/HG001_hg19_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz
    bed: /data/GIAB/truth/GRCh37/HG001/HG001_hg19_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel.bed
  # Same truth VCF as GRCh37.HG001, paired with an exome-capture BED.
  GRCh37.HG001.nexterarapidcapture_expandedexome:
    vcf: /data/GIAB/truth/GRCh37/HG001/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz
    bed: /rescomp/nexterarapidcapture_expandedexome_targetedregions.GIAB_highconfg.isec.bed
  GRCh38.HG001:
    vcf: /data/GIAB/truth/GRCh38/HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz
    bed: /data/GIAB/truth/GRCh38/HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed
  GRCh37.HG002:
    vcf: /data/GIAB/truth/GRCh37/HG002/HG002_GRCh37_1_22_v4.1_draft_benchmark.vcf.gz
    bed: /data/GIAB/truth/GRCh37/HG002/HG002_GRCh37_1_22_v4.1_draft_benchmark.bed
  GRCh38.HG002:
    vcf: /data/GIAB/truth/GRCh38/HG002/HG002_GRCh38_1_22_v4.2_benchmark.vcf.gz
    bed: /data/GIAB/truth/GRCh38/HG002/HG002_GRCh38_1_22_v4.2_benchmark.bed
  GRCh38.HG003:
    vcf: /data/GIAB/truth/GRCh38/HG003/HG003_GRCh38_1_22_v4.2_benchmark.vcf.gz
    bed: /data/GIAB/truth/GRCh38/HG003/HG003_GRCh38_1_22_v4.2_benchmark.bed
  GRCh38.HG004:
    vcf: /data/GIAB/truth/GRCh38/HG004/HG004_GRCh38_1_22_v4.2_benchmark.vcf.gz
    bed: /data/GIAB/truth/GRCh38/HG004/HG004_GRCh38_1_22_v4.2_benchmark.bed
  GRCh37.HG005:
    vcf: /data/GIAB/truth/GRCh37/HG005/HG005_GRCh37_highconf_CG-IllFB-IllGATKHC-Ion-SOLID_CHROM1-22_v.3.3.2_highconf.vcf.gz
    bed: /data/GIAB/truth/GRCh37/HG005/HG005_GRCh37_highconf_CG-IllFB-IllGATKHC-Ion-SOLID_CHROM1-22_v.3.3.2_highconf_noMetaSV.bed
  GRCh38.HG005:
    vcf: /data/GIAB/truth/GRCh38/HG005/HG005_GRCh38_GIAB_highconf_CG-Illfb-IllsentieonHC-Ion-10XsentieonHC-SOLIDgatkHC_CHROM1-22_v.3.3.2_highconf.vcf.gz
    bed: /data/GIAB/truth/GRCh38/HG005/HG005_GRCh38_GIAB_highconf_CG-Illfb-IllsentieonHC-Ion-10XsentieonHC-SOLIDgatkHC_CHROM1-22_v.3.3.2_highconf.bed
  GRCh38.HG006:
    vcf: /data/GIAB/truth/GRCh38/HG006/HG006_GIAB_GRCh38_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.vcf.gz
    bed: /data/GIAB/truth/GRCh38/HG006/HG006_GIAB_GRCh38_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf_noinconsistent.bed
  GRCh38.HG007:
    vcf: /data/GIAB/truth/GRCh38/HG007/HG007_GIAB_GRCh38_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf.vcf.gz
    bed: /data/GIAB/truth/GRCh38/HG007/HG007_GIAB_GRCh38_highconf_CG-IllFB-IllSNT-10X_CHROM1-22_v.3.3.2_highconf_noinconsistent.bed
  GRCh37.SynDip:
    vcf: /data/CHM/truth/full.37m.vcf.gz
    bed: /data/CHM/truth/full.37m.bed.gz
  GRCh38.SynDip:
    vcf: /data/CHM/truth/full.38.vcf.gz
    bed: /data/CHM/truth/full.38.bed.gz
# Training examples. Each item names a `reference`, one read file or a list
# of read files under `reads`, a `regions` BED, and the `truth` to compare
# against: either a single key from `truths`, or a sample-name -> `truths`
# key mapping when several read files are given. `options`, when present,
# holds extra flag-style arguments for that example.
examples:
  # HG002/HG003/HG004, GRCh38 60x: individual samples, then all three jointly.
  - reference: hs38DH.fa
    reads: /data/GIAB/bam/HG002.GRCh38.60x.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.HG002
    options: --ignore-unmapped-contigs
  - reference: hs38DH.fa
    reads: /data/GIAB/bam/HG003.GRCh38.60x.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.HG003
    options: --ignore-unmapped-contigs
  - reference: hs38DH.fa
    reads: /data/GIAB/bam/HG004.GRCh38.60x.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.HG004
    options: --ignore-unmapped-contigs
  - reference: /data/references/hs38DH.fa
    reads:
      - /data/GIAB/bam/HG002.GRCh38.60x.bam
      - /data/GIAB/bam/HG003.GRCh38.60x.bam
      - /data/GIAB/bam/HG004.GRCh38.60x.bam
    regions: hs38DH.chr1-M.bed
    truth:
      HG002: GRCh38.HG002
      HG003: GRCh38.HG003
      HG004: GRCh38.HG004
    options: -M HG004 -F HG003 --ignore-unmapped-contigs
  # pFDA NovaSeq HG002/HG003/HG004 at 30x/20x/10x: individual samples, then
  # joint runs with various depth combinations.
  - reference: hs38DH.fa
    reads: /data/pFDA/bam/HG002.NovaSeq.30x.hs38DH.bwa.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.HG002
  - reference: hs38DH.fa
    reads: /data/pFDA/bam/mapped/HG003.NovaSeq.20x.hs38DH.bwa.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.HG003
  - reference: hs38DH.fa
    reads: /data/pFDA/bam/mapped/HG004.NovaSeq.10x.hs38DH.bwa.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.HG004
  - reference: /data/references/hs38DH.fa
    reads:
      - /data/pFDA/bam/mapped/HG002.NovaSeq.30x.hs38DH.bwa.bam
      - /data/pFDA/bam/mapped/HG003.NovaSeq.30x.hs38DH.bwa.bam
      - /data/pFDA/bam/mapped/HG004.NovaSeq.30x.hs38DH.bwa.bam
    regions: hs38DH.chr1-M.bed
    truth:
      HG002: GRCh38.HG002
      HG003: GRCh38.HG003
      HG004: GRCh38.HG004
    options: -M HG004 -F HG003
  - reference: /data/references/hs38DH.fa
    reads:
      - /data/pFDA/bam/mapped/HG002.NovaSeq.20x.hs38DH.bwa.bam
      - /data/pFDA/bam/mapped/HG003.NovaSeq.20x.hs38DH.bwa.bam
      - /data/pFDA/bam/mapped/HG004.NovaSeq.20x.hs38DH.bwa.bam
    regions: hs38DH.chr1-M.bed
    truth:
      HG002: GRCh38.HG002
      HG003: GRCh38.HG003
      HG004: GRCh38.HG004
    options: -M HG004 -F HG003
  - reference: /data/references/hs38DH.fa
    reads:
      - /data/pFDA/bam/mapped/HG002.NovaSeq.10x.hs38DH.bwa.bam
      - /data/pFDA/bam/mapped/HG003.NovaSeq.10x.hs38DH.bwa.bam
      - /data/pFDA/bam/mapped/HG004.NovaSeq.10x.hs38DH.bwa.bam
    regions: hs38DH.chr1-M.bed
    truth:
      HG002: GRCh38.HG002
      HG003: GRCh38.HG003
      HG004: GRCh38.HG004
    options: -M HG004 -F HG003
  - reference: /data/references/hs38DH.fa
    reads:
      - /data/pFDA/bam/mapped/HG002.NovaSeq.10x.hs38DH.bwa.bam
      - /data/pFDA/bam/mapped/HG003.NovaSeq.30x.hs38DH.bwa.bam
      - /data/pFDA/bam/mapped/HG004.NovaSeq.30x.hs38DH.bwa.bam
    regions: hs38DH.chr1-M.bed
    truth:
      HG002: GRCh38.HG002
      HG003: GRCh38.HG003
      HG004: GRCh38.HG004
    options: -M HG004 -F HG003
  - reference: /data/references/hs38DH.fa
    reads:
      - /data/pFDA/bam/mapped/HG002.NovaSeq.30x.hs38DH.bwa.bam
      - /data/pFDA/bam/mapped/HG003.NovaSeq.10x.hs38DH.bwa.bam
      - /data/pFDA/bam/mapped/HG004.NovaSeq.10x.hs38DH.bwa.bam
    regions: hs38DH.chr1-M.bed
    truth:
      HG002: GRCh38.HG002
      HG003: GRCh38.HG003
      HG004: GRCh38.HG004
    options: -M HG004 -F HG003
  # BGISEQ-500 NA24631/NA24694/NA24695, scored against HG005/HG006/HG007.
  - reference: /data/references/hs38DH.fa
    reads:
      - /data/GIAB/bam/NA24631.BGISEQ500.GIAB.CL100076244.bwa.hs38DH.bam
      - /data/GIAB/bam/NA24694.BGISEQ500.GIAB.CL100076304.bwa.hs38DH.bam
      - /data/GIAB/bam/NA24695.BGISEQ500.GIAB.CL100076244.bwa.hs38DH.bam
    regions: hs38DH.chr1-M.bed
    truth:
      NA24631: GRCh38.HG005
      NA24694: GRCh38.HG006
      NA24695: GRCh38.HG007
    options: -M NA24695 -F NA24694
  - reference: /data/references/hs38DH.fa
    reads: /data/GIAB/bam/NA24631.BGISEQ500.GIAB.CL100076244.bwa.hs38DH.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.HG005
  # NA12878 (HG001 truth) from several data sources and reference builds.
  - reference: hs37d5.fa
    reads: /data/1000G/bam/NA12878.mapped.ILLUMINA.bwa.CEU.high_coverage_pcr_free.20130906.bam
    regions: hs37d5.1-MT.bed
    truth: GRCh37.HG001
  - reference: hs37d5.fa
    reads: /data/1000G/bam/NA12878.mapped.ILLUMINA.bwa.CEU.low_coverage.20121211.bam
    regions: hs37d5.1-MT.bed
    truth: GRCh37.HG001
  - reference: hs37d5.fa
    reads: /data/BGISEQ-500/bam/NA12878.PE150-2.BGISEQ-500.bwa-mem.b37.bam
    regions: hs37d5.1-MT.bed
    truth: GRCh37.HG001
    options: --sequence-error-model=PCR
  - reference: hs38DH.fa
    reads: /data/bs/project/116384274/NA12878-PCRF450-1_ds.90038050736149d285f6965bb5dffba4/NA12878-PCRF450-1.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.HG001
    options: --ignore-unmapped-contigs
  - reference: hs38DH.fa
    reads: /data/bs/project/116384274/NA12878-PCRF450-2_ds.bc83ce4ddeec45809bd344ddd3dbda7e/NA12878-PCRF450-2.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.HG001
    options: --ignore-unmapped-contigs
  - reference: hs38DH.fa
    reads: /data/novaseq/bam/NA12878.PE150.NovaSeq.bwa.hs38DH.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.HG001
    options: --sequence-error-model=PCR
  - reference: hs37d5.fa
    reads: /data/platinum/bam/NA12878.platinum.bwa-mem.b37.bam
    regions: hs37d5.1-MT.bed
    truth: GRCh37.HG001
  - reference: hs37d5.fa
    reads: /data/garvan/bam/NA12878.xTen.bwa-mem.b37.bam
    regions: hs37d5.1-MT.bed
    truth: GRCh37.HG001
    options: --sequence-error-model=PCR
  - reference: hs37d5.fa
    reads: /data/pFDA/bam/NA12878.precisionFDA.novoalign.hs37d5.bam
    regions: hs37d5.1-MT.bed
    truth: GRCh37.HG001
  # GRCh37 HG002 and HG005.
  - reference: hs37d5.fa
    reads: /data/GIAB/bam/HG002_Sample_2D2_2F1_2F2_2L1_2L2.bam
    regions: hs37d5.1-MT.bed
    truth: GRCh37.HG002
  - reference: hs37d5.fa
    reads: /data/GIAB/bam/HG005.GIAB.b37.novoalign.bam
    regions: hs37d5.1-MT.bed
    truth: GRCh37.HG005
  # CHM1/CHM13 (SynDip truth), alone and jointly with HG002.
  - reference: hs37d5.fa
    reads: /data/CHM/bam/CHM1_CHM13_2.bam
    regions: hs37d5.1-MT.bed
    truth: GRCh37.SynDip
    options: --ignore-unmapped-contigs
  - reference: hs38DH.fa
    reads: /data/CHM/bam/CHM1_CHM13.Broad_3.XTen.bwa.hs38DH.bam
    regions: hs38DH.chr1-M.bed
    truth: GRCh38.SynDip
  - reference: /data/references/hs38DH.fa
    reads:
      - /data/GIAB/bam/HG002.GRCh38.60x.bam
      - /data/CHM/bam/CHM1_CHM13.Broad_3.XTen.bwa.hs38DH.bam
    regions: hs38DH.chr1-M.bed
    truth:
      HG002: GRCh38.HG002
      CHM1_CHM13: GRCh38.SynDip
    options: --ignore-unmapped-contigs
  # NA12878 on hg19 (10X) and exome data.
  - reference: hg19.fa
    reads: /data/10X/bam/NA12878_WGS_v2_phased_possorted_bam.bam
    regions: hg19.chr1-M.bed
    truth: hg19.HG001
    options: --sequence-error-model=PCR --ignore-unmapped-contigs
  - reference: hs37d5.fa
    reads: /data/GIAB/bam/NA12878.garvan.exome.NIST7035.trimmed.bwa.hs37d5.bam
    regions: hs37d5.1-MT.bed
    truth: GRCh37.HG001.nexterarapidcapture_expandedexome
    options: --sequence-error-model=PCR
  # Two-sample runs in which only one sample has a truth set listed.
  - reference: hs37d5.fa
    reads:
      - /data/1000G/bam/HG02058.mapped.ILLUMINA.bwa.KHV.low_coverage.20120522.bam
      - /data/1000G/bam/NA12878.mapped.ILLUMINA.bwa.CEU.low_coverage.20121211.bam
    regions: hs37d5.1-MT.bed
    truth:
      NA12878: GRCh37.HG001
  - reference: hs37d5.fa
    reads:
      - /data/syntumour/bam/NA12878.NORMAL.30x.bwa-mem.b37.bam
      - /data/syntumour/bam/NA12878.BRCA.60x.bwa-mem.b37.bam
    regions: hs37d5.1-MT.bed
    truth:
      NA12878.NORMAL: GRCh37.HG001
    options: --normal-sample=NA12878.NORMAL
# Forest training settings.
training:
  # Only this fraction (25%) of the examples is actually used to train the
  # forest.
  training_fraction: 0.25
  # NOTE(review): nesting of `hyperparameters` under `training` is
  # reconstructed from a flattened paste - confirm against the consumer.
  hyperparameters:
    - trees: 200
      min_node_size: 20
Note that only 25% of the examples (`training_fraction: 0.25`) are actually used to train the forest. Let me know if you'd like more details.
Hi,
I was wondering which samples were used to train the random forest model (germline) in version 0.7.4.
Thanks