Not really an issue, just showing some performance metrics of hifiasm with a simulated diploid assembly of the T2T chr8 sequence, with simulated PacBio HiFi reads. Used default hifiasm settings except one more round of error correction and -l3 purging.
# Fetch the T2T chr8 assembly (GenBank GCA_016894425.1).
asm=GCA_016894425.1_ASM1689442v1_genomic.fna
wget "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/016/894/425/GCA_016894425.1_ASM1689442v1/${asm}.gz"
# Soft-masked (lower-case) bases are converted to upper case with `seqtk seq -U`;
# the result is re-compressed next to the download.
"$HOME/bin/seqtk/seqtk" seq -U "${asm}.gz" \
  | gzip > "${asm}_upper.gz"
# Turn the haploid reference into a diploid one with roughly 2% heterozygosity
# (substitution rate 0.0192 plus indel rate 0.001) using BBTools mutate.sh.
# The introduced variants are written to a VCF; stderr is kept as a log file.
upper=GCA_016894425.1_ASM1689442v1_genomic.fna_upper
"$HOME/bin/bbmap-38.90/mutate.sh" \
  in="${upper}.gz" \
  ow=t \
  vcf="${upper}.diploid.vcf.gz" \
  out="${upper}.diploid.fasta.gz" \
  ploidy=2 \
  subrate=0.0192 \
  indelrate=0.001 \
  maxindel=20 \
  nohomopolymers=t \
  hetrate=1 \
  2> "${upper}.diploid.fasta.log.txt"
# genome size: count sequence characters only.
# Reads the upper-cased reference written by the seqtk step
# (..._genomic.fna_upper.gz); header lines are dropped by grep and newlines
# are stripped by tr so that wc -m reports bases rather than bases + line ends.
gunzip -c GCA_016894425.1_ASM1689442v1_genomic.fna_upper.gz \
  | grep -v ">" \
  | tr -d '\n' \
  | wc -m
# 146259671
# 2% heterozygosity is how many bases
# (awk is used because the shell has no floating-point arithmetic)
awk 'BEGIN { printf "%.2f\n", 0.02 * 146259671 }'
# 2925193.42
# How many variants did mutate.sh introduce?
# Every non-header line (not starting with '#') of the VCF is one record.
het_vcf=GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.vcf.gz
gunzip -c "${het_vcf}" \
  | grep -v "^#" \
  | wc -l
# 2942229
# i.e. a het rate of ~2%
Note: I originally wrote Q30-Q40 here, but the next command actually produces Q20-Q30 reads.
# Simulate PacBio HiFi-like reads from the diploid reference with BBTools
# randomreads.sh: 15x per haplotype, error rate 0.001-0.01 (Q30-Q20),
# read lengths 9000-12000 bases centred on 10000. Fixed seed for repeatability.
diploid_ref=GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.fasta.gz
"$HOME/bin/bbmap-38.90/randomreads.sh" \
  ow=t \
  seed=1 \
  ref="${diploid_ref}" \
  illuminanames=t \
  addslash=t \
  pacbio=t \
  pbmin=0.001 \
  pbmax=0.01 \
  coverage=15 \
  paired=f \
  gaussianlength=t \
  minlength=9000 \
  midlength=10000 \
  maxlength=12000 \
  out=hifi-30x.fasta.gz
# Assemble the simulated reads with hifiasm 0.15.1-r331.
# Settings are the defaults except -r 4 (one extra round of error correction)
# and -l3 purging; outputs are prefixed with "hifi-30x.fasta".
"$HOME/bin/hifiasm-new/hifiasm" \
  -r 4 \
  -l3 \
  -t 32 \
  -o hifi-30x.fasta \
  hifi-30x.fasta.gz
# Haplotype assembly statistics.
# Tools: gfatools (Version: 0.4-r214-dirty) converts the GFA to FASTA on stdout,
# which is piped straight into BBStats.sh from BBMAP 38.90.
"$HOME/bin/gfatools/gfatools" gfa2fa hifi-30x.fasta.bp.hap1.p_ctg.gfa \
  | "$HOME/bin/bbmap-38.90/bbstats.sh" in=stdin
#A C G T N IUPAC Other GC GC_stdev
#0.2985 0.2017 0.2020 0.2978 0.0000 0.0000 0.0000 0.4037 0.0249
#Main genome scaffold total: 5
#Main genome contig total: 5
#Main genome scaffold sequence total: 146.263 MB
#Main genome contig sequence total: 146.263 MB 0.000% gap
#Main genome scaffold N/L50: 2/42.083 MB
#Main genome contig N/L50: 2/42.083 MB
#Main genome scaffold N/L90: 5/15.892 MB
#Main genome contig N/L90: 5/15.892 MB
#Max scaffold length: 52.495 MB
#Max contig length: 52.495 MB
#Number of scaffolds > 50 KB: 5
#% main genome in scaffolds > 50 KB: 100.00%
#Minimum Number Number Total Total Scaffold
#Scaffold of of Scaffold Contig Contig
#Length Scaffolds Contigs Length Length Coverage
#-------- -------------- -------------- -------------- -------------- --------
# All 5 5 146,263,292 146,263,292 100.00%
# 25 KB 5 5 146,263,292 146,263,292 100.00%
# 50 KB 5 5 146,263,292 146,263,292 100.00%
# 100 KB 5 5 146,263,292 146,263,292 100.00%
# 250 KB 5 5 146,263,292 146,263,292 100.00%
# 500 KB 5 5 146,263,292 146,263,292 100.00%
# 1 MB 5 5 146,263,292 146,263,292 100.00%
# 2.5 MB 5 5 146,263,292 146,263,292 100.00%
# 5 MB 5 5 146,263,292 146,263,292 100.00%
# 10 MB 5 5 146,263,292 146,263,292 100.00%
# 25 MB 2 2 94,577,088 94,577,088 100.00%
# 50 MB 1 1 52,494,515 52,494,515 100.00%
# Same statistics for hifiasm's second haplotype assembly.
"$HOME/bin/gfatools/gfatools" gfa2fa hifi-30x.fasta.bp.hap2.p_ctg.gfa \
  | "$HOME/bin/bbmap-38.90/bbstats.sh" in=stdin
#A C G T N IUPAC Other GC GC_stdev
#0.2982 0.2016 0.2021 0.2981 0.0000 0.0000 0.0000 0.4037 0.0000
#Main genome scaffold total: 1
#Main genome contig total: 1
#Main genome scaffold sequence total: 146.265 MB
#Main genome contig sequence total: 146.265 MB 0.000% gap
#Main genome scaffold N/L50: 1/146.265 MB
#Main genome contig N/L50: 1/146.265 MB
#Main genome scaffold N/L90: 1/146.265 MB
#Main genome contig N/L90: 1/146.265 MB
#Max scaffold length: 146.265 MB
#Max contig length: 146.265 MB
#Number of scaffolds > 50 KB: 1
#% main genome in scaffolds > 50 KB: 100.00%
#Minimum Number Number Total Total Scaffold
#Scaffold of of Scaffold Contig Contig
#Length Scaffolds Contigs Length Length Coverage
#-------- -------------- -------------- -------------- -------------- --------
# All 1 1 146,265,156 146,265,156 100.00%
# 25 KB 1 1 146,265,156 146,265,156 100.00%
# 50 KB 1 1 146,265,156 146,265,156 100.00%
# 100 KB 1 1 146,265,156 146,265,156 100.00%
# 250 KB 1 1 146,265,156 146,265,156 100.00%
# 500 KB 1 1 146,265,156 146,265,156 100.00%
# 1 MB 1 1 146,265,156 146,265,156 100.00%
# 2.5 MB 1 1 146,265,156 146,265,156 100.00%
# 5 MB 1 1 146,265,156 146,265,156 100.00%
# 10 MB 1 1 146,265,156 146,265,156 100.00%
# 25 MB 1 1 146,265,156 146,265,156 100.00%
# 50 MB 1 1 146,265,156 146,265,156 100.00%
# 100 MB 1 1 146,265,156 146,265,156 100.00%
# Split the simulated diploid FASTA (from mutate.sh 38.90) into its two haplotypes.
# `seqtk seq -l0` is used so that each record occupies exactly two lines
# (header + sequence); the split below relies on that layout.
diploid=GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid
# haplotype 0: the first record (lines 1-2)
"$HOME/bin/seqtk/seqtk" seq -l0 "${diploid}.fasta.gz" \
  | head -n 2 > "${diploid}.hap0.fasta"
# haplotype 1: everything from line 3 onwards
"$HOME/bin/seqtk/seqtk" seq -l0 "${diploid}.fasta.gz" \
  | tail -n +3 > "${diploid}.hap1.fasta"
# Write hifiasm's haplotype 2 contigs out as FASTA; the file name records
# that the assembly used default settings except for -r 4.
hap2_gfa=hifi-30x.fasta.bp.hap2.p_ctg.gfa
hap2_fa=hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa
"$HOME/bin/gfatools/gfatools" gfa2fa "${hap2_gfa}" > "${hap2_fa}"
# DipCall 0.2 https://github.com/lh3/dipcall/releases/tag/v0.2
# NOTE: line 100 of dipcall-aux.js had to be removed first,
# see https://github.com/lh3/dipcall/issues/1
#
# Step 1: generate the makefile. The hifiasm hap2 FASTA is passed twice
# (first and second argument), followed by the two true simulated haplotypes.
# NOTE(review): per dipcall usage the first argument is presumably the output
# prefix and the second the reference - confirm against the dipcall README.
hap2_asm=hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa
truth=GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid
dipcall.kit/run-dipcall \
  "${hap2_asm}" \
  "${hap2_asm}" \
  "${truth}.hap0.fasta" \
  "${truth}.hap1.fasta" \
  > "${hap2_asm}.mk"
# Step 2: run DipCall via the generated makefile (single job), logging
# stdout and stderr together.
make -j 1 -f "${hap2_asm}.mk" \
  > "${hap2_asm}.mk.log" 2>&1
# dipsum: summarise the DipCall result from its confident-region BED and VCF
dip_prefix=hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa.dip
dipcall.kit/k8 dipcall.kit/dipcall-aux2.js dipsum \
  "${dip_prefix}.bed" \
  "${dip_prefix}.vcf.gz"
#Length of confident regions: 146265156
# Hom SNP: 29
# Hom INS: 32
# Hom DEL: 58
# Het SNP: 2804524
# Het INS: 67702
# Het DEL: 68184
# Het mixed: 3
#SNP heterozygosity: 0.019174
#Variant heterozygosity: 0.020103
# There should be 2942229 total_Het_variants (SNPs and indels).
# There are 2940413 dipcall_Het_variants, missing 1816 (total_het_variants-dipcall_Het_variants=1816).
# There are 119 Hom variants (errors in hifiasm assembly?).
# Phred quality score for missing Het variants = -10*LOG10(1816/146265156)=49.060
# Phred quality score for wrong? Hom variants = -10*LOG10(119/146265156)=60.896
Not really an issue, just showing some performance metrics of hifiasm with a simulated diploid assembly of the T2T chr8 sequence, with simulated PacBio HiFi reads. Used default hifiasm settings except one more round of error correction and -l3 purging.
Originally wrote Q30-Q40 when actually the read-simulation command makes Q20-Q30 reads.
added trio evaluation