Closed mbhall88 closed 3 years ago
I have investigated this in the below notebook
import pysam
from typing import List, Set, Dict, Tuple, Optional, NamedTuple
from pathlib import Path
from collections import defaultdict, Counter
from cyvcf2 import VCF, Variant
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("ggplot")
N = "N"
def is_diff(c1: str, c2: str) -> bool:
return c1 != c2 and all(c != N for c in [c1, c2])
def diff_idxs(s1: str, s2: str) -> Set[int]:
assert len(s1) == len(s2)
return {i + 1 for i in range(len(s1)) if is_diff(s1[i], s2[i])}
class Genotype(NamedTuple):
allele1: int
allele2: int
def is_null(self) -> bool:
"""Is the genotype null. i.e. ./."""
return self.allele1 == -1 and self.allele2 == -1
def is_hom(self) -> bool:
"""Is the genotype homozygous"""
if self.is_null():
return False
if self.allele1 == -1 or self.allele2 == -1:
return True
return self.allele1 == self.allele2
def is_het(self) -> bool:
"""Is the genotype heterozyhous"""
return not self.is_null() and not self.is_hom()
def is_hom_ref(self) -> bool:
"""Is genotype homozygous reference?"""
return self.is_hom() and (self.allele1 == 0 or self.allele2 == 0)
def is_hom_alt(self) -> bool:
"""Is genotype homozygous alternate?"""
return self.is_hom() and (self.allele1 > 0 or self.allele2 > 0)
def alt_index(self) -> Optional[int]:
"""If the genotype is homozygous alternate, returns the 0-based index of the
alt allele in the alternate allele array.
"""
if not self.is_hom_alt():
return None
return max(self.allele1, self.allele2) - 1
@staticmethod
def from_arr(arr: List[int]) -> "Genotype":
alleles = [a for a in arr if type(a) is int]
if len(alleles) < 2:
alleles.append(-1)
return Genotype(*alleles)
REF = "mada_1-11"
We get the positions that differ between REF
and the other samples. If either position has an N
then we consider them not different.
pandora_reffa = pysam.FastaFile(f"pandora/{REF}.consensus.fa")
pandora_refseq = pandora_reffa.fetch(REF)
pandora_fasta_files = [p for p in Path("pandora").glob("*.fa") if REF not in p.name]
pandora_diff_positions: Dict[str, Set[int]] = dict()
for file in pandora_fasta_files:
sample = file.name.split(".")[0]
seq = pysam.FastaFile(file).fetch(sample)
pandora_diff_positions[sample] = diff_idxs(pandora_refseq, seq)
for k, v in pandora_diff_positions.items():
print(f"{k}: {len(v)} dist.")
mada_1-28: 157 dist.
mada_115: 155 dist.
mada_2-42: 149 dist.
mada_142: 147 dist.
mada_125: 157 dist.
mada_1-8: 159 dist.
Now that we have the positions, we gather the VCF entries for those positions
pandora_refvcf = VCF(f"pandora/{REF}.snps.filtered.bcf")
pandora_vcf_files = [p for p in Path("pandora").glob("*.bcf") if REF not in p.name]
pandora_diff_vars: Dict[str, List[Tuple[Optional[Variant], Optional[Variant]]]] = defaultdict(list)
chrom = pandora_refvcf.seqnames[0]
for file in pandora_vcf_files:
sample = file.name.split(".")[0]
idxs = pandora_diff_positions[sample]
vcf = VCF(file)
for i in idxs:
region = f"{chrom}:{i}-{i}"
try:
v = next(vcf(region))
except StopIteration:
v = None
try:
rv = next(pandora_refvcf(region))
except StopIteration:
rv = None
pandora_diff_vars[sample].append((v, rv))
Get compass "truth" positions
compass_reffa = pysam.FastaFile(f"compass/{REF}.consensus.fa")
compass_refseq = compass_reffa.fetch(REF)
compass_fasta_files = [p for p in Path("compass").glob("*.fa") if REF not in p.name]
compass_diff_positions: Dict[str, Set[int]] = dict()
for file in compass_fasta_files:
sample = file.name.split(".")[0]
seq = pysam.FastaFile(file).fetch(sample)
compass_diff_positions[sample] = diff_idxs(compass_refseq, seq)
for k, v in compass_diff_positions.items():
print(f"{k}: {len(v)} dist.")
mada_1-28: 9 dist.
mada_115: 6 dist.
mada_2-42: 15 dist.
mada_142: 8 dist.
mada_125: 8 dist.
mada_1-8: 12 dist.
Get compass "truth" difference variants
compass_refvcf = VCF(f"compass/{REF}.compass.bcf")
compass_vcf_files = [p for p in Path("compass").glob("*.bcf") if REF not in p.name]
compass_diff_vars: Dict[str, List[Tuple[Optional[Variant], Optional[Variant]]]] = defaultdict(list)
chrom = compass_refvcf.seqnames[0]
for file in compass_vcf_files:
sample = file.name.split(".")[0]
idxs = compass_diff_positions[sample]
vcf = VCF(file)
for i in idxs:
region = f"{chrom}:{i}-{i}"
try:
v = next(vcf(region))
except StopIteration:
v = None
try:
rv = next(compass_refvcf(region))
except StopIteration:
rv = None
compass_diff_vars[sample].append((v, rv))
Before we go any further, lets take a look at the distance between the pandora and compass consensus sequnces for REF
ref_diff_idxs = diff_idxs(pandora_refseq, compass_refseq)
print(f"{REF} pandora x compass dist: {len(ref_diff_idxs)}")
mada_1-11 pandora x compass dist: 79
Lets see how many of those positions are in the differences between REF
and the other samples
for sample, diff_positions in pandora_diff_positions.items():
in_both = diff_positions & ref_diff_idxs
print(f"{sample}: {len(in_both)/len(diff_positions)*100:.2f}% of positions are in the differences between pandora and compass")
mada_1-28: 4.46% of positions are in the differences between pandora and compass
mada_115: 3.87% of positions are in the differences between pandora and compass
mada_2-42: 3.36% of positions are in the differences between pandora and compass
mada_142: 0.68% of positions are in the differences between pandora and compass
mada_125: 3.18% of positions are in the differences between pandora and compass
mada_1-8: 3.77% of positions are in the differences between pandora and compass
Lets also look at what the number of differences between pandora and compass for the other samples
caller_diff_positions: Dict[str, Set[int]] = dict()
for sample in compass_diff_vars:
pseq = pysam.FastaFile(f"pandora/{sample}.consensus.fa").fetch(sample)
cseq = pysam.FastaFile(f"compass/{sample}.consensus.fa").fetch(sample)
caller_diff_positions[sample] = diff_idxs(cseq, pseq)
for k, v in caller_diff_positions.items():
print(f"{k}: {len(v)} dist.")
mada_2-42: 93 dist.
mada_115: 93 dist.
mada_1-8: 75 dist.
mada_1-28: 92 dist.
mada_142: 85 dist.
mada_125: 89 dist.
Ok, so the difference of 79 between mada_1-11
for the two callers seems "reasonable"
Lets take a look at how many of the compass differences pandora called.
for sample, cposs in compass_diff_positions.items():
pposs = pandora_diff_positions[sample]
in_both = cposs & pposs
perc = len(in_both)/len(cposs)
print(f"{sample}: {len(in_both)}/{len(cposs)} ({perc:.2%}) truth positions called by pandora")
mada_1-28: 7/9 (77.78%) truth positions called by pandora
mada_115: 5/6 (83.33%) truth positions called by pandora
mada_2-42: 11/15 (73.33%) truth positions called by pandora
mada_142: 6/8 (75.00%) truth positions called by pandora
mada_125: 5/8 (62.50%) truth positions called by pandora
mada_1-8: 7/12 (58.33%) truth positions called by pandora
Organise the difference positions into classification categorys:
pos_data = []
for sample, pposs in pandora_diff_positions.items():
cposs = compass_diff_positions[sample]
tps = cposs & pposs
fps = pposs - cposs
fns = cposs - pposs
for p in tps:
pos_data.append((sample, p, "TP"))
for p in fps:
pos_data.append((sample, p, "FP"))
for p in fns:
pos_data.append((sample, p, "FN"))
df = pd.DataFrame(pos_data, columns=["sample", "pos", "classification"])
How many of the false positives are shared across multiple pairs?
fp_df = df.query("classification == 'FP'")
sizes = fp_df.groupby("pos").size()
num_fps = len(sizes)
for i in range(len(pandora_diff_positions)):
l = len(sizes.where(sizes == i+1).dropna())
print(f"{l} ({l/num_fps:.2%}) FP positions occur in {i+1} pairs")
21 (11.86%) FP positions occur in 1 pairs
4 (2.26%) FP positions occur in 2 pairs
8 (4.52%) FP positions occur in 3 pairs
5 (2.82%) FP positions occur in 4 pairs
24 (13.56%) FP positions occur in 5 pairs
115 (64.97%) FP positions occur in 6 pairs
fps_shared_all = set(sizes.where(sizes == 6).dropna().index)
How many of the false positive positions are missing from the mada_1-11
pandora VCF? i.e. we assume REF in the consensus sequence for mada_1-11
.
def pos_in_vcf(pos: int, vcf: VCF, chrom: str) -> bool:
return next(vcf(f"{chrom}:{pos}-{pos}"), None) is not None
fp_poss = set(fp_df["pos"])
fps_in_ref = set()
fps_not_in_ref = set()
for pos in fp_poss:
if pos_in_vcf(pos, pandora_refvcf, chrom):
fps_in_ref.add(pos)
else:
fps_not_in_ref.add(pos)
print(f"{len(fps_in_ref)} FPs are in {REF}")
print(f"{len(fps_not_in_ref)} FPs are not in {REF}")
81 FPs are in mada_1-11
96 FPs are not in mada_1-11
for sample, variants in pandora_diff_vars.items():
missing = 0
n = 0
for v, rv in variants:
pos = v.POS if v is not None else rv.POS
if pos not in fp_poss:
continue
if rv is None or v is None:
missing += 1
n += 1
print(f"{sample}: {missing}/{n} ({missing/n:.2%}) FPs are due to one sample missing a position")
mada_1-28: 84/148 (56.76%) FPs are due to one sample missing a position
mada_125: 87/151 (57.62%) FPs are due to one sample missing a position
mada_115: 84/149 (56.38%) FPs are due to one sample missing a position
mada_1-8: 85/150 (56.67%) FPs are due to one sample missing a position
mada_2-42: 73/137 (53.28%) FPs are due to one sample missing a position
mada_142: 75/140 (53.57%) FPs are due to one sample missing a position
What do we say at false negative positions?
fn_df = df.query("classification == 'FN'")
for idx, row in fn_df.iterrows():
vcf = VCF(f"pandora/{row['sample']}.snps.filtered.bcf")
v = next(vcf(f"{chrom}:{row.pos}:{row.pos}"), None)
rv = next(pandora_refvcf(f"{chrom}:{row.pos}:{row.pos}"), None)
print(f"{sample} x {REF} FN at position {row.pos}")
print(f"{sample}: {str(v)}")
print(f"{REF}: {str(rv)}")
mada_142 x mada_1-11 FN at position 3901414
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 499855
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 499855
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 3970728
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 499855
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 3794956
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 3241591
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 621331
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 499855
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 4400515
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 1663869
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 499855
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 2895842
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 3863814
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 3115848
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 1102162
mada_142: None
mada_1-11: None
mada_142 x mada_1-11 FN at position 3751604
mada_142: None
mada_1-11: None
no intervals found for b'pandora/mada_1-28.snps.filtered.bcf' at NC_000962.3:3901414:3901414
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:3901414:3901414
no intervals found for b'pandora/mada_1-28.snps.filtered.bcf' at NC_000962.3:499855:499855
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:499855:499855
no intervals found for b'pandora/mada_115.snps.filtered.bcf' at NC_000962.3:499855:499855
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:499855:499855
no intervals found for b'pandora/mada_2-42.snps.filtered.bcf' at NC_000962.3:3970728:3970728
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:3970728:3970728
no intervals found for b'pandora/mada_2-42.snps.filtered.bcf' at NC_000962.3:499855:499855
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:499855:499855
no intervals found for b'pandora/mada_2-42.snps.filtered.bcf' at NC_000962.3:3794956:3794956
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:3794956:3794956
no intervals found for b'pandora/mada_2-42.snps.filtered.bcf' at NC_000962.3:3241591:3241591
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:3241591:3241591
no intervals found for b'pandora/mada_142.snps.filtered.bcf' at NC_000962.3:621331:621331
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:621331:621331
no intervals found for b'pandora/mada_142.snps.filtered.bcf' at NC_000962.3:499855:499855
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:499855:499855
no intervals found for b'pandora/mada_125.snps.filtered.bcf' at NC_000962.3:4400515:4400515
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:4400515:4400515
no intervals found for b'pandora/mada_125.snps.filtered.bcf' at NC_000962.3:1663869:1663869
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:1663869:1663869
no intervals found for b'pandora/mada_125.snps.filtered.bcf' at NC_000962.3:499855:499855
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:499855:499855
no intervals found for b'pandora/mada_1-8.snps.filtered.bcf' at NC_000962.3:2895842:2895842
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:2895842:2895842
no intervals found for b'pandora/mada_1-8.snps.filtered.bcf' at NC_000962.3:3863814:3863814
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:3863814:3863814
no intervals found for b'pandora/mada_1-8.snps.filtered.bcf' at NC_000962.3:3115848:3115848
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:3115848:3115848
no intervals found for b'pandora/mada_1-8.snps.filtered.bcf' at NC_000962.3:1102162:1102162
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:1102162:1102162
no intervals found for b'pandora/mada_1-8.snps.filtered.bcf' at NC_000962.3:3751604:3751604
no intervals found for b'pandora/mada_1-11.snps.filtered.bcf' at NC_000962.3:3751604:3751604
So we can see that all FN positions did not occur in any of our VCFs.
What does compass say at the false positive positions?
pandora_fp_bases = []
compass_fp_bases = []
for _, row in fp_df.iterrows():
pos = row["pos"]
sample = row["sample"]
c = pysam.FastaFile(f"pandora/{sample}.consensus.fa").fetch(sample)[pos-1]
refc = pandora_refseq[pos-1]
# print(f"{pos}:{sample} pandora: {c}\t{refc}")
pandora_fp_bases.append((c, refc, pos, sample))
c = pysam.FastaFile(f"compass/{sample}.consensus.fa").fetch(sample)[pos-1]
refc = compass_refseq[pos-1]
# print(f"{pos}:{sample} compass: {c}\t{refc}")
compass_fp_bases.append((c, refc))
ns = 0
for i in range(len(pandora_fp_bases)):
if N in compass_fp_bases[i]:
ns += 1
perc = ns / len(compass_fp_bases)
print(f"For {ns}/{len(compass_fp_bases)} ({perc:.2%}) FP positions, one of the compass bases is an '{N}'")
For 832/883 (94.22%) FP positions, one of the compass bases is an 'N'
Let's see what the nullified base was due to (i.e. filter, het, missing, or null call) Note: it can't be mask as it would have been masked in pandora too
reasons = []
for i in range(len(pandora_fp_bases)):
_, _, pos, sample = pandora_fp_bases[i]
region = f"{chrom}:{pos}-{pos}"
rv = next(compass_refvcf(region), None)
vcf = VCF(f"compass/{sample}.compass.bcf")
v = next(vcf(region), None)
if v is None or rv is None:
reasons.append("MISSING")
continue
if v.FILTER or rv.FILTER:
reasons.append("FILTER")
continue
gt = Genotype.from_arr(v.genotypes[0])
gt_ref = Genotype.from_arr(rv.genotypes[0])
if gt.is_null() or gt_ref.is_null():
reasons.append("NULL")
continue
if gt.is_het() or gt_ref.is_het():
reasons.append("HET")
continue
if gt.is_hom_ref():
reasons.append("REF")
continue
if gt.is_hom_alt():
reasons.append("ALT")
continue
raise NotImplementedError(i)
for k, count in Counter(reasons).items():
print(f"{k}:\t{count} ({count/len(fp_df):.2%})")
FILTER: 832 (94.22%)
ALT: 38 (4.30%)
REF: 13 (1.47%)
So, 94% FPs are filtered calls in compass...
What are the GT_CONF scores for the FPs?
gtconfs = []
for i in range(len(pandora_fp_bases)):
_, _, pos, sample = pandora_fp_bases[i]
region = f"{chrom}:{pos}-{pos}"
rv = next(pandora_refvcf(region), None)
vcf = VCF(f"pandora/{sample}.snps.filtered.bcf")
v = next(vcf(region), None)
if v is None:
gtconfs.append((rv.format("GT_CONF")[0][0], "FP"))
continue
if rv is None:
gtconfs.append((v.format("GT_CONF")[0][0], "FP"))
continue
gtconfs.append((rv.format("GT_CONF")[0][0], "FP"))
gtconfs.append((v.format("GT_CONF")[0][0], "FP"))
for sample in pandora_diff_positions:
for variant in VCF(f"pandora/{sample}.snps.filtered.bcf"):
if variant.POS in fp_poss or variant.FILTER:
continue
gtconfs.append((variant.format("GT_CONF")[0][0], "OTHER"))
for variant in VCF(f"pandora/{REF}.snps.filtered.bcf"):
if variant.POS in fp_poss or variant.FILTER:
continue
gtconfs.append((variant.format("GT_CONF")[0][0], "OTHER"))
gtdf = pd.DataFrame(gtconfs, columns=["gt_conf", "class"]).query("gt_conf < 700")
fig, ax = plt.subplots(figsize=(13, 8), dpi=300)
sns.histplot(data=gtdf, x="gt_conf", hue="class", bins=100, common_norm=False, stat="density")
_ = ax.set(title="GT_CONF for false positive against all other variants")
In the default approach, when pandora is missing a H37Rv position in the VCF, we assume the reference base when generating the consensus. I tried changing this to use a 'N' instead of assuming reference to see how this would change the outliers.
It does reduce the outliers a bit, but they are still massive outliers (and the rest of the results suck).
Some more information about the FPs. As we saw above, 94% of FP positions are filtered by compass. The filters at those positions are
{'K0.90': 839, 'z': 839, 'f0.35': 1, 'S25': 5, 'n5': 1}
The meanings of these filters are:
K0.90
: Base not called because proportion of the high-quality bases that supports the BCALL base call < 0.90z
: Base not called because SAMtools made an initial heterozygote call (genotype X/Y where X!=Y)f0.35
: Base not called because proportion of the high-quality mapped bases sum(BaseCounts4) < 0.35 of total spanning reads DPS25
: Base not called because min SNP quality less than 25n5
: Base not called because number of high quality bases at this position < 5So they're basically all het calls...
OK so what do the ref/alt counts look like in Pandora? Do less than 90% of reads support our calls there. If yes, such a filter would 'fix this for us (we'd need to check how many true positives the filter would remove). If no, we wonder if longer nanopore reads improve mappability.
Ok, let's see whether filtering based on FRS improves the situation (i.e. reduce FPs without losing too many TPs).
Say the coverage on the called/genotyped allele is c, and the total coverage across all alleles is C. Then FRS - fraction of read support - is c/C. ie what % of reads support the called allele.
def frs(variant: Variant, sample_idx: int = 0) -> float:
fwd_covgs = variant.format(Tags.FwdCovg.value)[sample_idx]
rev_covgs = variant.format(Tags.RevCovg.value)[sample_idx]
total_covg = sum(fwd_covgs) + sum(rev_covgs)
called_idx = Genotype.from_arr(variant.genotypes[sample_idx]).allele_index()
called_covg = fwd_covgs[called_idx] + rev_covgs[called_idx]
try:
return called_covg / total_covg
except ZeroDivisionError:
return 1.0
Let's explore a range of FRS thresholds to see how many FPs and TPs we lose.
3/883 (0.34%) FPs would be filtered with an FRS threshold of 0.5
0/41 (0.00%) TPs would be filtered with an FRS threshold of 0.5
6/883 (0.68%) FPs would be filtered with an FRS threshold of 0.55
1/41 (2.44%) TPs would be filtered with an FRS threshold of 0.55
15/883 (1.70%) FPs would be filtered with an FRS threshold of 0.6
1/41 (2.44%) TPs would be filtered with an FRS threshold of 0.6
49/883 (5.55%) FPs would be filtered with an FRS threshold of 0.65
2/41 (4.88%) TPs would be filtered with an FRS threshold of 0.65
81/883 (9.17%) FPs would be filtered with an FRS threshold of 0.7
5/41 (12.20%) TPs would be filtered with an FRS threshold of 0.7
168/883 (19.03%) FPs would be filtered with an FRS threshold of 0.75
5/41 (12.20%) TPs would be filtered with an FRS threshold of 0.75
230/883 (26.05%) FPs would be filtered with an FRS threshold of 0.8
5/41 (12.20%) TPs would be filtered with an FRS threshold of 0.8
287/883 (32.50%) FPs would be filtered with an FRS threshold of 0.85
6/41 (14.63%) TPs would be filtered with an FRS threshold of 0.85
323/883 (36.58%) FPs would be filtered with an FRS threshold of 0.9
6/41 (14.63%) TPs would be filtered with an FRS threshold of 0.9
329/883 (37.26%) FPs would be filtered with an FRS threshold of 0.91
6/41 (14.63%) TPs would be filtered with an FRS threshold of 0.91
339/883 (38.39%) FPs would be filtered with an FRS threshold of 0.92
6/41 (14.63%) TPs would be filtered with an FRS threshold of 0.92
347/883 (39.30%) FPs would be filtered with an FRS threshold of 0.93
6/41 (14.63%) TPs would be filtered with an FRS threshold of 0.93
370/883 (41.90%) FPs would be filtered with an FRS threshold of 0.94
7/41 (17.07%) TPs would be filtered with an FRS threshold of 0.94
387/883 (43.83%) FPs would be filtered with an FRS threshold of 0.95
9/41 (21.95%) TPs would be filtered with an FRS threshold of 0.95
423/883 (47.90%) FPs would be filtered with an FRS threshold of 0.96
12/41 (29.27%) TPs would be filtered with an FRS threshold of 0.96
468/883 (53.00%) FPs would be filtered with an FRS threshold of 0.97
14/41 (34.15%) TPs would be filtered with an FRS threshold of 0.97
516/883 (58.44%) FPs would be filtered with an FRS threshold of 0.98
17/41 (41.46%) TPs would be filtered with an FRS threshold of 0.98
554/883 (62.74%) FPs would be filtered with an FRS threshold of 0.99
23/41 (56.10%) TPs would be filtered with an FRS threshold of 0.99
882/883 (99.89%) FPs would be filtered with an FRS threshold of 1.0
41/41 (100.00%) TPs would be filtered with an FRS threshold of 1.0
Plots are always better
And absolute numbers
Note the numbers presented here are across the six outlier pairs
Interesting that 839 FPs fail the frs 0.9 filter in COMPASS, but only 323 for Pandora (obviously one is nanopore and other is illumina, so not the same data)
Anyway, great news
https://github.com/mbhall88/head_to_head_pipeline/issues/7#issuecomment-778126222 and the following plots show that for pandora and bcftools there we are overcalling 6 pairs of samples. All 6 have one sample in common:
mada_1-11
.