Closed nicolerg closed 5 years ago
You can simply ignore raw peak stats. We will hide it from the report in the next release v1.1.6 (commit e68b6b85153b113fb34ca3a0ddf62ede25c5bd0e). It's redundant.
Can you also include both IDR and naive peak set stats in the JSON?
Yes good idea. We should do that in the JSON
On Thu, Jan 10, 2019, 2:20 PM nicolerg <notifications@github.com wrote:
Can you also include both IDR and naive peak set stats in the JSON?
— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/ENCODE-DCC/atac-seq-pipeline/issues/74#issuecomment-453219919, or mute the thread https://github.com/notifications/unsubscribe-auth/AAI7EepiTCrXcuu5QtKRJrgi2E21NlkNks5vB5JygaJpZM4Z6Hrc .
Hi @leepc12, was this issue fixed across all implementations of the pipeline? I no longer get erroneous raw peak ataqc stats when I run the pipeline locally with conda, but a collaborator who ran the v1.1.7 pipeline on GCP has the same bug in her JSON report.
Yes, this is already fixed in v1.1.6.
Example qc.json
from v1.1.6 run:
https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/test/test_workflow/ref_output/v1.1.6/ENCSR356KRQ_subsampled/qc.json#L427
Can you upload her qc.json
file?
Just this line:
"Raw peaks": [
74269,
"OK"
],
{
"general": {
"date": "2019-03-20 10:11:14",
"pipeline_ver": "v1.1.7",
"peak_caller": "macs2",
"genome": "rn6.tsv",
"description": "ATAC-seq on MoTrPAC",
"title": "20180725_5_Liver_002_powder_S4",
"paired_end": [
true
]
},
"flagstat_qc": {
"rep1": {
"total": 92694432,
"total_qc_failed": 0,
"duplicates": 0,
"duplicates_qc_failed": 0,
"mapped": 92123522,
"mapped_qc_failed": 0,
"mapped_pct": 99.38,
"paired": 51743818,
"paired_qc_failed": 0,
"read1": 25871909,
"read1_qc_failed": 0,
"read2": 25871909,
"read2_qc_failed": 0,
"paired_properly": 50741198,
"paired_properly_qc_failed": 0,
"paired_properly_pct": 98.06,
"with_itself": 51103330,
"with_itself_qc_failed": 0,
"singletons": 69578,
"singletons_qc_failed": 0,
"singletons_pct": 0.13,
"diff_chroms": 78236,
"diff_chroms_qc_failed": 0
}
},
"dup_qc": {
"rep1": {
"unpaired_reads": 0,
"paired_reads": 20329998,
"unmapped_reads": 0,
"unpaired_dupes": 0,
"paired_dupes": 6863253,
"paired_opt_dupes": 6353,
"dupes_pct": 0.337592
}
},
"pbc_qc": {
"rep1": {
"total_read_pairs": 20329998,
"distinct_read_pairs": 13483546,
"one_read_pair": 10166059,
"two_read_pair": 2492700,
"NRF": 0.663234,
"PBC1": 0.75396,
"PBC2": 4.078332
}
},
"nodup_flagstat_qc": {
"rep1": {
"total": 26933490,
"total_qc_failed": 0,
"duplicates": 0,
"duplicates_qc_failed": 0,
"mapped": 26933490,
"mapped_qc_failed": 0,
"mapped_pct": 100.0,
"paired": 26933490,
"paired_qc_failed": 0,
"read1": 13466745,
"read1_qc_failed": 0,
"read2": 13466745,
"read2_qc_failed": 0,
"paired_properly": 26933490,
"paired_properly_qc_failed": 0,
"paired_properly_pct": 100.0,
"with_itself": 26933490,
"with_itself_qc_failed": 0,
"singletons": 0,
"singletons_qc_failed": 0,
"singletons_pct": 0.0,
"diff_chroms": 0,
"diff_chroms_qc_failed": 0
}
},
"overlap_reproducibility_qc": {
"Nt": 0,
"N1": 127461,
"Np": 0,
"N_opt": 127461,
"N_consv": 127461,
"opt_set": "rep1-pr",
"consv_set": "rep1-pr",
"rescue_ratio": 0.0,
"self_consistency_ratio": 1.0,
"reproducibility": "pass"
},
"idr_reproducibility_qc": {
"Nt": 0,
"N1": 74269,
"Np": 0,
"N_opt": 74269,
"N_consv": 74269,
"opt_set": "rep1-pr",
"consv_set": "rep1-pr",
"rescue_ratio": 0.0,
"self_consistency_ratio": 1.0,
"reproducibility": "pass"
},
"frip_macs2_qc": {
"rep1": {
"FRiP": 0.312998501123
},
"rep1-pr1": {
"FRiP": 0.317969240676
},
"rep1-pr2": {
"FRiP": 0.322046516961
}
},
"overlap_frip_qc": {
"rep1-pr": {
"FRiP": 0.241691255014
}
},
"idr_frip_qc": {
"rep1-pr": {
"FRiP": 0.190524844719
}
},
"ataqc": {
"rep1": {
"Genome": "Rattus_norvegicus.Rnor_6.0.dna.toplevel.fa.gz",
"Paired/Single-ended": "Paired-ended",
"Read length": 75,
"Read count from sequencer": 92694432,
"Read count successfully aligned": 92123522,
"Read count after filtering for mapping quality": 59637527,
"Read count after removing duplicate reads": 52774274,
"Read count after removing mitochondrial reads (final read count)": 26933490,
"NRF": [
0.663234,
"out of range [0.8, inf]"
],
"PBC1": [
0.75396,
"out of range [0.8, inf]"
],
"PBC2": [
4.078332,
"OK"
],
"picard_est_library_size": 31429662,
"Fraction of reads in NFR": [
0.499886971921,
"OK"
],
"NFR / mono-nuc reads": [
1.53787093978,
"out of range [2.5, inf]"
],
"Presence of NFR peak": "OK",
"Presence of Mono-Nuc peak": "OK",
"Presence of Di-Nuc peak": "OK",
"Raw peaks": [
74269,
"OK"
],
"Naive overlap peaks": [
127461,
"OK"
],
"IDR peaks": [
74269,
"OK"
],
"Naive peak stats: Min size": 150.0,
"Naive peak stats: 25 percentile": 325.0,
"Naive peak stats: 50 percentile (median)": 495.0,
"Naive peak stats: 75 percentile": 760.0,
"Naive peak stats: Max size": 4715.0,
"Naive peak stats: Mean": 577.525611756,
"IDR peak stats: Min size": 150.0,
"IDR peak stats: 25 percentile": 441.0,
"IDR peak stats: 50 percentile (median)": 651.0,
"IDR peak stats: 75 percentile": 913.0,
"IDR peak stats: Max size": 4715.0,
"IDR peak stats: Mean": 706.594002881,
"TSS_enrichment": 6.25728227255
}
}
}
In the example you linked to, why is the number of raw peaks less than the number of IDR peaks?
Thanks for reporting this. The term Raw peaks
is internally equal to "IDR peaks" in the ataqc module. We will remove it in the next release or hotfix.
It looks like IDR peaks are being used to generate raw peak statistics. The peak statistics for the raw peaks and IDR peaks are identical in the HTML report, and the reported numbers of raw peaks and IDR peaks are the same. Screenshots from the HTML report for one sample are below.
There is one set of peak statistics given at the end of the
ataqc
section of the QC JSON, but it does not specify which peak set those statistics are for. Peak statistics for Naive overlap peaks and IDR peaks are not explicitly given in the JSON. See snippet below.OS/Platform and dependencies