AfshinLab / BLR

MIT License
5 stars 0 forks source link

Update blr-testdata to v0.5 #27

Closed pontushojer closed 4 years ago

pontushojer commented 4 years ago

Make testdata more reproducible https://github.com/FrickTobias/BLR/issues/207

FASTQs and barcodes are now part of the blr-testdata Snakefile.

FrickTobias commented 4 years ago

Looks great. For future reading I think it is a bit unclear what changes have been made from 0.4.2 to 0.5, so maybe just add a bullet point list of changes. I'm assuming it's essentially what is ticked in the issue, but those lists won't be logged for future backtracking (with time-stamps of what the state was when we merged this PR).

pontushojer commented 4 years ago

Here is a diff on the Snakefile comparing v0.4.2 to v0.5

The main change here is that the source for all testdata BAMs and FASTQs have been changed. Some were missing and others were not that simple to regenerate.

Files updated in this version

Snakefile changes

$ git diff v0.4.2 v0.5 Snakefile
diff --git a/Snakefile b/Snakefile
index 81e760f..1e9601e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,13 +1,38 @@
 # This is actually GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
 genome = "/proj/uppstore2018173/private/references/bowtie2/genome.fa"

+# This was used for test datasets up to 0.3 (the BAM file doesn’t exist anymore)
+# bam = "/proj/uppstore2018173/private/analysis/190423.HiSeq.emTn5.Next.reseq_3.XIII/XIII.reseq_3.R1.fastq.so
+# fastq = "/proj/uppstore2018173/private/rawdata/190416.HiSeq.emTn5.Next.reseq_2-3.XII-XVII/XIII.reseq_3.R{nr
+
+bam = "/proj/uppstore2018173/private/analysis/200529.P14314.contDNA.rerun/mapped.sorted.tag.bcmerge.mkdup.mol
+fastq = "/proj/uppstore2018173/private/rawdata/191010.NovoSeq.emTn5.Next.Mus.Patient.Cont/P14314/P14314_1006/
+bam_tellseq = "/proj/uppstore2018173/private/pontus/Chen_HG002_NA24385_TELLseq_5ng_mapping/out.bam"
+fastq_tellseq = "/proj/uppstore2018173/private/references/Chen_HG002_NA24385_TELLseq_5ng_fastqs/190704_A502_N
+tellseq_index = "/proj/uppstore2018173/private/references/Chen_HG002_NA24385_TELLseq_5ng_fastqs/190704_A502_N
+bam_stlfr = "/proj/uppstore2018173/private/pontus/GIAB_HG002_NA24385_BGI_stLFR/out.bam"
+fastq_stlfr = "/proj/uppstore2018173/private/references/GIAB_HG002_NA24385_BGI_stLFR_fastqs/stLFR_NA24385_spl
+bam_tenx = "/proj/uppstore2018173/private/pontus/GIAB_HG002_NA24385_10xChromium/ema_final.bam"
+fastq_tenx = "/proj/uppstore2018173/private/pontus/GIAB_HG002_NA24385_10xChromium/read-RA_si-AAGTTGCA_lane-00
+
 rule all:
-    input:
-        "ref.fasta",
-        "HG002_GRCh38_GIAB_highconf.vcf",
-        "HG002_GRCh38_GIAB_highconf_triophased.vcf",
+  input:
+    "ref.fasta",
+    "blr_reads.1.fastq.gz",
+    "blr_reads.2.fastq.gz",
+    "tellseq_reads.1.fastq.gz",
+    "tellseq_reads.2.fastq.gz",
+    "tellseq_index.fastq.gz",
+    "stlfr_reads.1.fastq.gz",
+    "stlfr_reads.2.fastq.gz",
+    "stlfr_barcodes.txt",
+    "tenx_reads.1.fastq.gz",
+    "tenx_reads.2.fastq.gz",
+    "tenx_barcode_whitelist.txt",
+    "HG002_GRCh38_GIAB_highconf.vcf",
+    "HG002_GRCh38_GIAB_highconf_triophased.vcf"

-rule:
+rule reference:
     # chromosome sizes in kbp: 50, 40, 9, 1
     output: "ref.fasta"
     shell:
@@ -17,6 +42,72 @@ rule:
         "samtools faidx {genome} chr1:10099001-10100000 | sed -r 's/^>.*/>chrD/' >> {output}.tmp\n"
         "mv {output}.tmp {output}"

+def bam_input(wildcards):
+  files = {
+    "blr": bam,
+    "tellseq": bam_tellseq,
+    "stlfr": bam_stlfr,
+    "tenx": bam_tenx
+  }
+  return files[wildcards.tech]
+
+rule:
+  output:
+    bam=temp("{tech}_out.bam")
+  input: bam=bam_input
+  shell: "samtools view -b -o {output.bam} {input.bam} chr1:10000000-10100000"
+
+rule:
+  output:
+    txt=temp("{tech}_names.txt")
+  input: bam="{tech}_out.bam"
+  run:
+    command = "samtools view {input.bam} | cut -f1 "
+    if wildcards.tech == "stlfr":
+      command += "| shuf -n 10000 --random-source {input.bam}"
+    elif wildcards.tech == "blr":
+      command += "| sed 's|_FILTERED| |' "
+
+    shell(command + "> {output.txt}")
+
+def fastq_input(wildcards):
+  files = {
+    "blr": fastq,
+    "tellseq": fastq_tellseq,
+    "stlfr": fastq_stlfr
+  }
+  return files[wildcards.tech]
+
+rule reads:
+  output:
+    fastq_gz="{tech}_reads.{nr,(1|2)}.fastq.gz"
+  input:
+    names="{tech}_names.txt",
+    fastq=fastq_input,
+  wildcard_constraints:
+    tech="(blr|stlfr|tellseq)"
+  shell:
+    "pigz -dc {input.fastq} | grep -F -A 3 --no-group-separator -f {input.names} | gzip -9 > {output.fastq_gz
+
+rule reads_tenx:
+  output:
+    fastq1="tenx_reads.1.fastq.gz",
+    fastq2="tenx_reads.2.fastq.gz"
+  input:
+    names="tenx_names.txt",
+    fastq=fastq_tenx,
+  shell:
+    "pigz -dc {input.fastq} | grep -F -A 3 --no-group-separator -f {input.names} | "
+    "paste - - - - - - - - | tee >(cut -f 1-4 | tr '\t' '\n' | gzip -9 > {output.fastq1}) | cut -f 5-8 | "
+    "tr '\t' '\n' | gzip -9 > {output.fastq2}"
+
+rule index_tellseq:
+  output: "{tech}_index.fastq.gz"
+  input: 
+    fastq=tellseq_index,
+    names="{tech}_names.txt"
+  shell:
+    "pigz -dc {input.fastq} | grep -F -A 3 --no-group-separator -f {input.names} | gzip -9 > {output}"

 rule download_giab_highconf:
     output:
@@ -24,17 +115,17 @@ rule download_giab_highconf:
     shell:
         "wget https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/AshkenazimTrio/HG002_NA24385_son/NISTv3.3.2

-
 rule download_giab_highconf_triophased:
     output:
         "HG002_GRCh38_GIAB_highconf_CG-Illfb-IllsentieonHC-Ion-10XsentieonHC-SOLIDgatkHC_CHROM1-22_v.3.3.2_hi
     shell:
         "wget https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/AshkenazimTrio/HG002_NA24385_son/NISTv3.3.2

-
 rule highconf:
     output:
-        vcf="HG002_GRCh38_GIAB_{base,(highconf|highconf_triophased)}.vcf"
+        vcf="HG002_GRCh38_GIAB_{base,(highconf|highconf_triophased)}.vcf",
+        log=temp("HG002_GRCh38_GIAB_{base,(highconf|highconf_triophased)}.vcf.tmp.log"),
+        recode=temp("HG002_GRCh38_GIAB_{base,(highconf|highconf_triophased)}.vcf.tmp.recode.vcf")
     input:
         vcf="HG002_GRCh38_GIAB_highconf_CG-Illfb-IllsentieonHC-Ion-10XsentieonHC-SOLIDgatkHC_CHROM1-22_v.3.3.
     shell:
@@ -67,3 +158,58 @@ rule highconf:
             }}
             1
             ' {output.vcf}.tmp.recode.vcf > {output.vcf}"""
+
+rule download_stlfr_barcodes:
+  output:
+    "stlfr_barcodes.txt"
+  shell:
+    "wget -O {output} https://raw.githubusercontent.com/stLFR/stLFR_read_demux/master/scripts/barcode.list"
+
+rule download_tenx_barcodes:
+  output:
+    temp("tenx_barcodes_raw.txt")
+  shell:
+    "wget -O {output} https://github.com/10XGenomics/longranger/raw/master/tenkit/lib/python/tenkit/barcodes/
+
+
+rule count_tenx:
+    """Create list of per-barcode count"""
+    output:
+        counts_ncnt = temp("reads.ema-ncnt"),
+        counts_fcnt = temp("reads.ema-fcnt")
+    input:
+        r1_fastq="tenx_reads.1.fastq.gz",
+        r2_fastq="tenx_reads.2.fastq.gz",
+        whitelist="tenx_barcodes_raw.txt"
+    shell:
+        "paste <(pigz -c -d {input.r1_fastq} | paste - - - -) <(pigz -c -d {input.r2_fastq} | paste - - - -) 
+        " tr '\t' '\n' |"
+        " ema count"
+        " -w {input.whitelist}"
+        " -o reads"
+
+rule preproc_10x:
+    """Trim reads and bin reads containing the same barcode together. Reads missing barcodes outputed to ema-
+    output:
+        bins = temp(directory("temp_bins"))
+    input:
+        r1_fastq="tenx_reads.1.fastq.gz",
+        r2_fastq="tenx_reads.2.fastq.gz",
+        counts_ncnt = "reads.ema-ncnt",
+        counts_fcnt = "reads.ema-fcnt",
+        whitelist = "tenx_barcodes_raw.txt"
+    shell:
+        "paste <(pigz -c -d {input.r1_fastq} | paste - - - -) <(pigz -c -d {input.r2_fastq} | paste - - - -) 
+        " tr '\t' '\n' |"
+        " ema preproc"
+        " -w {input.whitelist}"
+        " -b"
+        " -h"
+        " -o {output.bins} {input.counts_ncnt}"
+
+rule barcode_subset:
+     output: "tenx_barcode_whitelist.txt"
+     input: "temp_bins"
+     shell:
+        "cat {input}/ema-bin-* | paste - - - - | cut -f1 | cut -d' ' -f2 |"
+        " sed 's|BX:Z:||' | sed 's|-1| |' | sort | uniq > {output}"