/*
Build a squared gene x subject matrix for heatmap plotting
1. Get selected tumor pairs as described elsewhere (`selected_tumor_pairs`)
- reduce this list to a list of aliquots, combining (a) and (b) into a single column (`selected_aliquots`). This list is only used for coverage
2. Select a list of genes and specific variants
- Take genes deemed significant using the dNdS-CV method
- Manually adding a genes that are not significant by dNdS but are known glioma genes (handpicked from Anzhela's list)
- Filter the list of variants by subsetting hotspot variants only for those genes where they are known
3. Take the cartesian product between the genes and aliquots table, generating a table with (genes x aliquots) numbers of rows (`selected_genes_samples`)
- Get coverage statistics from all these loci across all aliquots and compute median for each gene and aliquot (`gene_sample_coverage`)
- Use the tumor pairs list from step 1 to align tumor pairs to genes and coverage statistics for sample (a) and (b) from each pair (`gene_pair_coverage`)
4. Take variants from each tumor_barcode, subsetting selected tumor pairs from step 1 and by variants from step 2 (`variants_by_case_and_gene`)
- For each pair, there exists an optimal first tumor (a) and subsequent tumor sample (b)
- Aggregate over variants by case_barcode and gene_symbol, selecting only the top variant for each subject/gene combination
- Restrict to events with coverage >= 5 in both (a) and (b)
- Variants for each tumor pair/gene combination are ordered according to variant_classification_priority (see new table analysis.variant_classifications) and whether or not the mutation was called in a/b and finally based on read_depth
- Manually calling all variants based on a 0.05 VAF threshold, since we already applied stringent filtering prior to this step (eg. only variants with a mutect call in either (a) or (b) are included in the data at this point)
5. Right join the limited list of variants from step 4 with the extensive coverage statistics from step 3, generating a squared table with all genes and all subjects (`squared_variants`)
6. Perform a subject level call of whether the variant was called in sample A only (initial only), B only (recurrence only), or both (shared), or in neither mark as wild-type.
- If there is not enough coverage in either A or B, mark the gene/subject combination as NULL to indicate insufficient coverage.
*/
WITH
selected_tumor_pairs AS
(
SELECT
tumor_pair_barcode,
case_barcode,
tumor_barcode_a,
tumor_barcode_b,
row_number() OVER (PARTITION BY case_barcode ORDER BY surgical_interval_mo DESC, portion_a ASC, portion_b ASC, substring(tumor_pair_barcode from 27 for 3) ASC) AS priority
FROM analysis.tumor_pairs ps
LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a
LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b
LEFT JOIN analysis.mutation_freq mf1 ON mf1.aliquot_barcode = ps.tumor_barcode_a -- join with mutation freq to remove hypermutators
LEFT JOIN analysis.mutation_freq mf2 ON mf2.aliquot_barcode = ps.tumor_barcode_b
WHERE
comparison_type = 'longitudinal' AND
sample_type_b <> 'M1' AND -- exclude metastatic samples here because this is outside the scope of our study
--b1.cnv_exclusion = 'allow' AND b2.cnv_exclusion = 'allow' AND
b1.coverage_exclusion = 'allow' AND b2.coverage_exclusion = 'allow' AND
b1.clinical_exclusion = 'allow' AND b2.clinical_exclusion = 'allow' --AND
--mf1.coverage_adj_mut_freq < 10 AND mf2.coverage_adj_mut_freq < 10 -- filter hypermutators
),
selected_aliquots AS
(
SELECT tumor_barcode_a AS aliquot_barcode, case_barcode FROM selected_tumor_pairs WHERE priority = 1
UNION
SELECT tumor_barcode_b AS aliquot_barcode, case_barcode FROM selected_tumor_pairs WHERE priority = 1
),
selected_genes AS
(
SELECT sn.gene_symbol, chrom, pos, alt, sn.variant_classification, variant_classification_priority, hgvs_p
FROM analysis.snvs sn
INNER JOIN analysis.dndscv_gene ds ON ds.gene_symbol = sn.gene_symbol AND (ds.qglobal_cv < 0.10 OR ds.gene_symbol IN ('TERT','IDH2','NOTCH1','PDGFRA','PIK3CG','BRAF','H3F3A'))
LEFT JOIN analysis.variant_classifications vc ON sn.variant_classification = vc.variant_classification
WHERE
(sn.gene_symbol NOT IN ('TERT','IDH1','IDH2','BRAF','H3F3A') AND variant_classification_priority IS NOT NULL) OR
(sn.gene_symbol = 'TERT' AND sn.variant_classification = '5''Flank' AND lower(sn.pos) IN (1295228,1295250)) OR
(sn.gene_symbol = 'IDH1' AND sn.hgvs_p IN ('p.R132C','p.R132G','p.R132H','p.R132S')) OR
(sn.gene_symbol = 'IDH2' AND sn.hgvs_p = 'p.R172K') OR
(sn.gene_symbol = 'BRAF' AND sn.hgvs_p = 'p.V600E') OR
(sn.gene_symbol = 'H3F3A' AND sn.hgvs_p = 'p.G35R')
),
selected_genes_samples AS
(
SELECT aliquot_barcode, case_barcode, gene_symbol, chrom, lower(pos) AS start_pos, upper(pos)-1 AS end_pos, alt
FROM selected_aliquots, selected_genes
),
gene_sample_coverage AS
(
SELECT
gene_symbol,
sg.aliquot_barcode,
sg.case_barcode,
round(median(alt_count + ref_count)) AS median_cov
FROM analysis.full_genotypes fgt
INNER JOIN selected_genes_samples sg ON fgt.aliquot_barcode = sg.aliquot_barcode AND fgt.chrom = sg.chrom AND fgt.start = sg.start_pos AND fgt.end = sg.end_pos AND fgt.alt = sg.alt
GROUP BY 1,2,3
),
gene_pair_coverage AS
(
SELECT
stp.tumor_pair_barcode,
stp.case_barcode,
c1.gene_symbol,
c1.median_cov AS cov_a,
c2.median_cov AS cov_b
FROM selected_tumor_pairs stp
LEFT JOIN gene_sample_coverage c1 ON c1.aliquot_barcode = stp.tumor_barcode_a
LEFT JOIN gene_sample_coverage c2 ON c2.aliquot_barcode = stp.tumor_barcode_b AND c1.gene_symbol = c2.gene_symbol
WHERE priority = 1
),
variants_by_case_and_gene AS
(
SELECT
gtc.gene_symbol,
gtc.case_barcode,
gtc.tumor_pair_barcode,
gtc.chrom,
gtc.pos,
gtc.alt,
gtc.variant_classification,
sg.hgvs_p,
(alt_count_a::decimal / (alt_count_a+ref_count_a) > 0.05) AS selected_call_a,
(alt_count_b::decimal / (alt_count_b+ref_count_b) > 0.05) AS selected_call_b,
row_number() OVER (PARTITION BY gtc.gene_symbol, gtc.case_barcode ORDER BY mutect2_call_a::integer + mutect2_call_b::integer = 2, variant_classification_priority, mutect2_call_a::integer + mutect2_call_b::integer DESC, read_depth_a + read_depth_b DESC) AS priority
FROM analysis.master_genotype_comparison gtc
INNER JOIN selected_tumor_pairs stp ON stp.tumor_pair_barcode = gtc.tumor_pair_barcode AND stp.priority = 1
INNER JOIN selected_genes sg ON sg.chrom = gtc.chrom AND sg.pos = gtc.pos AND sg.alt = gtc.alt
WHERE
(mutect2_call_a OR mutect2_call_b) AND read_depth_a >= 5 AND read_depth_b >= 5
),
squared_variants AS
(
SELECT
gc.gene_symbol,
gc.case_barcode,
vcg.chrom,
vcg.pos,
vcg.alt,
vcg.variant_classification,
vcg.hgvs_p,
vcg.selected_call_a,
vcg.selected_call_b,
gc.cov_a,
gc.cov_b
FROM (SELECT * FROM variants_by_case_and_gene WHERE priority = 1) vcg
RIGHT JOIN gene_pair_coverage gc ON gc.tumor_pair_barcode = vcg.tumor_pair_barcode AND gc.gene_symbol = vcg.gene_symbol
)
SELECT
gene_symbol,
case_barcode,
variant_classification,
hgvs_p,
(CASE
WHEN cov_a >= 5 AND cov_b >= 5 AND selected_call_a AND selected_call_b THEN 3
WHEN cov_a >= 5 AND cov_b >= 5 AND selected_call_a AND NOT selected_call_b THEN 2
WHEN cov_a >= 5 AND cov_b >= 5 AND selected_call_b AND NOT selected_call_a THEN 1
WHEN cov_a >= 5 AND cov_b >= 5 AND (NOT selected_call_b OR selected_call_b IS NULL) AND (NOT selected_call_a OR selected_call_a IS NULL) THEN 0
ELSE NULL END) AS variant_call
FROM squared_variants
In progress