LiNk-NY / curatedTCGAManu

Multi-omic integration of public oncology databases in Bioconductor
https://link-ny.github.io/curatedTCGAManu/
4 stars 1 forks source link

Make sure checking methylation and TPM>3 in same patients #5

Open lwaldron opened 5 years ago

lwaldron commented 5 years ago

https://github.com/LiNk-NY/curatedTCGAManu/blob/6aa1c1f6b84341e20ad0035238e7b5ac6f78039a/vignettes/GMQL_vignette.Rmd#L86

lwaldron commented 5 years ago
# "In TCGA data of BRCA patients, find the DNA somatic mutations 
# within the first 2000 bp outside of the genes that are both 
# expressed with FPKM > 3 and have at least a methylation in the same patient
# biospecimen, and extract these mutations of the top 5% patients
# with the highest number of such mutations."

library(curatedTCGAData) #require >=1.5.6
system.time(mae <- curatedTCGAData("ACC", c("Mutation", "RNASeq2GeneNorm", "Methylation"), dry.run = FALSE)) 
## traditional methylation representation:
##    user  system elapsed 
## 171.516   3.109 181.086 
## HDF5 methylation representation
##  user  system elapsed 
## 6.223   0.223   9.607 

## remove methylation probes not annotated with a gene symbol
(methassay <- grep("Methylation", names(mae), val=TRUE))
methassay <- methassay[1]
meth.has.genes <- !is.na(rowData(mae[[methassay]])$Gene_Symbol)
## this syntax requires bioc-devel
mae <- subsetByRow(mae, List(`ACC_Methylation-20160128`=meth.has.genes)); rm(meth.has.genes)
genes <- rowData(mae[["ACC_Methylation-20160128"]])$Gene_Symbol

## create a data.frame for the methylation
meth <- assay(mae, "ACC_Methylation-20160128")
meth <- !is.na(meth) #NOT NA
meth <- meth * 1L #convert to integer mode

# number of non-missing per gene in each column
# meth1 <- DelayedMatrixStats::rowsum(meth, group=rowData(mae[["ACC_Methylation-20160128"]])$Gene_Symbol)
#     user   system  elapsed 
# 1322.116    5.526 1335.349 
system.time(meth2 <- DelayedArray::rowsum(meth, group=genes))

system.time(meth1 <- as.matrix(meth))
meth <- rowsum(meth, group=rowData(mae[["ACC_Methylation-20160128"]])$Gene_Symbol)

meth <- meth > 0 # logical "have at least a methylation"

## concatenate to mae. The only purpose of converting to 
## SummarizedExperiment here is to enable TCGAutils::symbolsToRanges(mae) below

mae <- c(mae, has.meth=SummarizedExperiment(meth))
rm(meth)

library(TCGAutils)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)

mae <- TCGAutils::symbolsToRanges(mae)

## get ranges and add "within the first 2000 bp outside of the genes"
rr <- rowRanges(mae[["ACC_RNASeq2GeneNorm-20160128_ranged"]])
rr2000 <- flank(rr, width=2000, both=TRUE)

## summarize non-silent mutations per flanked range
nonsilent <- function(scores, ranges, qranges) any(scores != "Silent")
mutations <- RaggedExperiment::qreduceAssay(mae[["ACC_Mutation-20160128"]], rr2000, nonsilent, "Variant_Classification")
rownames(mutations) <- names(rr)
mutations[is.na(mutations)] <- 0
mae <- c(mae, mutations=SummarizedExperiment(mutations))
rm(mutations, rr, rr2000)

# select the elements for analysis
mae2 <- mae[, , c("ACC_RNASeq2GeneNorm-20160128_ranged", "has.meth_ranged", "mutations")]

saveRDS(mae2, file="ACC.rds")

mae3 <- intersectRows(mae2)
mae3 <- MatchedAssayExperiment(mae3)
keep <- assay(mae3, "ACC_RNASeq2GeneNorm-20160128_ranged") > 3 & 
  assay(mae3, "has.meth_ranged") &
  assay(mae3, "mutations")

summary(colSums(keep))
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.000   5.000   8.000   8.961  11.000  69.000 

sort(colSums(keep), decreasing = TRUE)[1:round(ncol(keep)*0.05)]
# TCGA-PK-A5HB-01A-11R-A29S-07 TCGA-OR-A5LJ-01A-11R-A29S-07 
# 69                           29 
# TCGA-OR-A5J5-01A-11R-A29S-07 TCGA-OR-A5JA-01A-11R-A29S-07 
# 26                           22 

# > proc.time() with traditional methylation storage
#   almost all disk access (magnetic drive on my laptop), only ~4 seconds system time
#    user  system elapsed 
# 208.178   4.299 220.584 

# > proc.time() with HDF5 methylation storage
#   user  system elapsed 
# 35.139   2.641  48.287