BioinformaticsFMRP / TCGAbiolinks

TCGAbiolinks
http://bioconductor.org/packages/devel/bioc/vignettes/TCGAbiolinks/inst/doc/index.html
295 stars 112 forks source link

Error: "Subscript out of bounds" when filtering for specific sample type #462

Closed andysontran closed 3 years ago

andysontran commented 3 years ago

Hi,

I am encountering an error where I am unable to filter for a specific sample type (e.g. Primary Tumor) using data from TCGAquery_SampleTypes for certain TCGA projects (e.g. TCGA-BRCA) on recount2. I suspect the datasets have been updated to add or remove samples, and the barcodes extracted from TCGAquery_SampleTypes may not have been updated to reflect those changes.

```r
##------------ DATA PREP ------------##
recount.gtex <- TCGAquery_recount2(project = "GTEX", tissue = "breast")
downloading Range Summarized Experiment for: breast
recount.tcga <- TCGAquery_recount2(project = "TCGA", tissue = "breast")
downloading Range Summarized Experiment for: breast

SE.recount.gtex <- recount.gtex[[GTEX_breast]]
SE.recount.tcga <- recount.tcga[[TCGA_breast]]
query <- GDCquery(project = TCGA-BRCA, data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "HTSeq - Counts")

samplesDown <- getResults(query, cols = c("cases"))
dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown, typesample = "TP")

eset.gtex <- assays(scale_counts(recount.gtex[[GTEX_breast]], round = TRUE))$counts
eset.tcga <- assays(scale_counts(recount.tcga[[TCGA_breast]], round = TRUE))$counts

rse_scaled <- scale_counts(recount.gtex[[GTEX_breast]], round = TRUE)

colDat <- recount.tcga[[TCGA_breast, exact = FALSE]]
colnames(eset.tcga) <- colDat$gdc_cases.samples.portions.analytes.aliquots.submitter_id
rownames(eset.gtex) <- gsub("\..", "", rownames(eset.gtex))
rownames(eset.tcga) <- gsub("\..", "", rownames(eset.tcga))

eset.tcga.cancer <- eset.tcga[,which(colData(recount.tcga[[TCHA_breast]])$gdc_cases.samples.sample_type=="Primary Tumor")]
eset.tcga.cancer <- eset.tcga.cancer[,c(dataSmTP)]
Error in eset.tcga.cancer[, c(dataSmTP)] : subscript out of bounds
```

Thank you in advance for your help with this issue!

A

tiagochst commented 3 years ago

For some reason, 3 samples were missing: "TCGA-BH-A0B2-01A-11R-A10J-07", "TCGA-A7-A0DC-01A-11R-A00Z-07", and "TCGA-A7-A0DC-01B-04R-A22O-07". Filtering with `%in%` instead of indexing by barcode avoids the error:

library(SummarizedExperiment)
library(TCGAbiolinks)
library(recount)
# Build GTEx and TCGA breast count matrices from recount2 and keep only the
# TCGA primary-tumor samples that are also reported by the GDC query.

# Each call returns a list holding one Range Summarized Experiment per tissue.
recount.gtex <- TCGAquery_recount2(project = "GTEX", tissue = "breast")
recount.tcga <- TCGAquery_recount2(project = "TCGA", tissue = "breast")

# Extract the experiment objects once and reuse them below.
SE.recount.gtex <- recount.gtex[["GTEX_breast"]]
SE.recount.tcga <- recount.tcga[["TCGA_breast"]]

query <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "HTSeq - Counts"
)

# Barcodes known to GDC, restricted to sample type "TP".
samplesDown <- getResults(query, cols = c("cases"))
dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown, typesample = "TP")

# Scale each experiment a single time and take the count matrix from it.
rse_scaled <- scale_counts(SE.recount.gtex, round = TRUE)
eset.gtex <- assays(rse_scaled)$counts
eset.tcga <- assays(scale_counts(SE.recount.tcga, round = TRUE))$counts

# `colDat` is the experiment object itself; `$` on it reads a sample-level
# metadata column, used here to label TCGA columns with aliquot barcodes.
colDat <- recount.tcga[["TCGA_breast", exact = FALSE]]
colnames(eset.tcga) <-
  colDat$gdc_cases.samples.portions.analytes.aliquots.submitter_id

# Strip a trailing "." plus digits from the row names (presumably the gene ID
# version suffix -- confirm against the annotation in use).
rownames(eset.gtex) <- gsub("\\.[0-9]*$", "", rownames(eset.gtex))
rownames(eset.tcga) <- gsub("\\.[0-9]*$", "", rownames(eset.tcga))

# Keep primary-tumor columns. `which()` discards NA comparisons, and
# `drop = FALSE` keeps a matrix even when only one sample matches.
is_primary <- which(
  colData(SE.recount.tcga)$gdc_cases.samples.sample_type == "Primary Tumor"
)
eset.tcga.cancer <- eset.tcga[, is_primary, drop = FALSE]

# Some GDC barcodes are absent from recount2, so indexing by name fails with
# "subscript out of bounds". A membership mask keeps only the barcodes present
# in both sources.
in_gdc <- colnames(eset.tcga.cancer) %in% dataSmTP
eset.tcga.cancer <- eset.tcga.cancer[, in_gdc, drop = FALSE]