Open komalsrathi opened 2 years ago
ASCAT copy number values are also new for me.
You should be able to get cytoband information with biomaRt
# Fetch coordinates, identifiers and band for human genes from the Ensembl
# mart through biomaRt, then discard the genes that have no band.
library(biomaRt)

ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

# Attributes requested from the mart; "band" is the column used for the
# cytoband lookup.
attributes <- c(
  "chromosome_name", "start_position", "end_position", "strand",
  "ensembl_gene_id", "entrezgene_id", "external_gene_name", "band"
)

gene.info <- getBM(attributes = attributes, mart = ensembl)

# Keep only the rows whose band is not the empty string.
gene.info <- gene.info[gene.info[["band"]] != "", ]
Thanks @tiagochst - do you mind if I keep this ticket open just in case you or someone else can figure it out in the near future?
@komalsrathi Sure!
@komalsrathi Did you try posting on Biostars? I recall reading this thread: https://www.biostars.org/p/494728/, but I still need to read the article about the new algorithms.
@tiagochst I have not, but that is a good idea. What I have done so far is 1) get the total copy number as above and 2) get the ploidy from the clinical data, then use this reference to assign a status: https://cancer.sanger.ac.uk/cosmic/help/cnv/overview. I am not sure if this is the correct way to do it, so maybe I'll post my strategy on Biostars for feedback.
# Derive a per-gene copy number status (Gain / Loss / Neutral) and a cytoband
# annotation from the GDC "Gene Level Copy Number" (ASCAT) data of one TCGA
# project. The final `cnv_data` is a long table with one row per gene and
# barcode, carrying copy_number, ploidy, status and cytoband.
suppressPackageStartupMessages({
  library(TCGAbiolinks)
  library(tidyverse)
  library(biomaRt)
  library(SummarizedExperiment)
})

# Thresholds of the COSMIC copy number classification
# (https://cancer.sanger.ac.uk/cosmic/help/cnv/overview).
ploidy_cutoff <- 2.7           # boundary between "low" and "high" ploidy
gain_min_cn_low_ploidy <- 5    # Gain when ploidy <= cutoff and CN >= 5
gain_min_cn_high_ploidy <- 9   # Gain when ploidy >  cutoff and CN >= 9

# get copy number from GDC (this example is using TCGA-GBM but this will be
# generalized within the reporting code)
query <- GDCquery(
  project = "TCGA-GBM",
  data.category = "Copy Number Variation",
  data.type = "Gene Level Copy Number"
)
GDCdownload(query)

# get ploidy info from clinical data
# The SummarizedExperiment form is loaded only to reach the sample-level
# colData; `cnv_data` is overwritten with the flat table further down.
cnv_data <- GDCprepare(query, summarizedExperiment = TRUE)
clin_data <- SummarizedExperiment::colData(cnv_data) %>%
  as.data.frame() %>%
  mutate(
    sample_id = paste0(sample_submitter_id, "-", sample_type_id),
    # keep only the text before the first comma of the barcode
    barcode = gsub(",.*", "", barcode)
  ) %>%
  dplyr::select(barcode, sample_id, paper_ABSOLUTE.ploidy) %>%
  filter(!is.na(paper_ABSOLUTE.ploidy))

# get absolute copy number scores (ASCAT)
cnv_data <- GDCprepare(query, summarizedExperiment = FALSE)

# there are values with min_copy_number and max_copy_number but I am not
# using them; select by name rather than by position, case-sensitively
cnv_data <- cnv_data %>%
  dplyr::select(
    -matches("_min_copy_number|_max_copy_number", ignore.case = FALSE)
  )

# format column names to only keep barcode
colnames(cnv_data) <- gsub(",.*", "", colnames(cnv_data))

# convert to long format: one row per gene and barcode
# (pivot_longer() orders rows by gene, whereas gather() ordered them by
# barcode; the set of rows is the same)
cnv_data <- cnv_data %>%
  dplyr::select(-c(chromosome, start, end)) %>%
  dplyr::rename("hgnc_symbol" = "gene_name", "ensembl" = "gene_id") %>%
  pivot_longer(
    cols = -c(hgnc_symbol, ensembl),
    names_to = "barcode",
    values_to = "copy_number"
  ) %>%
  filter(!is.na(copy_number))

# keep only the barcodes that have a ploidy estimate
cnv_data <- cnv_data %>%
  inner_join(clin_data, by = "barcode")

# map status using absolute copy number and ploidy
# taken from https://cancer.sanger.ac.uk/cosmic/help/cnv/overview
# Neither ploidy nor copy_number can be NA here (both were filtered above),
# so the final catch-all branch is reached only by genuinely neutral rows.
cnv_data <- cnv_data %>%
  dplyr::rename("ploidy" = "paper_ABSOLUTE.ploidy") %>%
  mutate(
    status = case_when(
      ploidy <= ploidy_cutoff & copy_number >= gain_min_cn_low_ploidy ~ "Gain",
      ploidy > ploidy_cutoff & copy_number >= gain_min_cn_high_ploidy ~ "Gain",
      ploidy <= ploidy_cutoff & copy_number == 0 ~ "Loss",
      ploidy > ploidy_cutoff & copy_number < (ploidy - ploidy_cutoff) ~ "Loss",
      TRUE ~ "Neutral"
    )
  )

# map cytoband information to gene symbols
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
my_regions <- getBM(
  attributes = c("hgnc_symbol", "band"),
  filters = c("hgnc_symbol"),
  values = list(hgnc_symbol = unique(cnv_data$hgnc_symbol)),
  mart = ensembl
)
my_regions <- my_regions %>%
  filter(band != "") %>%
  dplyr::rename("cytoband" = "band")

# NOTE(review): a symbol that comes back with more than one band would
# duplicate its rows in this join, and "band" holds no chromosome name
# (request "chromosome_name" as well if a full cytoband is needed) - confirm
# against the mart output.
cnv_data <- cnv_data %>%
  left_join(my_regions, by = "hgnc_symbol")
@tiagochst I have not, but that is a good idea. What I have done so far is 1) get the total copy number as above and 2) get the ploidy from the clinical data, then use this reference to assign a status: https://cancer.sanger.ac.uk/cosmic/help/cnv/overview. I am not sure if this is the correct way to do it, so maybe I'll post my strategy on Biostars for feedback.
# Derive a per-gene copy number status (Gain / Loss / Neutral) and a cytoband
# annotation from the GDC "Gene Level Copy Number" (ASCAT) data of one TCGA
# project. The final `cnv_data` is a long table with one row per gene and
# barcode, carrying copy_number, ploidy, status and cytoband.
suppressPackageStartupMessages({
  library(TCGAbiolinks)
  library(tidyverse)
  library(biomaRt)
  library(SummarizedExperiment)
})

# Thresholds of the COSMIC copy number classification
# (https://cancer.sanger.ac.uk/cosmic/help/cnv/overview).
ploidy_cutoff <- 2.7           # boundary between "low" and "high" ploidy
gain_min_cn_low_ploidy <- 5    # Gain when ploidy <= cutoff and CN >= 5
gain_min_cn_high_ploidy <- 9   # Gain when ploidy >  cutoff and CN >= 9

# get copy number from GDC (this example is using TCGA-GBM but this will be
# generalized within the reporting code)
query <- GDCquery(
  project = "TCGA-GBM",
  data.category = "Copy Number Variation",
  data.type = "Gene Level Copy Number"
)
GDCdownload(query)

# get ploidy info from clinical data
# The SummarizedExperiment form is loaded only to reach the sample-level
# colData; `cnv_data` is overwritten with the flat table further down.
cnv_data <- GDCprepare(query, summarizedExperiment = TRUE)
clin_data <- SummarizedExperiment::colData(cnv_data) %>%
  as.data.frame() %>%
  mutate(
    sample_id = paste0(sample_submitter_id, "-", sample_type_id),
    # keep only the text before the first comma of the barcode
    barcode = gsub(",.*", "", barcode)
  ) %>%
  dplyr::select(barcode, sample_id, paper_ABSOLUTE.ploidy) %>%
  filter(!is.na(paper_ABSOLUTE.ploidy))

# get absolute copy number scores (ASCAT)
cnv_data <- GDCprepare(query, summarizedExperiment = FALSE)

# there are values with min_copy_number and max_copy_number but I am not
# using them; select by name rather than by position, case-sensitively
cnv_data <- cnv_data %>%
  dplyr::select(
    -matches("_min_copy_number|_max_copy_number", ignore.case = FALSE)
  )

# format column names to only keep barcode
colnames(cnv_data) <- gsub(",.*", "", colnames(cnv_data))

# convert to long format: one row per gene and barcode
# (pivot_longer() orders rows by gene, whereas gather() ordered them by
# barcode; the set of rows is the same)
cnv_data <- cnv_data %>%
  dplyr::select(-c(chromosome, start, end)) %>%
  dplyr::rename("hgnc_symbol" = "gene_name", "ensembl" = "gene_id") %>%
  pivot_longer(
    cols = -c(hgnc_symbol, ensembl),
    names_to = "barcode",
    values_to = "copy_number"
  ) %>%
  filter(!is.na(copy_number))

# keep only the barcodes that have a ploidy estimate
cnv_data <- cnv_data %>%
  inner_join(clin_data, by = "barcode")

# map status using absolute copy number and ploidy
# taken from https://cancer.sanger.ac.uk/cosmic/help/cnv/overview
# Neither ploidy nor copy_number can be NA here (both were filtered above),
# so the final catch-all branch is reached only by genuinely neutral rows.
cnv_data <- cnv_data %>%
  dplyr::rename("ploidy" = "paper_ABSOLUTE.ploidy") %>%
  mutate(
    status = case_when(
      ploidy <= ploidy_cutoff & copy_number >= gain_min_cn_low_ploidy ~ "Gain",
      ploidy > ploidy_cutoff & copy_number >= gain_min_cn_high_ploidy ~ "Gain",
      ploidy <= ploidy_cutoff & copy_number == 0 ~ "Loss",
      ploidy > ploidy_cutoff & copy_number < (ploidy - ploidy_cutoff) ~ "Loss",
      TRUE ~ "Neutral"
    )
  )

# map cytoband information to gene symbols
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
my_regions <- getBM(
  attributes = c("hgnc_symbol", "band"),
  filters = c("hgnc_symbol"),
  values = list(hgnc_symbol = unique(cnv_data$hgnc_symbol)),
  mart = ensembl
)
my_regions <- my_regions %>%
  filter(band != "") %>%
  dplyr::rename("cytoband" = "band")

# NOTE(review): a symbol that comes back with more than one band would
# duplicate its rows in this join, and "band" holds no chromosome name
# (request "chromosome_name" as well if a full cytoband is needed) - confirm
# against the mart output.
cnv_data <- cnv_data %>%
  left_join(my_regions, by = "hgnc_symbol")
Thank you very much for your answer, but I want to know how you handled the duplicated samples. For example, when I download the PAAD Copy Number Variation data, I get the following error:
cnv_data <- GDCprepare(query, summarizedExperiment = T) cases experimental_strategy analysis_workflow_type 25 TCGA-HZ-A9TJ-10A-01D-A40V-01 Genotyping Array ASCAT2 26 TCGA-HZ-A9TJ-10A-01D-A40V-01 Genotyping Array ASCAT2 Error in GDCprepare(query, summarizedExperiment = T) : There are samples duplicated. We will not be able to prepare it
Any help would be greatly appreciated!
@xiaolan552
The following code is working for me. Probably you need to update TCGAbiolinks with BiocManager::install("BioinformaticsFMRP/TCGAbiolinks")
# Query, download and load the gene-level copy number data of TCGA-PAAD.
query <- GDCquery(project = "TCGA-PAAD",
                  data.category = "Copy Number Variation",
                  data.type = "Gene Level Copy Number")
GDCdownload(query = query)
data <- GDCprepare(query = query)
The two samples are below.
grep("TCGA-HZ-A9TJ-",query$results[[1]]$cases,value = T) [1] "TCGA-HZ-A9TJ-06A-11D-A40V-01,TCGA-HZ-A9TJ-10A-01D-A40V-01" [2] "TCGA-HZ-A9TJ-01A-11D-A40V-01,TCGA-HZ-A9TJ-10A-01D-A40V-01"
@tiagochst Thank you very much!! I'll try it right away.
Before GDC made the recent changes, I was able to get GISTIC scores with values such as -1, 0 and +1 using `Gene Level Copy Number Scores`. Now with the ASCAT copy number values, I am not sure how to get information on gain, loss, deletion and amplification. The new data also does not have cytoband info like before. Any ideas on how to get 1) CNV status and 2) cytoband information? Any help would be much appreciated. Thanks!!