Open allisonvuong opened 3 years ago
Thank you for your interest, Allison. An MAE would be very much appreciated indeed. If you have time to send a PR, that would be great. Note that the data are disseminated through ExperimentHub, so I assume we would use the same mechanism for the MAE.
Hi,
Okay great, I will do that!
Best, Allison
I have created a script to convert most of the DepMap datasets to a MultiAssayExperiment
object. However, I'm a bit stuck coercing the mutationCalls
dataset to a RangedSummarizedExperiment
Does anyone have any suggested solutions?
## Create MultiAssayExperiment object out of the most current Depmap datasets
# TODO:
# 1) convert the mutationCalls to a RangedSummarizedExperiment
# 2) make MAE out of all datesets
## load libraries
# BiocManager::install(
# c("MultiAssayExperiment", "SummarizedExperiment", "S4Vectors", "dplyr",
# "tidyr", "tibble", "readr", "ExperimentHub", "depmap"))
library("MultiAssayExperiment")
library("SummarizedExperiment")
library("S4Vectors")
library("dplyr")
library("tidyr")
library("tibble")
library("readr")
library("ExperimentHub")
library("depmap")
Sys.setenv(VROOM_CONNECTION_SIZE = 500072)
## output file path
file_path <- "media/seq-srv-05/vrc/Project/Project_Theo" ## server
#file_path <- "~/tmp/multiomics/data/" ## local
#################### depmap `metadata_21Q4` dataset ############################
## "metadata" dataset will serve as MAE colData
readr::read_csv("https://ndownloader.figshare.com/files/31316011") %>%
as.data.frame() -> metadata_21Q4
names(metadata_21Q4)[1:26] <- c(
"depmap_id", "cell_line_name", "stripped_cell_line_name", "cell_line",
"aliases", "cosmic_id", "sex", "source", "Achilles_n_replicates",
"cell_line_NNMD", "culture_type", "culture_medium", "cas9_activity",
"RRID", "WTSI_master_cell_ID", "sample_collection_site",
"primary_or_metastasis", "primary_disease", "subtype_disease", "age",
"sanger_id", "additional_info", "lineage", "lineage_subtype",
"lineage_sub_subtype", "lineage_molecular_subtype")
rownames(metadata_21Q4) <- metadata_21Q4$depmap_id
######################## depmap dep_2_name ###################################
### `dep_2_name` to add `depmap_id` or `cell_line` to other datasets
metadata_21Q4 %>% dplyr::select(depmap_id, cell_line) -> dep_2_name_21Q4
###################### depmap `mutationCalls_21Q4` dataset #####################
mutationCalls_21Q4 <- readr::read_csv("https://ndownloader.figshare.com/files/31315930")
names(mutationCalls_21Q4)[1:32] <- c(
"gene_name", "entrez_id", "ncbi_build", "chromosome", "start_pos",
"end_pos", "strand", "var_class", "var_type", "ref_allele",
"tumor_seq_allele1", "dbSNP_RS", "dbSNP_val_status", "genome_change",
"annotation_trans", "depmap_id", "cDNA_change", "codon_change",
"protein_change", "is_deleterious", "is_tcga_hotspot", "tcga_hsCnt",
"is_cosmic_hotspot", "cosmic_hsCnt", "ExAC_AF", "var_annotation",
"CGA_WES_AC", "HC_AC", "RD_AC", "RNAseq_AC", "sanger_WES_AC", "WGS_AC")
mutationCalls_21Q4 %>% dplyr::select(depmap_id, everything()) -> mutationCalls_21Q4
######################## depmap `copyNumber_21Q4` dataset ######################
readr::read_csv("https://ndownloader.figshare.com/files/31315924") %>%
as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) -> copyNumber_21Q4
names(copyNumber_21Q4) <- gsub("&", ";", sub(" \\(.+\\)$", "", names(copyNumber_21Q4)))
copyNumber_21Q4 %>%
tibble::column_to_rownames(var = "depmap_id") %>%
t() %>% as.data.frame() -> copyNumber_21Q4_mat
## build SE object
data.frame(Samples = names(copyNumber_21Q4_mat)) -> copyNumber_colData
rownames(copyNumber_colData) <- copyNumber_colData$Samples
SummarizedExperiment(assays = copyNumber_21Q4_mat,
colData = copyNumber_colData) -> copyNumber_se
######################## depmap `crispr_21Q4` dataset ##########################
readr::read_csv("https://ndownloader.figshare.com/files/31315996") %>%
as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) -> crispr_21Q4
names(crispr_21Q4) <- gsub("&", ";", sub(" \\(.+\\)$", "", names(crispr_21Q4)))
crispr_21Q4 %>%
tibble::column_to_rownames(var = "depmap_id") %>%
t() %>% as.data.frame() -> crispr_21Q4_mat
## build SE object
data.frame(Samples = names(crispr_21Q4_mat)) -> crispr_colData
rownames(crispr_colData) <- crispr_colData$Samples
SummarizedExperiment(assays = crispr_21Q4_mat,
colData = crispr_colData) -> crispr_se
######################## depmap `TPM_21Q4` dataset #############################
readr::read_csv("https://ndownloader.figshare.com/files/31315882") %>%
as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) -> TPM_21Q4
names(TPM_21Q4) <- gsub("&", ";", sub(" \\(.+\\)$", "", names(TPM_21Q4)))
TPM_21Q4 %>%
tibble::column_to_rownames(var = "depmap_id") %>%
t() %>% as.data.frame() -> TPM_21Q4_mat
## build SE object
data.frame(Samples = names(TPM_21Q4_mat)) -> TPM_colData
rownames(TPM_colData) <- TPM_colData$Samples
SummarizedExperiment(assays = TPM_21Q4_mat,
colData = TPM_colData) -> TPM_se
##################### depmap `RPPA_19Q3` dataset ###############################
## download the 19Q3 metadata
eh <- ExperimentHub()
query(eh, "depmap")
eh[["EH3086"]] -> metadata_19Q3
metadata_19Q3 %>% dplyr::select(depmap_id, cell_line) -> dep_2_name_19Q3
### loading data (downloading .csv file from online source)
readr::read_csv(
paste0("https://depmap.org/portal/download/api/download?file_name=ccle%2Fccle",
"_2019%2FCCLE_RPPA_20181003.csv&bucket=depmap-external-downloads")) %>%
dplyr::rename(cell_line = names(.)[1]) %>%
dplyr::left_join(dep_2_name_19Q3, by = "cell_line") %>%
dplyr::select(depmap_id, cell_line, everything()) -> RPPA_19Q3
## fill in NA depmap ID value (source):
## https://web.expasy.org/cellosaurus/CVCL_9980 # NCIH684_LIVER
## https://web.expasy.org/cellosaurus/CVCL_3386 # KE97_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
RPPA_19Q3$depmap_id[RPPA_19Q3$cell_line == "NCIH684_LIVER"] <- "ACH-000089"
RPPA_19Q3$depmap_id[RPPA_19Q3$cell_line == "KE97_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE"] <- "ACH-000167"
RPPA_19Q3 %>% dplyr::select(-cell_line) %>%
tibble::column_to_rownames(var = "depmap_id") %>%
t() %>% as.data.frame() -> RPPA_19Q3_mat
## build SE object
data.frame(Samples = names(RPPA_19Q3_mat)) -> RPPA_colData
rownames(RPPA_colData) <- RPPA_colData$Samples
SummarizedExperiment(assays = RPPA_19Q3_mat,
colData = RPPA_colData) -> RPPA_se
##################### depmap `proteomic_20Q2` dataset ##########################
read_csv("https://gygi.hms.harvard.edu/data/ccle/protein_quant_current_normalized.csv.gz") %>%
dplyr::select(Gene_Symbol, Uniprot_Acc, MDAMB468_BREAST_TenPx01:last_col()) -> proteomic_20Q2
## manually add gene names that are NA (these were taken from Uniprot)
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "H7BZ55"] <- "CROCC2" # 270
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "A6NL28"] <- "LOC101929943" # 576
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q9H552"] <- "KRT8P11" # 1054
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "S4R362"] <- "SEPTIN1" #3233
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q96FF7"] <- "MISP3" # 4833
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q2M2H8"] <- "MGAM2" # 7933
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P01614"] <- "IGKV2D-40" # 8523
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P01619"] <- "IGKV3-20" # 8623
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q69YL0"] <- "NCBP2AS2" # 10005
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "I3L1I5"] <- "ARHGEF18" # 10487
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "E9PS84"] <- "AP000783.1" # 10676
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P60507"] <- "ERVFC1" # 10822
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "H3BQF6"] <- "RP11-12J10.3" # 11313
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "C9J7I0"] <- "UMAD1" # 11632
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q902F9"] <- "HERVK_113" # 11691
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P63135"] <- "ERVK-7" # 11692
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "H3BMH7"] <- "RP11-178L8.4" # 11735
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P01780"] <- "IGHV3-7" # 12070
proteomic_20Q2 %>%
dplyr::filter(!is.na(Gene_Symbol)) %>%
dplyr::filter(!duplicated(Gene_Symbol)) %>%
tibble::column_to_rownames(var = "Gene_Symbol") %>%
dplyr::select(-Uniprot_Acc) -> proteomic_20Q2
names(proteomic_20Q2) <- gsub('(.*)_\\w+', '\\1', names(proteomic_20Q2))
proteomic_20Q2 %>%
t() %>% as.data.frame() %>%
tibble::rownames_to_column() %>%
dplyr::rename(cell_line = rowname) %>%
dplyr::left_join(dep_2_name_21Q4, by = "cell_line") %>%
dplyr:: select(cell_line, depmap_id, everything()) -> proteomic_20Q2
## manually add depmap_ids that are NA (these were taken from CCLE)
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "SW948_LARGE_INTESTINE.1"] <- "ACH-000680"
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "CAL120_BREAST.1"] <- "ACH-000212"
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "X8505C_THYROID"] <- "ACH-001307"
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "HCT15_LARGE_INTESTINE.1"] <- "ACH-000997"
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "SW948_LARGE_INTESTINE"] <- "ACH-000680"
proteomic_20Q2 %>%
dplyr::filter(!is.na(depmap_id)) %>%
dplyr::filter(!duplicated(depmap_id)) %>%
dplyr::select(-cell_line) %>%
tibble::column_to_rownames(var = "depmap_id") %>%
t() %>% as.data.frame() -> proteomic_20Q2_mat
data.frame(Samples = names(proteomic_20Q2_mat),
row.names = proteomic_colData$Samples) -> proteomic_colData
## build SE object
SummarizedExperiment(assays = proteomic_20Q2_mat,
colData = proteomic_colData) -> proteomic_se
#################### depmap `drug_dependency_21Q4` dataset #####################
# read_csv("https://ndownloader.figshare.com/files/17741420") %>%
# as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) %>%
# dplyr::mutate(depmap_id = gsub("_FAILED_STR", "", depmap_id)) %>%
# tibble::column_to_rownames(var = "depmap_id") %>%
# as.matrix() %>% t() -> drug_dep_19Q3_w
#################### depmap `drug_dependency_21Q4` metadata ####################
## data cleaning of `drug_sensativity` dataset and screen info
# read_csv("https://ndownloader.figshare.com/files/20237715") %>%
# rename(compound = column_name) -> screen_info
# eh[["EH3087"]] -> drug_dep_19Q3
############################# build depmap MAE #################################
## Combine to a named list and call the ExperimentList constructor function
list(crispr_21Q4 = crispr_21Q4_se,
copyNumber_21Q4 = copyNumber_21Q4_se,
TPM_21Q4 = TPM_21Q4_se,
proteomic_20Q2 = proteomic_20Q2_se,
# mutationCalls_21Q4 = mutationCalls_21Q4_mat,
# drug_dep_19Q3 = drug_dep_19Q3,
RPPA_19Q3 = RPPA_19Q3_se) -> assayList
## Use the ExperimentList constructor
exp_list <- ExperimentList(assayList)
## build colData, a DataFrame describing the characteristics of biological units
mae_coldata <- DataFrame(metadata_21Q4)
## build MAE
MultiAssayExperiment::MultiAssayExperiment(
experiments = exp_list,
colData = mae_coldata #,
#sampleMap = sample_map
) -> depmap_mae
# saveRDS(depmap_mae, file = paste0(file_path, "depmap_mae.rds"))
Hi,
Thank you for this very useful package. As many of the CCLE lines have now been screened in Achilles experiments, I'm wondering whether any thought has been given to harmonizing and serving this data as a MultiAssayExperiment. This would enable the user to subset the MAE by a set of cell lines of interest, and easily retrieve all omics data pertaining to that subset of lines.
If you think this is a good idea, I am also happy to contribute an initial PR if more hands would be appreciated.
Best, Allison