UCLouvain-CBIO / depmap

Cancer Dependency Map package
https://uclouvain-cbio.github.io/depmap/
24 stars 7 forks source link

serve data as MultiAssayExperiment? #60

Open allisonvuong opened 3 years ago

allisonvuong commented 3 years ago

Hi,

Thank you for this very useful package. As many of the CCLE lines have now been screened in Achilles experiments, I'm wondering whether any thought has been given to harmonizing and serving this data as a MultiAssayExperiment. This would enable the user to subset the MAE by a set of cell lines of interest, and easily retrieve all omics data pertaining to that subset of lines.

If you think this is a good idea, I am also happy to contribute an initial PR if more hands would be appreciated.

Best, Allison

lgatto commented 3 years ago

Thank you for your interest, Allison. An MAE would be very much appreciated indeed. If you have time to send a PR, that would be great. Note that the data are disseminated through ExperimentHub, so I assume we would use the same mechanism for the MAE.

allisonvuong commented 3 years ago

Hi,

Okay great, I will do that!

Best, Allison

lgatto commented 3 years ago

See also this comment

tfkillian commented 2 years ago

I have created a script to convert most of the DepMap datasets to a MultiAssayExperiment object. However, I'm a bit stuck coercing the mutationCalls dataset to a RangedSummarizedExperiment Does anyone have any suggested solutions?

## Create MultiAssayExperiment object out of the most current Depmap datasets

# TODO:
# 1) convert the mutationCalls to a RangedSummarizedExperiment
# 2) make MAE out of all datesets

## load libraries
# BiocManager::install(
#   c("MultiAssayExperiment", "SummarizedExperiment", "S4Vectors", "dplyr",
#     "tidyr", "tibble", "readr", "ExperimentHub", "depmap"))

library("MultiAssayExperiment")
library("SummarizedExperiment")
library("S4Vectors")
library("dplyr")
library("tidyr")
library("tibble")
library("readr")
library("ExperimentHub")
library("depmap")

Sys.setenv(VROOM_CONNECTION_SIZE = 500072)

## output file path
file_path <- "media/seq-srv-05/vrc/Project/Project_Theo" ## server
#file_path <- "~/tmp/multiomics/data/" ## local

#################### depmap `metadata_21Q4` dataset ############################
## "metadata" dataset will serve as MAE colData
readr::read_csv("https://ndownloader.figshare.com/files/31316011") %>% 
  as.data.frame() -> metadata_21Q4
names(metadata_21Q4)[1:26] <- c(
    "depmap_id", "cell_line_name", "stripped_cell_line_name", "cell_line",
    "aliases", "cosmic_id", "sex", "source", "Achilles_n_replicates",
    "cell_line_NNMD", "culture_type", "culture_medium", "cas9_activity",
    "RRID", "WTSI_master_cell_ID", "sample_collection_site",
    "primary_or_metastasis", "primary_disease", "subtype_disease", "age",
    "sanger_id", "additional_info", "lineage", "lineage_subtype",
    "lineage_sub_subtype", "lineage_molecular_subtype")
rownames(metadata_21Q4) <- metadata_21Q4$depmap_id

######################## depmap dep_2_name  ###################################
### `dep_2_name` to add `depmap_id` or `cell_line` to other datasets
metadata_21Q4 %>% dplyr::select(depmap_id, cell_line) -> dep_2_name_21Q4

###################### depmap `mutationCalls_21Q4` dataset #####################
mutationCalls_21Q4 <- readr::read_csv("https://ndownloader.figshare.com/files/31315930")
names(mutationCalls_21Q4)[1:32] <- c(
    "gene_name", "entrez_id", "ncbi_build", "chromosome", "start_pos",
    "end_pos", "strand", "var_class", "var_type", "ref_allele",
    "tumor_seq_allele1", "dbSNP_RS", "dbSNP_val_status", "genome_change",
    "annotation_trans", "depmap_id", "cDNA_change", "codon_change",
    "protein_change", "is_deleterious", "is_tcga_hotspot", "tcga_hsCnt",
    "is_cosmic_hotspot", "cosmic_hsCnt", "ExAC_AF", "var_annotation",
    "CGA_WES_AC", "HC_AC", "RD_AC", "RNAseq_AC", "sanger_WES_AC", "WGS_AC")
mutationCalls_21Q4 %>% dplyr::select(depmap_id, everything()) -> mutationCalls_21Q4

######################## depmap `copyNumber_21Q4` dataset ######################
readr::read_csv("https://ndownloader.figshare.com/files/31315924") %>% 
  as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) -> copyNumber_21Q4
names(copyNumber_21Q4) <- gsub("&", ";", sub(" \\(.+\\)$", "", names(copyNumber_21Q4)))
copyNumber_21Q4 %>%
  tibble::column_to_rownames(var = "depmap_id") %>%
  t() %>% as.data.frame() -> copyNumber_21Q4_mat

## build SE object
data.frame(Samples = names(copyNumber_21Q4_mat)) -> copyNumber_colData
rownames(copyNumber_colData) <- copyNumber_colData$Samples
SummarizedExperiment(assays = copyNumber_21Q4_mat,
                     colData = copyNumber_colData) -> copyNumber_se

######################## depmap `crispr_21Q4` dataset ##########################
readr::read_csv("https://ndownloader.figshare.com/files/31315996") %>%
    as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) -> crispr_21Q4
names(crispr_21Q4) <- gsub("&", ";", sub(" \\(.+\\)$", "", names(crispr_21Q4)))
crispr_21Q4 %>%
  tibble::column_to_rownames(var = "depmap_id") %>%
  t() %>% as.data.frame() -> crispr_21Q4_mat

## build SE object
data.frame(Samples = names(crispr_21Q4_mat)) -> crispr_colData
rownames(crispr_colData) <- crispr_colData$Samples
SummarizedExperiment(assays = crispr_21Q4_mat,
                     colData = crispr_colData) -> crispr_se

######################## depmap `TPM_21Q4` dataset #############################
readr::read_csv("https://ndownloader.figshare.com/files/31315882") %>%
  as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) -> TPM_21Q4
names(TPM_21Q4) <- gsub("&", ";", sub(" \\(.+\\)$", "", names(TPM_21Q4)))
TPM_21Q4 %>%
  tibble::column_to_rownames(var = "depmap_id") %>%
  t() %>% as.data.frame() -> TPM_21Q4_mat

## build SE object
data.frame(Samples = names(TPM_21Q4_mat)) -> TPM_colData
rownames(TPM_colData) <- TPM_colData$Samples
SummarizedExperiment(assays = TPM_21Q4_mat,
                     colData = TPM_colData) -> TPM_se

##################### depmap `RPPA_19Q3` dataset ###############################
## download the 19Q3 metadata
eh <- ExperimentHub()
query(eh, "depmap")
eh[["EH3086"]] -> metadata_19Q3
metadata_19Q3 %>% dplyr::select(depmap_id, cell_line) -> dep_2_name_19Q3

### loading data (downloading .csv file from online source)
readr::read_csv(
  paste0("https://depmap.org/portal/download/api/download?file_name=ccle%2Fccle",
         "_2019%2FCCLE_RPPA_20181003.csv&bucket=depmap-external-downloads")) %>% 
  dplyr::rename(cell_line = names(.)[1]) %>% 
  dplyr::left_join(dep_2_name_19Q3, by = "cell_line") %>% 
  dplyr::select(depmap_id, cell_line, everything()) -> RPPA_19Q3

## fill in NA depmap ID value (source):
## https://web.expasy.org/cellosaurus/CVCL_9980 # NCIH684_LIVER
## https://web.expasy.org/cellosaurus/CVCL_3386 # KE97_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
RPPA_19Q3$depmap_id[RPPA_19Q3$cell_line == "NCIH684_LIVER"] <- "ACH-000089"
RPPA_19Q3$depmap_id[RPPA_19Q3$cell_line == "KE97_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE"] <- "ACH-000167"
RPPA_19Q3 %>% dplyr::select(-cell_line) %>% 
  tibble::column_to_rownames(var = "depmap_id") %>%
  t() %>% as.data.frame() -> RPPA_19Q3_mat

## build SE object
data.frame(Samples = names(RPPA_19Q3_mat)) -> RPPA_colData
rownames(RPPA_colData) <- RPPA_colData$Samples
SummarizedExperiment(assays = RPPA_19Q3_mat,
                     colData = RPPA_colData) -> RPPA_se

##################### depmap `proteomic_20Q2` dataset ##########################
read_csv("https://gygi.hms.harvard.edu/data/ccle/protein_quant_current_normalized.csv.gz") %>% 
  dplyr::select(Gene_Symbol, Uniprot_Acc, MDAMB468_BREAST_TenPx01:last_col()) -> proteomic_20Q2

## manually add gene names that are NA (these were taken from Uniprot)
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "H7BZ55"] <- "CROCC2" # 270
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "A6NL28"] <- "LOC101929943" # 576
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q9H552"] <- "KRT8P11" # 1054
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "S4R362"] <- "SEPTIN1" #3233
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q96FF7"] <- "MISP3" # 4833
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q2M2H8"] <- "MGAM2" # 7933
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P01614"] <- "IGKV2D-40" # 8523
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P01619"] <- "IGKV3-20" # 8623
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q69YL0"] <- "NCBP2AS2" # 10005
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "I3L1I5"] <- "ARHGEF18" # 10487
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "E9PS84"] <- "AP000783.1" # 10676
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P60507"] <- "ERVFC1" # 10822
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "H3BQF6"] <- "RP11-12J10.3" # 11313
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "C9J7I0"] <- "UMAD1" # 11632
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q902F9"] <- "HERVK_113" # 11691
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P63135"] <- "ERVK-7" # 11692
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "H3BMH7"] <- "RP11-178L8.4" # 11735
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P01780"] <- "IGHV3-7" # 12070

proteomic_20Q2 %>%
  dplyr::filter(!is.na(Gene_Symbol)) %>%
  dplyr::filter(!duplicated(Gene_Symbol)) %>%
  tibble::column_to_rownames(var = "Gene_Symbol") %>% 
  dplyr::select(-Uniprot_Acc) -> proteomic_20Q2
names(proteomic_20Q2) <- gsub('(.*)_\\w+', '\\1', names(proteomic_20Q2))

proteomic_20Q2 %>% 
  t() %>% as.data.frame() %>%
  tibble::rownames_to_column() %>%
  dplyr::rename(cell_line = rowname) %>% 
  dplyr::left_join(dep_2_name_21Q4, by = "cell_line") %>% 
  dplyr:: select(cell_line, depmap_id, everything()) -> proteomic_20Q2

## manually add depmap_ids that are NA (these were taken from CCLE)
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "SW948_LARGE_INTESTINE.1"] <- "ACH-000680" 
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "CAL120_BREAST.1"] <- "ACH-000212" 
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "X8505C_THYROID"] <- "ACH-001307" 
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "HCT15_LARGE_INTESTINE.1"] <- "ACH-000997"
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "SW948_LARGE_INTESTINE"] <- "ACH-000680"

proteomic_20Q2 %>%
  dplyr::filter(!is.na(depmap_id)) %>%
  dplyr::filter(!duplicated(depmap_id)) %>%
  dplyr::select(-cell_line) %>%
  tibble::column_to_rownames(var = "depmap_id") %>%
  t() %>% as.data.frame() -> proteomic_20Q2_mat

data.frame(Samples = names(proteomic_20Q2_mat),
           row.names = proteomic_colData$Samples) -> proteomic_colData

## build SE object
SummarizedExperiment(assays = proteomic_20Q2_mat,
                     colData = proteomic_colData) -> proteomic_se

#################### depmap `drug_dependency_21Q4` dataset #####################
# read_csv("https://ndownloader.figshare.com/files/17741420") %>% 
#   as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) %>% 
#   dplyr::mutate(depmap_id = gsub("_FAILED_STR", "", depmap_id)) %>% 
#   tibble::column_to_rownames(var = "depmap_id") %>% 
#   as.matrix() %>% t() -> drug_dep_19Q3_w

#################### depmap `drug_dependency_21Q4` metadata ####################
## data cleaning of `drug_sensativity` dataset and screen info
# read_csv("https://ndownloader.figshare.com/files/20237715") %>% 
#   rename(compound = column_name) -> screen_info
# eh[["EH3087"]] -> drug_dep_19Q3

############################# build depmap MAE #################################
## Combine to a named list and call the ExperimentList constructor function
list(crispr_21Q4 = crispr_21Q4_se,
     copyNumber_21Q4 = copyNumber_21Q4_se,
     TPM_21Q4 = TPM_21Q4_se,
     proteomic_20Q2 = proteomic_20Q2_se,
     # mutationCalls_21Q4 = mutationCalls_21Q4_mat,
     # drug_dep_19Q3 = drug_dep_19Q3,
     RPPA_19Q3 = RPPA_19Q3_se) -> assayList

## Use the ExperimentList constructor
exp_list <- ExperimentList(assayList)

## build colData, a DataFrame describing the characteristics of biological units
mae_coldata <- DataFrame(metadata_21Q4)

## build MAE
MultiAssayExperiment::MultiAssayExperiment(
    experiments = exp_list,
    colData = mae_coldata #,
    #sampleMap = sample_map
    ) -> depmap_mae

# saveRDS(depmap_mae, file = paste0(file_path, "depmap_mae.rds"))