Genentech / FacileDataSet

A fluent API for accessing multi-assay high-throughput genomics data.
MIT License
4 stars 0 forks source link

Identify and create a new and public (large-ish) multiassay dataset. #12

Open lianos opened 6 years ago

lianos commented 6 years ago

We'll need to have this in place to more easily showcase the FacileData API.

Some choices could include:

lianos commented 5 years ago

I guess CCLE is the only option, we can get:

  1. Gene-level expression
  2. Transcript-level expression (where?)
    • I know I've seen this already processed from somewhere, as well.
  3. Gene-level CNV values from MSSM/Harmonizome
  4. Gene-level mutation calls MSSM/Harmonizome
  5. Gene essentiality / achilles
phaverty commented 5 years ago

I’ve got a script to organize all the CCLE stuff. Will share.

On Wed, Sep 26, 2018 at 3:17 PM Steve Lianoglou notifications@github.com wrote:

I guess CCLE is the only option, we can get:

  1. Gene-level expression (where?)
  2. Transcript-level expression (where?)
  3. Gene-level CNV values from MSSM/Harmonizome http://amp.pharm.mssm.edu/Harmonizome/dataset/CCLE+Cell+Line+Gene+CNV+Profiles

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/Genentech/FacileDataSet/issues/12#issuecomment-424887850, or mute the thread https://github.com/notifications/unsubscribe-auth/AH02KyUazseqnyP_iL5O1BnGAazyytNhks5ue_z3gaJpZM4WIMUL .

-- Pete


Peter M. Haverty, Ph.D. Genentech, Inc. phaverty@gene.com

phaverty commented 5 years ago

I use these files CCLE_copynumber_byGene_2013-12-03.txt CCLE_RNAseq_081117.rpkm.gct ccle2maf_081117.txt

from here: https://portals.broadinstitute.org/ccle/data

' Build long-form table of CCLE RPKM values

'

' Does conversion from CCLE Ensembl IDs to Entrez

' @param ccle_expr_file single character, *rpkm.gct file from CCLE FTP

' @return data.frame with ccle_name, gene_id and rpkm columns

' @export

build_ccle_expr <- function(ccle_expr_file) { expr = read_tsv(ccle_expr_file, skip = 2) colnames(expr)[1] = "Xref" expr$Xref = gsub("\.\d+$", "", expr$Xref) expr[[2]] = NULL expr_ginfo = translate(expr$Xref, org.Hs.egENSEMBL2EG, return.list = FALSE) expr_ginfo$to = paste0("GeneID:", expr_ginfo$to) colnames(expr_ginfo) = c("gene_id", "Xref") expr_ginfo = expr_ginfo[order(as.integer(gsub("ENSG", "", expr_ginfo$Xref))), ] expr_ginfo = expr_ginfo[!duplicated(expr_ginfo$gene_id), ] expr_ginfo = expr_ginfo[!duplicated(expr_ginfo$Xref), ] stopifnot(length(setdiff(expr_ginfo$Xref, expr$Xref)) == 0) stopifnot(!anyDuplicated(expr$Xref)) expr = inner_join(expr_ginfo, expr, by = "Xref") expr$Xref = NULL expr_long = melt(expr, id.vars = c("gene_id"), variable.name = "ccle_name", value.name = "rpkm") expr_long }

' Build long-form table of CCLE relative copy number values

'

' Build long-form table of CCLE relative copy number values

' @param ccle_cn_file single character, e.g.

CCLE_copynumber_byGene_2013-12-03.txt from CCLE FTP

' @return data.frame with ccle_name, gene_id and relative_cn (log2ratio)

columns

' @export

build_ccle_cn <- function(ccle_cn_file) { cn = read_tsv(ccle_cn_file) cn = dplyr::select(cn, -c(SYMBOL, CHR, CHRLOC, CHRLOCEND)) cn = dplyr::rename(cn, gene_id = EGID) cn$gene_id = paste0("GeneID:", cn$gene_id) cn_long = melt(cn, id.vars = "gene_id", variable.name = "ccle_name", value.name = "relative_cn") cn_long }

' Build table of mutation details for CCLE lines

'

' Build table of mutation details for CCLE lines

' @param ccle_mut_file single character, e.g. ccle2maf_081117.txt from

CCLE FTP

' @param genes data.frame, from build_genes_table

' @param ccle_sinfo data.frame, from sinfo_from_ceres

' @return data.frame

' @export

build_ccle_fullgene <- function(ccle_mut_file, genes, ccle_sinfo) { fullgene = read_tsv(ccle_mut_file, col_types = "cccciicccccccccccccllilidccccccc") fullgene = dplyr::select(fullgene, -Hugo_Symbol) fullgene = dplyr::rename(fullgene, gene_id = Entrez_Gene_Id, ccle_name = Tumor_Sample_Barcode) fullgene = filter(fullgene, gene_id != 0) # WTF, Broad? Other id/symbol pairs seem ok

fullgene$gene_id = paste0("GeneID:", fullgene$gene_id)

## Add allele frequency, allele counts seem to be ALT:REF as there are

lots of X:0

## and no 0:X x = group_by(fullgene, ccle_name) foo = summarize(x, WES

=

## all(is.na(WES_AC)), WGS = all(is.na(WGS_AC)), RNA =

all(is.na(RNAseq_AC)))

## variants appear to mostly be from RNASeq

fullgene$joint_AC = ifelse(!is.na(fullgene$WGS_AC), fullgene$WGS_AC,

ifelse(!is.na(fullgene$WES_AC),

fullgene$WES_AC, ifelse(!is.na(fullgene$RNAseq_AC), fullgene$RNAseq_AC,

fullgene$SangerRecalibWES_AC))) fullgene$altcount = as.integer(gsub("^(\d+):.$", "\1", fullgene$joint_AC)) fullgene$refcount = as.integer(gsub("^.:(\d+)$", "\1", fullgene$joint_AC)) fullgene$variant_frequency = fullgene$altcount/(fullgene$altcount + fullgene$refcount) fullgene$altcount = fullgene$refcount = fullgene$joint_AC = NULL

## Assign hotspot and lof status

fullgene = mutate(fullgene, is_hot = isTCGAhotspot | isCOSMIChotspot)
fullgene = mutate(fullgene, is_lof = is_hot | (isDeleterious & (ExAC_AF

< 0.01 | is.na(ExAC_AF))))

## Join in gene_symbol and tissue

fullgene = left_join(dplyr::select(genes, gene_id, gene_symbol),

fullgene, by = "gene_id") fullgene = left_join(dplyr::select(ccle_sinfo, ccle_name, cell_line, tissue), fullgene, by = "ccle_name") fullgene = dplyr::select(fullgene, gene_id, gene_symbol, ccle_name, cell_line, tissue, everything())

fullgene

}

' Build long-form summary table of gene mutation status

'

' Contains loss-of-function and hotspot annotations per sample and gene

' @param fullgene data.frame from build_ccle_fullgene()

' @return data.frame

' @export

build_ccle_mut <- function(fullgene) {

Make lof and hotspot tables

## -Inf varfreq becomes 0

lof_long = dplyr::summarize(group_by(filter(fullgene, is_lof), gene_id,

ccle_name), max_lof_varfreq = max(variant_frequency, na.rm = TRUE))

hot_long = dplyr::summarize(group_by(filter(fullgene, is_hot), gene_id,

ccle_name), max_hot_varfreq = max(variant_frequency, na.rm = TRUE))

mut_long = full_join(lof_long, hot_long, by = c("gene_id", "ccle_name"))
mut_long = mutate(
    mut_long,
    max_lof_varfreq = ifelse(is.infinite(max_lof_varfreq), 0,

max_lof_varfreq), max_hot_varfreq= ifelse(is.infinite(max_hot_varfreq), 0, max_hot_varfreq) ) mut_long }

' Join ceres, expression, copy number and mutation status tables

'

' Join ceres, expression, copy number and mutation status tables

' @param ceres_long data.frame, from build_ceres_table

' @param expr_long data.frame, from build_ccle_expr

' @param cn_long data.frame, from build_ccle_cn

' @param mut_long data.frame, from build_ccle_mut

' @param genes data.frame, from build_genes_table or

add_gene_drugability

' @param ccle_sinfo data.frame, from sinfo_from_ceres

' @return data.frame

' @export

build_ccle_genomics <- function(ceres_long, expr_long, cn_long, mut_long, genes, ccle_sinfo) {

expr_long = filter(expr_long, ccle_name %in% ccle_sinfo$ccle_name)
cn_long = filter(cn_long, ccle_name %in% ccle_sinfo$ccle_name)
mut_long = filter(mut_long, ccle_name %in% ccle_sinfo$ccle_name)

stopifnot(!anyDuplicated(ceres_long[, c("gene_id", "ccle_name")]))
stopifnot(!anyDuplicated(expr_long[, c("gene_id", "ccle_name")]))
stopifnot(!anyDuplicated(cn_long[, c("gene_id", "ccle_name")]))
stopifnot(!anyDuplicated(mut_long[, c("gene_id", "ccle_name")]))

genomics_long = full_join(expr_long, cn_long, by = c("gene_id",

"ccle_name")) genomics_long = full_join(genomics_long, mut_long, by = c("gene_id", "ccle_name"))

lines_missing_genomics = setdiff(ceres_long$ccle_name,

genomics_long$ccle_name)

write.csv(lines_missing_genomics, file =

"broad_lines_with_ceres_not_ccle.csv")

genomics_long = genomics_long[genomics_long$gene_id %in%

ceres_long$gene_id, ] genomics_long = full_join(genomics_long, ceres_long, by = c("gene_id", "ccle_name")) genomics_long = left_join(dplyr::select(genes, gene_symbol, gene_id), genomics_long, by = "gene_id") genomics_long = left_join(dplyr::select(ccle_sinfo, ccle_name, cell_line, tissue), genomics_long, by = "ccle_name") genomics_long = dplyr::select(genomics_long, gene_id, gene_symbol, ccle_name, cell_line, tissue, everything())

genomics_long

}

Pete


Peter M. Haverty, Ph.D. Genentech, Inc. phaverty@gene.com

On Wed, Sep 26, 2018 at 5:31 PM, Pete Haverty phaverty@gene.com wrote:

I’ve got a script to organize all the CCLE stuff. Will share.

On Wed, Sep 26, 2018 at 3:17 PM Steve Lianoglou notifications@github.com wrote:

I guess CCLE is the only option, we can get:

  1. Gene-level expression (where?)
  2. Transcript-level expression (where?)
  3. Gene-level CNV values from MSSM/Harmonizome http://amp.pharm.mssm.edu/Harmonizome/dataset/CCLE+Cell+Line+Gene+CNV+Profiles

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/Genentech/FacileDataSet/issues/12#issuecomment-424887850, or mute the thread https://github.com/notifications/unsubscribe-auth/AH02KyUazseqnyP_iL5O1BnGAazyytNhks5ue_z3gaJpZM4WIMUL .

-- Pete


Peter M. Haverty, Ph.D. Genentech, Inc. phaverty@gene.com

lianos commented 5 years ago

Thanks, Pete: this should prove very helpful!