jump-cellpainting / JUMP-Target

Lists and 384-well plate maps of compounds and genetic perturbations designed to assess connectivity in profiling assays
MIT License
16 stars 5 forks source link

Clarify that a compound may have more annotated targets than indicated in the metadata #8

Open shntnu opened 3 years ago

shntnu commented 3 years ago

The full list of targets is available here: https://github.com/jump-cellpainting/jump-cellpainting/blob/master/0.design-pilots/output/drug_target_samples.csv

These 7 compounds are not listed in that CSV, so we'd need to fetch their full target lists from somewhere else, probably https://github.com/broadinstitute/lincs-cell-painting:

broad_sample gene pert_id
BRD-K01824976-300-02-9 RPL23A BRD-K01824976
BRD-K01825690-065-01-9 CYP1A2 BRD-K01825690
BRD-K75390981-300-04-9 HRH4 BRD-K75390981
BRD-K66430217-001-03-8 CLK1 BRD-K66430217
BRD-K01826724-003-07-9 KCTD16 BRD-K01826724
BRD-K01825589-310-02-9 KCNH7 BRD-K01825589
BRD-K01825660-322-01-9 TBXAS1 BRD-K01825660
shntnu commented 3 years ago

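Notebook to produce the full target list (`JUMP-Target_compound_metadata_all_targets.csv`) and the compound–ORF–CRISPR connections (`JUMP-Target_compounds_crispr_orf_connections.csv`):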

R Notebook

library(tidyverse)
# Compound metadata table (one row per compound sample).
dfmc <- read_tsv("JUMP-Target-add_files/JUMP-Target_compound_metadata.tsv")

# Drug-target annotations from the jump-cellpainting design repository.
# The raw-content access token is read from the GITHUB_RAW_TOKEN environment
# variable instead of being hard-coded: a token committed to source is a leaked
# credential and stops working once it expires. Leave the variable unset when
# the file is readable without a token.
drug_target_samples_url <- paste0(
  "https://raw.githubusercontent.com/jump-cellpainting/jump-cellpainting/",
  "master/0.design-pilots/output/drug_target_samples.csv"
)
github_raw_token <- Sys.getenv("GITHUB_RAW_TOKEN")
if (nzchar(github_raw_token)) {
  drug_target_samples_url <- paste0(
    drug_target_samples_url, "?token=", github_raw_token
  )
}
drug_target_samples <- read_csv(drug_target_samples_url)
# Unique compound names found in the metadata. Each pert_iname cell is split
# on "|" into individual names, the pieces are whitespace-trimmed, and the
# result is expanded to one row per name before de-duplicating.
dfmc_pert_iname <- dfmc %>%
  select(pert_iname) %>%
  mutate(pert_iname = map(str_split(pert_iname, "\\|"), str_trim)) %>%
  unnest(cols = pert_iname) %>%
  distinct(pert_iname)
# Number of distinct compound names after splitting.
dfmc_pert_iname %>%
  tally() %>%
  knitr::kable()
n
304
# Number of compound names that have at least one (pert_iname, target) entry
# in drug_target_samples. The join key is left implicit (shared columns).
dfmc_pert_iname %>%
  inner_join(distinct(drug_target_samples, pert_iname, target)) %>%
  distinct(pert_iname) %>%
  tally() %>%
  knitr::kable()
## Joining, by = "pert_iname"
n
304
# Full target table: every (compound name, sample) pair from the metadata,
# expanded to one row per target listed for that name in drug_target_samples.
# Names are split on "|" and trimmed in the same way as for dfmc_pert_iname.
dfmc_full_targets <- dfmc %>%
  mutate(pert_iname = map(str_split(pert_iname, "\\|"), str_trim)) %>%
  unnest(cols = pert_iname) %>%
  distinct(pert_iname, broad_sample) %>%
  inner_join(
    inner_join(
      dfmc_pert_iname,
      distinct(drug_target_samples, pert_iname, target)
    )
  )
## Joining, by = "pert_iname"
## Joining, by = "pert_iname"
# Append the targets already recorded in the compound metadata itself, then
# de-duplicate. The appended rows take pert_iname verbatim from dfmc, i.e.
# without the "|" splitting and trimming applied when building the table.
dfmc_full_targets <- dfmc_full_targets %>%
  bind_rows(distinct(dfmc, broad_sample, pert_iname, target)) %>%
  distinct(broad_sample, pert_iname, target)
# Number of distinct compound names in the combined target table.
dfmc_full_targets %>%
  distinct(pert_iname) %>%
  tally() %>%
  knitr::kable()
n
305
# Number of distinct pert_ids, where pert_id is the first 13 characters of
# broad_sample.
dfmc_full_targets %>%
  distinct(pert_id = str_sub(broad_sample, 1, 13)) %>%
  tally() %>%
  knitr::kable()
n
306
# Save the combined compound-target table.
write_csv(dfmc_full_targets, "JUMP-Target_compound_metadata_all_targets.csv")
# Load the compound metadata table from its TSV file.
dfmc <-
  "JUMP-Target-add_files/JUMP-Target_compound_metadata.tsv" %>%
  read_tsv()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   broad_sample = col_character(),
##   InChIKey = col_character(),
##   pert_iname = col_character(),
##   pubchem_cid = col_double(),
##   target = col_character(),
##   pert_type = col_character(),
##   control_type = col_character(),
##   smiles = col_character()
## )
# Load the ORF metadata table from its TSV file.
dfmo <-
  "JUMP-Target-add_files/JUMP-Target_orf_metadata.tsv" %>%
  read_tsv()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   broad_sample = col_character(),
##   genes = col_character(),
##   pert_type = col_character(),
##   control_type = col_character()
## )
# Load the CRISPR metadata table from its TSV file.
dfmx <-
  "JUMP-Target-add_files/JUMP-Target_crispr_metadata.tsv" %>%
  read_tsv()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   broad_sample = col_character(),
##   genes = col_character(),
##   pert_type = col_character(),
##   control_type = col_character(),
##   target_sequence = col_character()
## )
# Per-modality (sample, gene) tables that share a `gene` column so they can be
# joined on it. Sample columns are renamed to identify the modality.
dfmc1 <- dfmc_full_targets %>%
  select(broad_sample_compound = broad_sample, pert_iname, gene = target) %>%
  distinct()
dfmo1 <- dfmo %>%
  select(broad_sample_orf = broad_sample, gene = genes) %>%
  distinct()
# Rows with any missing value are dropped for the CRISPR table only.
dfmx1 <- dfmx %>%
  select(broad_sample_crispr = broad_sample, gene = genes) %>%
  distinct() %>%
  na.omit()

# Distinct non-missing gene symbols seen in each modality.
dfcg1 <- dfmc1 %>%
  filter(!is.na(gene)) %>%
  distinct(gene) %>%
  pull(gene)
dfog <- dfmo1 %>%
  filter(!is.na(gene)) %>%
  distinct(gene) %>%
  pull(gene)
dfxg <- dfmx1 %>%
  filter(!is.na(gene)) %>%
  distinct(gene) %>%
  pull(gene)

# Compound-ORF-CRISPR connections: keep only genes present in all three
# tables by inner-joining them in turn (join key left implicit), then put the
# gene column first.
connections <- list(dfmc1, dfmo1, dfmx1) %>%
  reduce(inner_join) %>%
  select(
    gene,
    broad_sample_compound,
    pert_iname,
    broad_sample_orf,
    broad_sample_crispr
  )
## Joining, by = "gene"

## Joining, by = "gene"
# Number of distinct genes in the connections table.
connections %>%
  distinct(gene) %>%
  tally() %>%
  knitr::kable()
n
160
# Number of distinct compound-ORF sample pairs.
connections %>%
  distinct(broad_sample_compound, broad_sample_orf) %>%
  tally() %>%
  knitr::kable()
n
466
# Number of distinct compound-CRISPR sample pairs.
connections %>%
  distinct(broad_sample_compound, broad_sample_crispr) %>%
  tally() %>%
  knitr::kable()
n
893
# Number of distinct compound-ORF sample pairs (this count is computed a
# second time here).
connections %>%
  distinct(broad_sample_compound, broad_sample_orf) %>%
  tally() %>%
  knitr::kable()
n
466
# Number of distinct ORF-CRISPR sample pairs.
connections %>%
  distinct(broad_sample_orf, broad_sample_crispr) %>%
  tally() %>%
  knitr::kable()
n
305
# Save the connections table.
write_csv(connections, "JUMP-Target_compounds_crispr_orf_connections.csv")
# Distribution of compounds per ORF: first count the distinct compounds
# connected to each ORF, then count how many ORFs share each compound count.
connections %>%
  distinct(broad_sample_compound, broad_sample_orf) %>%
  count(broad_sample_orf, name = "n_compounds") %>%
  count(n_compounds, name = "n_orfs") %>%
  select(n_orfs, n_compounds) %>%
  knitr::kable()
n_orfs n_compounds
6 1
83 2
34 3
19 4
8 5
4 6
1 7
2 8
1 9
2 10