YuLab-SMU / clusterProfiler

:bar_chart: A universal enrichment tool for interpreting omics data
https://yulab-smu.top/biomedical-knowledge-mining-book/
1k stars 253 forks source link

Simplify to work on enricher() result when using GO collection from MSigDB #369

Open BioinformaNicks opened 3 years ago

BioinformaNicks commented 3 years ago

So I'm trying to run enricher() on both the MSigDB Hallmark set and the C5 (GO) set, by doing the following:

# Build a de-duplicated two-column table (gene set name, Entrez gene ID)
# for a single MSigDB category of human gene sets.
msigdb_term2gene <- function(category) {
  gene_sets <- msigdbr::msigdbr(species = "Homo sapiens", category = category)
  gene_sets <- dplyr::select(gene_sets, gs_name, entrez_gene)
  dplyr::distinct(gene_sets)
}

# Hallmark ("H") gene sets first, then the C5 (GO) gene sets.
m_t2g <- msigdb_term2gene("H")
go_m_t2g <- msigdb_term2gene("C5")

# Stack both collections into the single TERM2GENE table given to enricher().
m_t2g <- bind_rows(m_t2g, go_m_t2g)

# The names of `enrich_ready` are used as the query gene identifiers.
enriched <- enricher(names(enrich_ready), TERM2GENE = m_t2g)

However, this leads to a lot of redundant GO terms in the enrichResult object. While simplify() can be applied to enrichGO() results, it cannot be applied to enricher() results, even when the gene sets are GO terms. Would it be possible to add this functionality?

Alternatively, do you know of any way to filter out the redundancy in the msigdbr C5 (GO) selection beforehand?

BioinformaNicks commented 3 years ago

So I think I got it working with a workaround:

# Hallmark ("H") gene sets as a de-duplicated TERM2GENE table
# (gene set name, Entrez gene ID).
m_t2g <- msigdbr::msigdbr(species = "Homo sapiens", category = "H") %>%
  dplyr::select(gs_name, entrez_gene) %>%
  dplyr::distinct()

# Load the C5 collection once and drop the "HPO" subcategory. The full table
# is kept as `go_jointable` so that enrichment results can later be matched
# back to their gs_subcat / gs_exact_source annotation by gs_name.
go_jointable <- msigdbr::msigdbr(species = "Homo sapiens", category = "C5") %>%
  dplyr::filter(gs_subcat != "HPO")

# Derive the C5 TERM2GENE table from the already-loaded data instead of
# querying msigdbr a second time with the same arguments.
go_m_t2g <- go_jointable %>%
  dplyr::select(gs_name, entrez_gene) %>%
  dplyr::distinct()

# Combined TERM2GENE table: Hallmark sets followed by the retained C5 sets.
m_t2g <- dplyr::bind_rows(m_t2g, go_m_t2g)

then

# Run the enrichment against the combined TERM2GENE table. The names of
# `enrich_ready` are used as the query gene identifiers.
enriched <- clusterProfiler::enricher(names(enrich_ready), TERM2GENE = m_t2g)

# Annotation columns copied from `go_jointable` onto each result row.
go_annotation_cols <- c("gs_name", "gs_subcat", "gs_exact_source")

# Columns kept for the GO part of the result, in the order simplify() is fed.
result_cols <- c(
  "ONTOLOGY", "ID", "Description", "GeneRatio", "BgRatio",
  "pvalue", "p.adjust", "qvalue", "geneID", "Count"
)

# Look up every enriched term in `go_jointable` by gene set name. Terms that
# are not found there get NA in all three annotation columns.
enricher_result <- enriched@result
go_annotation <- go_jointable[
  match(enricher_result$ID, go_jointable$gs_name),
  go_annotation_cols
]
enricher_result <- dplyr::mutate(enricher_result, go_annotation)

# Rows without a match are set aside untouched (presumably the Hallmark sets,
# since `go_jointable` only holds C5 entries -- verify against m_t2g).
hallmark_result <- enricher_result %>%
  dplyr::filter(is.na(gs_name)) %>%
  dplyr::select(!dplyr::all_of(go_annotation_cols))

# Matched rows: take the ontology from the second ':'-separated field of
# gs_subcat and replace the ID with gs_exact_source so simplify() receives
# the source identifiers rather than MSigDB gene set names.
go_result <- dplyr::filter(enricher_result, !is.na(gs_name))
go_result$ONTOLOGY <- stringr::str_split(
  go_result$gs_subcat, ":", simplify = TRUE
)[, 2]
go_result$ID <- go_result$gs_exact_source
go_result <- dplyr::select(go_result, dplyr::all_of(result_cols))
rownames(go_result) <- go_result$ID

# Simplify the rows of `go_table` belonging to one ontology and return the
# reduced result table.
#
# x         enrichResult object used as the carrier for simplify(); only a
#           local copy is modified.
# go_table  data frame with an ONTOLOGY column and GO identifiers in ID.
# ontology  one of "BP", "MF", "CC".
#
# simplify() takes the ontology from the object's `ontology` slot, so the
# slot is set to the ontology of the subset being simplified; keeping it at
# "BP" for MF/CC terms would compare them against the wrong ontology.
simplify_go_subset <- function(x, go_table, ontology) {
  go_subset <- go_table[go_table$ONTOLOGY %in% ontology, , drop = FALSE]
  if (nrow(go_subset) == 0L) {
    # Nothing enriched in this ontology: nothing to simplify.
    return(go_subset)
  }
  x@ontology <- ontology
  x@result <- go_subset
  clusterProfiler::simplify(x)@result
}

enriched@keytype <- "ENTREZID"
enriched@organism <- "Homo sapiens"

simplified_bp <- simplify_go_subset(enriched, go_result, "BP")
simplified_mf <- simplify_go_subset(enriched, go_result, "MF")
simplified_cc <- simplify_go_subset(enriched, go_result, "CC")

# Merge the simplified GO terms, drop the helper ONTOLOGY column and set the
# ID back to Description (presumably the original gene set name, as no
# TERM2NAME was passed to enricher() -- verify), then re-attach the rows that
# were set aside.
total_simplified <- dplyr::bind_rows(
  simplified_bp, simplified_mf, simplified_cc
) %>%
  dplyr::select(!ONTOLOGY) %>%
  dplyr::mutate(ID = Description)
rownames(total_simplified) <- total_simplified$ID
total_simplified <- dplyr::bind_rows(total_simplified, hallmark_result)

# Store the merged table back on the object, ordered by adjusted p-value.
# The `ontology` slot of `enriched` is left as enricher() set it, because the
# final table mixes terms from several ontologies and non-GO gene sets.
enriched@result <- total_simplified[order(total_simplified$p.adjust), ]