lgeistlinger / EnrichmentBrowser

Seamless navigation through combined results of set-based and network-based enrichment analysis
20 stars 11 forks source link

Error in .org2pkg(anno): unrecognized organism ID 'mcf' #49

Open Shi-YuZhang opened 10 months ago

Shi-YuZhang commented 10 months ago

Thank you for the convenient tool. However, when I run mcf <- getGenesets(org = "mcf", db = "kegg", gene.id.type = "SYMBOL",cache = TRUE, return.type="list"), it returns

Error in .org2pkg(anno): unrecognized organism ID 'mcf'
Traceback:

1. getGenesets(org = "mcf", db = "kegg", gene.id.type = "SYMBOL", 
 .     cache = TRUE, return.type = "list")
2. .getKEGG(org, gene.id.type, cache, return.type)
3. .dwnldAllKeggGS(pwys, gene.id.type, cache, return.type)
4. suppressMessages(gs <- idMap(gs, org, from = "ENTREZID", to = gene.id.type))
5. withCallingHandlers(expr, message = function(c) if (inherits(c, 
 .     classes)) tryInvokeRestart("muffleMessage"))
6. idMap(gs, org, from = "ENTREZID", to = gene.id.type)
7. .idMapGS(obj, org, from, to, multi.to)
8. .mapStats(sgenes, org, from, to, multi.to)
9. .getAnnoPkg(org)
10. .org2pkg(anno)
11. stop(paste0("unrecognized organism ID '", org, "'"))

I checked the three-letter organism code in the KEGG organism list (https://www.genome.jp/kegg/catalog/org_list.html): "mcf" stands for Macaca fascicularis (crab-eating macaque).

lgeistlinger commented 10 months ago

Note that gene ID mapping is currently only available for certain model organisms for which a Bioconductor OrgDb package exists, i.e. for these organisms:

> BiocManager::available("^org\\.[A-Z]")
 [1] "org.Ag.eg.db"      "org.At.tair.db"    "org.Bt.eg.db"     
 [4] "org.Ce.eg.db"      "org.Cf.eg.db"      "org.Dm.eg.db"     
 [7] "org.Dr.eg.db"      "org.EcK12.eg.db"   "org.EcSakai.eg.db"
[10] "org.Gg.eg.db"      "org.Hs.eg.db"      "org.Mm.eg.db"     
[13] "org.Mmu.eg.db"     "org.Mxanthus.db"   "org.Pt.eg.db"     
[16] "org.Rn.eg.db"      "org.Sc.sgd.db"     "org.Ss.eg.db"     
[19] "org.Xl.eg.db"

That said, you can turn to AnnotationHub and achieve your goal with a couple of extra commands, like this:

  1. Obtain the OrgDb package for Macaca fascicularis from AnnotationHub:
> library(AnnotationHub)
> ah <- AnnotationHub()
> query(ah, c("OrgDb", "Macaca"))
AnnotationHub with 5 records
# snapshotDate(): 2023-10-20
# $dataprovider: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
# $species: Macaca nemestrina, Macaca mulatta, Macaca irus, Macaca fascicula...
# $rdataclass: OrgDb
# additional mcols(): taxonomyid, genome, description,
#   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#   rdatapath, sourceurl, sourcetype 
# retrieve records with, e.g., 'object[["AH114088"]]' 

             title                            
  AH114088 | org.Mmu.eg.db.sqlite             
  AH114408 | org.Macaca_nemestrina.eg.sqlite  
  AH114563 | org.Macaca_cynomolgus.eg.sqlite  
  AH114564 | org.Macaca_fascicularis.eg.sqlite
  AH114565 | org.Macaca_irus.eg.sqlite        

> org.db <- ah[["AH114564"]]
  2. Obtain gene sets for Macaca fascicularis using NCBI Entrez Gene IDs:
> library(EnrichmentBrowser)
> mcf <- getGenesets(org = "mcf", db = "kegg")
> mcf[1:3]
$`mcf00010_Glycolysis_/_Gluconeogenesis`
 [1] "101865076" "101865104" "101865220" "101865678" "101866112" "101866226"
 [7] "101866506" "101866609" "101867330" "101925242" "101925472" "101925598"
[13] "101925612" "101925639" "101925921" "101926834" "102114947" "102115022"
[19] "102116429" "102116859" "102117828" "102118498" "102119787" "102120140"
[25] "102120757" "102120771" "102120901" "102121518" "102121717" "102121906"
[31] "102122039" "102122135" "102122172" "102122307" "102122664" "102123018"
[37] "102123026" "102123383" "102123413" "102124145" "102124195" "102125243"
[43] "102125491" "102125557" "102125836" "102125852" "102127599" "102128279"
[49] "102129493" "102129929" "102130622" "102130836" "102130971" "102131289"
[55] "102131754" "102131932" "102133548" "102133980" "102134594" "102134909"
[61] "102135856" "102137352" "102138258" "102138838" "102138923" "102139491"
[67] "102139551" "102139699" "102139794" "102140005" "102140050" "102140136"
[73] "102140922" "102140927" "102140991" "102141145" "102141498" "102142152"
[79] "102143796" "102144419" "102144494" "102144820" "102145864" "102147228"
[85] "107126374" "123572777"

$`mcf00020_Citrate_cycle_(TCA_cycle)`
 [1] "101865050" "101865119" "101865736" "101866578" "101867049" "101867276"
 [7] "101867314" "101925784" "101925790" "101925901" "101926620" "102118371"
[13] "102122039" "102122263" "102124727" "102125836" "102125852" "102126003"
[19] "102127185" "102127751" "102129545" "102129952" "102131289" "102132064"
[25] "102134309" "102134909" "102135856" "102139439" "102140136" "102140991"
[31] "102142663" "102144513" "102145238" "102145957" "102146568"

$mcf00030_Pentose_phosphate_pathway
 [1] "101865104" "101865220" "101865918" "101865992" "101866782" "101866917"
 [7] "101867414" "101925639" "101925866" "101925921" "102115057" "102115771"
[13] "102117021" "102120422" "102124428" "102125557" "102129493" "102129791"
[19] "102130622" "102130836" "102132916" "102134335" "102135829" "102136564"
[25] "102137352" "102138258" "102140005" "102140428" "102140927" "102141498"
[31] "102142152" "102144892" "102146745"
  3. Map the gene sets from Entrez IDs to gene symbols:
> library(AnnotationDbi)
> mcf.sym <- lapply(mcf, function(s) mapIds(org.db, keys = s, keytype = "ENTREZID", column = "SYMBOL"))
> mcf.sym <- lapply(mcf.sym, unname)
> mcf.sym <- lapply(mcf.sym, sort)
> mcf.sym[1:3]
$`mcf00010_Glycolysis_/_Gluconeogenesis`
 [1] "ACSS1"        "ACSS2"        "ADH4"         "ADH5"         "ADH6"        
 [6] "ADH7"         "ADPGK"        "AKR1A1"       "ALDH1B1"      "ALDH2"       
[11] "ALDH3A1"      "ALDH3A2"      "ALDH3B1"      "ALDH7A1"      "ALDH9A1"     
[16] "ALDOA"        "ALDOB"        "ALDOC"        "BPGM"         "DLAT"        
[21] "DLD"          "ENO1"         "ENO2"         "ENO3"         "ENO4"        
[26] "FBP1"         "FBP2"         "G6PC1"        "G6PC2"        "G6PC3"       
[31] "GALM"         "GAPDH"        "GAPDHS"       "GCK"          "GPI"         
[36] "HK1"          "HK2"          "HK3"          "HKDC1"        "LDHA"        
[41] "LDHAL6A"      "LDHAL6B"      "LDHB"         "LDHC"         "LOC102115022"
[46] "LOC102116859" "LOC102117828" "LOC102118498" "LOC102119787" "LOC102120757"
[51] "LOC102121717" "LOC102122039" "LOC102122172" "LOC102122664" "LOC102123018"
[56] "LOC102123383" "LOC102123413" "LOC102124145" "LOC102124195" "LOC102130971"
[61] "LOC102131932" "LOC102138258" "LOC102139491" "LOC102140005" "LOC102140927"
[66] "LOC102144419" "LOC102144820" "LOC123572777" "MINPP1"       "PCK1"        
[71] "PCK2"         "PDHA1"        "PDHA2"        "PDHB"         "PFKL"        
[76] "PFKM"         "PFKP"         "PGAM1"        "PGAM2"        "PGK1"        
[81] "PGK2"         "PGM1"         "PGM2"         "PKLR"         "PKM"         
[86] "TPI1"        

$`mcf00020_Citrate_cycle_(TCA_cycle)`
 [1] "ACLY"         "ACO1"         "ACO2"         "CS"           "DLAT"        
 [6] "DLD"          "DLST"         "FH"           "IDH1"         "IDH2"        
[11] "IDH3A"        "IDH3B"        "IDH3G"        "LOC102122039" "LOC102127185"
[16] "LOC102127751" "LOC102129952" "LOC102134309" "MDH1"         "MDH2"        
[21] "OGDH"         "OGDHL"        "PC"           "PCK1"         "PCK2"        
[26] "PDHA1"        "PDHA2"        "PDHB"         "SDHA"         "SDHB"        
[31] "SDHC"         "SDHD"         "SUCLA2"       "SUCLG1"       "SUCLG2"      

$mcf00030_Pentose_phosphate_pathway
 [1] "ALDOA"        "ALDOB"        "ALDOC"        "DERA"         "FBP1"        
 [6] "FBP2"         "G6PD"         "GLYCTK"       "GPI"          "H6PD"        
[11] "IDNK"         "LOC102138258" "LOC102140005" "LOC102140927" "PFKL"        
[16] "PFKM"         "PFKP"         "PGD"          "PGLS"         "PGM1"        
[21] "PGM2"         "PRPS1"        "PRPS1L1"      "PRPS2"        "RBKS"        
[26] "RGN"          "RPE"          "RPIA"         "SHPK"         "TALDO1"      
[31] "TKT"          "TKTL1"        "TKTL2"