Closed sdgamboa closed 1 year ago
The 'yes'/'no' comes from how the data is presented in the API, so I think it means 'yes' it is produced or 'no' it isn't produced. Maybe the 'no' values aren't so helpful. But I can alter the script that gets the BacDive data so that it produces data that makes more sense or uses ontology terms where possible. This is what I get for this strain via the API:
{
"count": 1,
"next": null,
"previous": null,
"results": {
"15070": {
"General": {
"@ref": 9412,
"BacDive-ID": 15070,
"DSM-Number": 40258,
"keywords": [
"antibiotic compound production",
"mesophilic",
"Bacteria",
"16S sequence",
"genome sequence"
],
"description": "Streptomyces catenulae 6563 is a mesophilic bacterium that produces antibiotic compounds.",
"NCBI tax id": {
"NCBI tax id": 66875,
"Matching level": "species"
},
"strain history": [
"<- E.B. Shirling, ISP <- J. Routien, 6563",
"KCC S-0353 <-- IFO 12848 <-- SAJ <-- ISP 5258 <-- Chas. Pfizer & Co.; 6563."
],
"doi": "10.13145/bacdive15070.20220920.7"
},
"Name and taxonomic classification": {
"LPSN": {
"@ref": 20215,
"description": "domain/bacteria",
"keyword": "phylum/actinomycetota",
"domain": "Bacteria",
"phylum": "Actinomycetota",
"class": "Actinomycetes",
"order": "Streptomycetales",
"family": "Streptomycetaceae",
"genus": "Streptomyces",
"species": "Streptomyces catenulae",
"full scientific name": "Streptomyces catenulae Davisson and Finlay 1961 (Approved Lists 1980)"
},
"@ref": 9412,
"domain": "Bacteria",
"phylum": "Actinobacteria",
"class": "Actinobacteria",
"order": "Actinomycetales",
"family": "Streptomycetaceae",
"genus": "Streptomyces",
"species": "Streptomyces catenulae",
"full scientific name": "Streptomyces catenulae Davisson and Finlay 1961 emend. Nouioui et al. 2018",
"strain designation": "6563",
"type strain": "yes"
},
"Morphology": {
"multimedia": {
"@ref": 9412,
"multimedia content": "https://www.dsmz.de/microorganisms/photos/DSM_40258.jpg",
"caption": "Medium 65 28\u00b0C",
"intellectual property rights": "\u00a9 Leibniz-Institut DSMZ"
}
},
"Culture and growth conditions": {
"culture medium": {
"@ref": 9412,
"name": "GYM STREPTOMYCES MEDIUM (DSMZ Medium 65)",
"growth": "yes",
"link": "https://bacmedia.dsmz.de/medium/65",
"composition": "Name: GYM STREPTOMYCES MEDIUM (DSMZ Medium 65)\nComposition:\nAgar 12.0 g/l\nMalt extract 10.0 g/l\nYeast extract 4.0 g/l\nGlucose 4.0 g/l\nCaCO3 2.0 g/l\nDistilled water"
},
"culture temp": [
{
"@ref": 18571,
"growth": "positive",
"type": "optimum",
"temperature": "28",
"range": "mesophilic"
},
{
"@ref": 9412,
"growth": "positive",
"type": "growth",
"temperature": "28",
"range": "mesophilic"
},
{
"@ref": 67770,
"growth": "positive",
"type": "growth",
"temperature": "28",
"range": "mesophilic"
}
]
},
"Physiology and metabolism": {
"tolerance": {
"@ref": 18571,
"compound": "Lysozyme",
"percentage": "1"
},
"compound production": [
{
"@ref": 9412,
"compound": "pepsinostreptin"
},
{
"@ref": 9412,
"compound": "catenulin"
},
{
"@ref": 9412,
"compound": "paramomycin"
},
{
"@ref": 20216,
"compound": "Neomycin E"
},
{
"@ref": 20216,
"compound": "Neomycin F"
},
{
"@ref": 20216,
"compound": "pepsinostreptin complex"
}
],
"halophily": {
"@ref": 18571,
"salt": "NaCl",
"growth": "positive",
"tested relation": "maximum",
"concentration": "5 %"
},
"observation": {
"@ref": 67770,
"observation": "quinones: MK-9(H6), MK-9(H4)"
},
"metabolite utilization": [
{
"@ref": 18571,
"Chebi-ID": 17234,
"metabolite": "glucose",
"utilization activity": "+"
},
{
"@ref": 18571,
"Chebi-ID": 22599,
"metabolite": "arabinose",
"utilization activity": "-"
},
{
"@ref": 18571,
"Chebi-ID": 17992,
"metabolite": "sucrose",
"utilization activity": "-"
},
{
"@ref": 18571,
"Chebi-ID": 18222,
"metabolite": "xylose",
"utilization activity": "-"
},
{
"@ref": 18571,
"Chebi-ID": 17268,
"metabolite": "myo-inositol",
"utilization activity": "-"
},
{
"@ref": 18571,
"Chebi-ID": 29864,
"metabolite": "mannitol",
"utilization activity": "-"
},
{
"@ref": 18571,
"Chebi-ID": 28757,
"metabolite": "fructose",
"utilization activity": "+/-"
},
{
"@ref": 18571,
"Chebi-ID": 26546,
"metabolite": "rhamnose",
"utilization activity": "-"
},
{
"@ref": 18571,
"Chebi-ID": 16634,
"metabolite": "raffinose",
"utilization activity": "-"
},
{
"@ref": 18571,
"Chebi-ID": 62968,
"metabolite": "cellulose",
"utilization activity": "-"
},
{
"@ref": 68368,
"Chebi-ID": 5291,
"metabolite": "gelatin",
"utilization activity": "+",
"kind of utilization tested": "hydrolysis"
},
{
"@ref": 68368,
"Chebi-ID": 27897,
"metabolite": "tryptophan",
"utilization activity": "-",
"kind of utilization tested": "energy source"
},
{
"@ref": 68368,
"Chebi-ID": 16199,
"metabolite": "urea",
"utilization activity": "+",
"kind of utilization tested": "hydrolysis"
},
{
"@ref": 68368,
"Chebi-ID": 16947,
"metabolite": "citrate",
"utilization activity": "-",
"kind of utilization tested": "assimilation"
},
{
"@ref": 68368,
"Chebi-ID": 18257,
"metabolite": "ornithine",
"utilization activity": "-",
"kind of utilization tested": "degradation"
},
{
"@ref": 68368,
"Chebi-ID": 25094,
"metabolite": "lysine",
"utilization activity": "-",
"kind of utilization tested": "degradation"
},
{
"@ref": 68368,
"Chebi-ID": 29016,
"metabolite": "arginine",
"utilization activity": "-",
"kind of utilization tested": "hydrolysis"
}
],
"metabolite production": [
{
"@ref": 68368,
"Chebi-ID": 15688,
"metabolite": "acetoin",
"production": "yes"
},
{
"@ref": 68368,
"Chebi-ID": 35581,
"metabolite": "indole",
"production": "no"
},
{
"@ref": 68368,
"Chebi-ID": 16136,
"metabolite": "hydrogen sulfide",
"production": "no"
},
{
"@ref": 67770,
"Chebi-ID": 7934,
"metabolite": "paromomycin",
"production": "yes"
}
],
"metabolite tests": [
{
"@ref": 68368,
"Chebi-ID": 15688,
"metabolite": "acetoin",
"voges-proskauer-test": "+"
},
{
"@ref": 68368,
"Chebi-ID": 35581,
"metabolite": "indole",
"indole test": "-"
}
],
"enzymes": [
{
"@ref": 68368,
"value": "gelatinase",
"activity": "+"
},
{
"@ref": 68368,
"value": "tryptophan deaminase",
"activity": "+",
"ec": "4.1.99.1"
},
{
"@ref": 68368,
"value": "urease",
"activity": "+",
"ec": "3.5.1.5"
},
{
"@ref": 68368,
"value": "ornithine decarboxylase",
"activity": "-",
"ec": "4.1.1.17"
},
{
"@ref": 68368,
"value": "lysine decarboxylase",
"activity": "-",
"ec": "4.1.1.18"
},
{
"@ref": 68368,
"value": "arginine dihydrolase",
"activity": "-",
"ec": "3.5.3.6"
},
{
"@ref": 68368,
"value": "beta-galactosidase",
"activity": "-",
"ec": "3.2.1.23"
}
],
"API 20E": {
"@ref": 18571,
"ONPG": "-",
"ADH Arg": "-",
"LDC Lys": "-",
"ODC": "-",
"CIT": "-",
"H2S": "-",
"URE": "+",
"TDA Trp": "+",
"IND": "-",
"VP": "+",
"GEL": "+"
}
},
"Isolation, sampling and environmental information": {},
"Safety information": {
"risk assessment": [
{
"@ref": 18571,
"biosafety level": "1",
"biosafety level comment": "German classification"
},
{
"@ref": 9412,
"biosafety level": "1",
"biosafety level comment": "Risk group (German classification)"
}
]
},
"Sequence information": {
"16S sequences": [
{
"@ref": 20218,
"description": "Streptomyces catenulae 16S rRNA gene, type strain DSM 40258T",
"accession": "AJ621613",
"length": 1498,
"database": "ena",
"NCBI tax ID": 66875
},
{
"@ref": 20218,
"description": "Streptomyces catenulae gene for 16S rRNA, partial sequence",
"accession": "AB122748",
"length": 565,
"database": "ena",
"NCBI tax ID": 66875
},
{
"@ref": 20218,
"description": "Streptomyces catenulae strain ISP 5258 16S ribosomal RNA gene, partial sequence",
"accession": "AY999778",
"length": 1497,
"database": "ena",
"NCBI tax ID": 66875
},
{
"@ref": 20218,
"description": "Streptomyces catenulae gene for 16S ribosomal RNA, partial sequence, strain: JCM 4353",
"accession": "D44071",
"length": 121,
"database": "ena",
"NCBI tax ID": 66875
},
{
"@ref": 20218,
"description": "Streptomyces catenulae gene for 16S rRNA, partial sequence, strain: NBRC 12848",
"accession": "AB184191",
"length": 1480,
"database": "ena",
"NCBI tax ID": 66875
}
],
"Genome sequences": {
"@ref": 67770,
"description": "Streptomyces catenulae strain NRRL B-2342, whole genome shotgun sequencing project",
"accession": "JODY00000000",
"database": "ncbi",
"NCBI tax ID": 66875
},
"GC content": {
"@ref": 67770,
"GC-content": "73",
"method": "genome sequence analysis"
}
},
"External links": {
"@ref": 9412,
"culture collection no.": "DSM 40258, ATCC 12476, ATCC 23893, CBS 679.68, IFO 12848, ISP 5258, NBRC 12848, RIA 1183, JCM 4353, BCRC 12092, CGMCC 4.1701, HAMBI 986, IMET 42944, KCTC 9223, NRRL B-2342, VKM Ac-758",
"straininfo link": [
{
"@ref": 20218,
"passport": "http://www.straininfo.net/strains/165138"
},
{
"@ref": 20218,
"passport": "http://www.straininfo.net/strains/122455"
},
{
"@ref": 20218,
"passport": "http://www.straininfo.net/strains/122456"
},
{
"@ref": 20218,
"passport": "http://www.straininfo.net/strains/230064"
},
{
"@ref": 20218,
"passport": "http://www.straininfo.net/strains/122458"
},
{
"@ref": 20218,
"passport": "http://www.straininfo.net/strains/122459"
},
{
"@ref": 20218,
"passport": "http://www.straininfo.net/strains/334379"
},
{
"@ref": 20218,
"passport": "http://www.straininfo.net/strains/122462"
}
],
"literature": {
"topic": "Phylogeny",
"Pubmed-ID": "35651486",
"title": "Streptomyces benahoarensis sp. nov. Isolated From a Lava Tube of La Palma, Canary Islands, Spain.",
"authors": "Gonzalez-Pimentel JL, Hermosin B, Saiz-Jimenez C, Jurado V",
"journal": "Front Microbiol",
"DOI": "10.3389/fmicb.2022.907816",
"year": 2022
}
},
"Reference": [
{
"@id": 9412,
"authors": "Curators of the DSMZ",
"catalogue": "Leibniz Institut DSMZ-Deutsche Sammlung von Mikroorganismen und Zellkulturen GmbH (DSM 40258)",
"doi/url": "https://www.dsmz.de/collection/catalogue/details/culture/DSM-40258"
},
{
"@id": 18571,
"authors": "Wink, J.",
"title": "Compendium of Actinobacteria. HZI-Helmholtz-Centre for Infection Research, Braunschweig",
"doi/url": "http://www.dsmz.de/microorganisms/wink_pdf/DSM40258.pdf"
},
{
"@id": 20215,
"authors": "Parte, A.C., Sard\u00e0 Carbasse, J., Meier-Kolthoff, J.P., Reimer, L.C. and G\u00f6ker, M.",
"title": "List of Prokaryotic names with Standing in Nomenclature (LPSN) moves to the DSMZ",
"doi/url": "10.1099/ijsem.0.004332"
},
{
"@id": 20216,
"authors": "Curators of the HKI",
"title": "Collection Description Leibniz-Institut f\u00fcr Naturstoff-Forschung und Infektionsbiologie e. V. Hans-Kn\u00f6ll-Institut (HKI)",
"doi/url": "http://www.leibniz-hki.de/de/"
},
{
"@id": 20218,
"authors": "Verslyppe, B., De Smet, W., De Baets, B., De Vos, P., Dawyndt P.",
"title": "StrainInfo introduces electronic passports for microorganisms.",
"journal": "Syst Appl Microbiol. 37: 42-50 2014",
"doi/url": "10.1016/j.syapm.2013.11.002",
"pubmed": 24321274
},
{
"@id": 67770,
"authors": "Curators of the JCM",
"doi/url": "https://jcm.brc.riken.jp/en/"
},
{
"@id": 68368,
"title": "Automatically annotated from API 20E"
}
]
}
}
}
@jwokaty, Thanks! I'll convert them to "x_metabolite - TRUE" or "x_metabolite - FALSE".
Do you want me to make the change in the BacDive script? Then you would only need to parse the result.
@jwokaty, what do you feel would be more convenient? I'm working on a few functions (this branch: https://github.com/waldronlab/bugphyzz/blob/sdgamboa/import-bacdive/R/bacdive.R) for formatting the Excel document on Google Drive. It will include changing values, merging, and matching to previously known attributes, etc. As long as the document format doesn't change, it should work.
I'm concerned with maintenance. Every release, we'll get a new set of data from bacdive (I run my script) then we run your script to get that into bugphyzz. We'll soon do the same for the patric data. Bacdive updates and releases twice a year so the data could change. We should make the update process as easy as possible and minimize changes where we can. (It might also be nice to automate it in some way!)
I am not opposed to a separate script (it's probably necessary), but if there is something I can do to minimize those changes in my bacdive package to produce data that's easier for you to put into bugphyzz, it might benefit us in the long run. I don't think you should have to work around things that I have the flexibility to change; I just took the data straight out of bacdive without knowing how to fit it into bugphyzz. I honestly just need guidance about how to improve the data for bugphyzz; you can tell me what you need.
For reference, here's my package to get bacdive data: https://github.com/jwokaty/BacDiveR.
@jwokaty, I agree. I'll prepare a document with the changes that I think are needed so far and could be solved in BacDiveR. I'll post the link to the document in this thread. (reopening the issue).
I created a document for following up on this issue, @jwokaty, @kbeckenrode.
related issue: #150
Some code that might be useful:
# Pull the "metabolite production" records for a handful of BacDive strains
# and reshape them into a long data frame with one row per strain/metabolite
# and a logical Attribute_value (TRUE when BacDive reports "yes").

# Credentials come from the environment. Sys.getenv() returns "" for unset
# variables, so fail early with a clear message instead of inside the API call.
usrname <- Sys.getenv("BACDIVE_USERNAME")
pssword <- Sys.getenv("BACDIVE_PASSWORD")
if (!nzchar(usrname) || !nzchar(pssword)) {
  stop(
    "Set BACDIVE_USERNAME and BACDIVE_PASSWORD before running this script.",
    call. = FALSE
  )
}

# Advanced-search CSV export; the first two lines are skipped before the
# header row (presumably a preamble -- confirm against the current export).
url <- "https://bacdive.dsmz.de/advsearch/csv"
bacdive <- utils::read.csv(url, skip = 2)[, c("ID", "species")]
colnames(bacdive) <- c("BacDive_ID", "Parent_name")

ao <- BacDive::open_bacdive(usrname, pssword)

# Strain 11 plus (up to) the first ten IDs from the CSV. head() avoids the NA
# ids that `[1:10]` would produce when fewer than ten rows are available.
bacdive_ids <- c("11", utils::head(bacdive$BacDive_ID, 10))
res <- BacDive::fetch(ao, bacdive_ids)$results

output <- res |>
  purrr::map(\(strain) {
    purrr::pluck(strain, "Physiology and metabolism", "metabolite production")
  }) |>
  # Drop strains without the section (NULL) or with an empty one.
  purrr::discard(\(entry) length(entry) == 0) |>
  purrr::map(\(entry) {
    # The API returns either a single record (a flat named list) or a list of
    # records; wrap the single-record case so both share one code path.
    records <- if (is.list(entry[[1]])) entry else list(entry)
    # A record missing a field yields NA rather than an error.
    metabolite <- purrr::map_chr(
      records,
      \(rec) purrr::pluck(rec, "metabolite", .default = NA_character_)
    )
    production <- purrr::map_chr(
      records,
      \(rec) purrr::pluck(rec, "production", .default = NA_character_)
    )
    data.frame(
      Attribute = metabolite,
      # "yes" -> TRUE, any other value -> FALSE, missing -> NA.
      Attribute_value = production == "yes"
    )
  }) |>
  # Names of `res` become the BacDive_ID column.
  dplyr::bind_rows(.id = "BacDive_ID")
output
#> BacDive_ID Attribute Attribute_value
#> 1 145173 acetoin FALSE
#> 2 145172 acetoin FALSE
#> 3 145171 acetoin FALSE
#> 4 144952 acetoin FALSE
#> 5 219 indole FALSE
#> 6 219 acetoin FALSE
#> 7 11 2-dehydro-D-gluconate FALSE
#> 8 11 cellulose FALSE
#> 9 11 dihydroxyacetone TRUE
Created on 2022-12-19 with reprex v2.0.2
@sdgamboa Where can I find the document?
I created a document for following up on this issue, @jwokaty, @kbeckenrode.
I think this is solved now (https://github.com/waldronlab/bugphyzz/blob/main/R/bacdive.R)
In the metabolite production column, there are values of 'yes' and 'no' alongside the metabolites. I thought this meant whether or not the metabolite was produced by the bacteria, but I wasn't able to get confirmation from the BacDive website. On the website, I don't see 'yes' or 'no' with the metabolites (metabolite production section). Could @kbeckenrode or @jwokaty confirm the meaning?
An example: https://bacdive.dsmz.de/strain/15070
Created on 2022-12-15 with reprex v2.0.2