microbiomedata / nmdc-schema

National Microbiome Data Collaborative (NMDC) unified data model
https://microbiomedata.github.io/nmdc-schema/
Creative Commons Zero v1.0 Universal
27 stars 8 forks source link

Assess what namespaces/ontologies/prefixes are being used in the data #1112

Closed turbomam closed 1 year ago

turbomam commented 1 year ago

see also

turbomam commented 1 year ago
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select 
?s ?t ?p ?o
where {
    graph <https://w3id.org/nmdc/nmdc>
    {
        ?p rdfs:range ?r.
        ?r rdfs:subClassOf* <https://w3id.org/linkml/Uriorcurie> .
    }
    graph <mongodb://mongo-loadbalancer.nmdc.production.svc.spin.nersc.gov:27017> {
        ?s a ?t ;
           ?p ?o
    }
}
turbomam commented 1 year ago

This has uncovered several problems in the data. I think most of them predate me, but I could have made the situation worse by just silently making my own workarounds.

_This table shows prefixes and expansions for all nmdc-schema-defined MongoDB collections, except for_ `functional_annotation_agg` _and_ `metaproteomics_analysis_activity_set`.

| URI base | schema-asserted prefix mapping | count |
| --- | --- | --- |
| http://identifiers.org/cas/ | CAS | 103212 |
| https://example.org/kegg/ | | 83327 |
| http://identifiers.org/gold/ | GOLD | 3060 |
| https://example.org/img.taxon/ | img.taxon | 1890 |
| https://example.org/biosample/ | BIOSAMPLE | 1091 |
| urn:uuid: | | 134 |
| https://example.org/gold/ | GOLD2 | 93 |
| https://example.org/doi/ | | 21 |
| https://bioregistry.io/gnps.task: | gnps.task | 4 |
| https://bioregistry.io/jgi.proposal: | jgi.proposal | 1 |
| http://example.com/ | | 1 |
| https://bioregistry.io/reference/doi: | doi | 0 |
| https://doi.org/ | doi3 | 0 |
| http://identifiers.org/kegg.compound/ | KEGG.COMPOUND | 0 |
| http://identifiers.org/kegg.orthology/ | KEGG.ORTHOLOGY | 0 |
| http://identifiers.org/kegg.reaction/ | KEGG.REACTION | 0 |
| http://identifiers.org/kegg.pathway/ | KEGG_PATHWAY | 0 |
| http://identifiers.org/gold/ | GOLD | 0 |

Notes

turbomam commented 1 year ago
    "type" : "nmdc:MetabolomicsAnalysisActivity",
    "has_input" : [
        "emsl:output_499639"
    ],
    "has_output" : [
        "nmdc:1f52814294dda994bf2f864d945b1be8"
    ],
    "id" : "nmdc:cdee3cc8648d43fbbcfd488247ba1887",
    "ended_at_time" : "2021-01-08T10:14:51Z",
    "execution_resource" : "EMSL-RZR",
    "git_url" : "https://github.com/microbiomedata/metaMS",
    "has_calibration" : "emsl:output_499616",
    "started_at_time" : "2021-01-08T10:14:51Z",
    "used" : "Agilent_GC_MS",
    "was_informed_by" : "emsl:499639",
    "has_metabolite_quantifications" : [
        {
            "highest_similarity_score" : 0.9534156546099186,
            "metabolite_quantified" : "chebi:16997",
            "alternative_identifiers" : [
                "kegg:C00583",
                "cas:57-55-6"
            ]
        },
{
  "geo_loc_name": {
    "has_raw_value": "USA: California, Anza Borrego Desert State Park"
  },
  "growth_facil": {
    "has_raw_value": "field",
    "term": {
      "name": "field",
      "id": "ENVO:01000352"
    }
  },
  "env_broad_scale": {
    "has_raw_value": "__temperate shrubland biome [ENVO:01000215]",
    "term": {
      "id": "ENVO:01000215",
      "name": "temperate shrubland biome "
    }
  },
  "env_medium": {
    "has_raw_value": "bare soil [ENVO:01001616]",
    "term": {
      "id": "ENVO:01001616",
      "name": "bare soil "
    }
  },
  "micro_biomass_meth": "Chloroform fumigation direct extraction",
  "store_cond": {
    "has_raw_value": "frozen"
  },
  "ecosystem_type": "Soil",
  "name": "1000 soils - ANZA_CoreB_BTM",
  "depth": {
    "has_minimum_numeric_value": 0.2,
    "has_maximum_numeric_value": 0.3
  },
  "env_package": {
    "has_raw_value": "soil"
  },
  "ecosystem": "Environmental",
  "lat_lon": {
    "has_raw_value": "33.305325, -116.254011",
    "latitude": 33.305325,
    "longitude": -116.254011
  },
  "samp_name": "ANZA_CoreB_BTM",
  "env_local_scale": {
    "has_raw_value": "__dry valley [ENVO:00000128]",
    "term": {
      "id": "ENVO:00000128",
      "name": "dry valley "
    }
  },
  "collection_date": {
    "has_raw_value": "2022-06-06T00:00:00"
  },
  "id": "nmdc:bsm-11-yj9yav68",
  "ecosystem_subtype": "Unclassified",
  "water_content": [
    "0.99970423 FW/DW"
  ],
  "elev": 191,
  "samp_collec_method": "Kit 6",
  "samp_store_temp": {
    "has_raw_value": "-80 Celcius",
    "has_unit": "Celcius",
    "has_numeric_value": -80
  },
  "cur_vegetation": {
    "has_raw_value": "Desert scrub"
  },
  "carb_nitro_ratio": {
    "has_raw_value": "3.213"
  },
  "part_of": [
    "nmdc:sty-11-28tm5d36"
  ],
  "sieving": {
    "has_raw_value": "4mm sieved"
  },
  "ecosystem_category": "Terrestrial",
  "analysis_type": [
    "metagenomics",
    "natural organic matter"
  ],
  "collection_time": "08:45  - 12:05",
  "emsl_biosample_identifiers": [
    "UUID:ANZA-CB-B-6bc68e7d-7383-4bbe-9789-2f897ab426ca"
  ],
  "specific_ecosystem": "Unclassified",
  "ph": 9.2,
  "tot_phosp": {
    "has_raw_value": "0.1 ppm",
    "has_unit": "ppm",
    "has_numeric_value": 0.1
  }
}
{
  "id": "nmdc:sty-11-8xdqsn54",
  "description": "This User proposal ... from soils to aquatic systems.",
  "type": "nmdc:Study",
  "has_credit_associations": [
    {
      "applies_to_person": {
        "name": "Michael SanClements",
        "orcid": "orcid:0000-0002-1962-3561"
      },
      "applied_roles": [
        "Principal Investigator"
      ]
    },
    {
      "applies_to_person": {
        "name": "Margaret Bowman",
        "orcid": "orcid:0000-0001-8825-1098"
      },
      "applied_roles": [
        "Investigation"
      ]
    }
  ],
  "name": "...",
  "title": "...",
  "funding_sources": [
    "..."
  ],
  "award_dois": [
    "doi:10.46936/lser.proj.2019.50718/60006575"
  ],
  "websites": [
    "https://www.emsl.pnnl.gov/project/50718",
    "https://iscn.fluxdata.org/a-case-study-in-collaborative-research-soil-organic-matter-mechanisms-of-stabilization-som-mos/",
    "https://doi.org/10.6073/pasta/4d5f03a4619e834c031ab4a6a121de12"
  ],
  "principal_investigator": {
    "name": "Michael SanClements",
    "orcid": "orcid:0000-0002-1962-3561",
    "profile_image_url": "https://portal.nersc.gov/project/m3408/profile_images/SanClements_Michael.jpg",
    "email": "sanclements@battelle.org"
  }
{
  "ecosystem": "Environmental",
  "insdc_biosample_identifiers": [
    "biosample:SAMN10864388"
  ],
  "description": "Bulk soil microbial communities from the East River watershed near Crested Butte, Colorado, United States",
  "type": "nmdc:Biosample",
  "ecosystem_category": "Terrestrial",
  "alternative_identifiers": [
    "gold:Gb0191643",
    "gold:Gb0205601",
    "img.taxon:3300042813"
  ],
  "samp_name": "ER_115",
  "add_date": "2018-06-22",
  "location": "The East River watershed near Crested Butte, Colorado, USA",
  "name": "Bulk soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_115",
  "part_of": [
    "gold:Gs0135149"
  ],
  "depth": {
    "has_raw_value": "0.0",
    "has_numeric_value": 0,
    "has_unit": "meter"
  },
  "sample_collection_site": "soil",
  "ecosystem_type": "Soil",
  "env_medium": {
    "has_raw_value": "ENVO:00005802",
    "term": {
      "id": "ENVO:00005802"
    }
  },
  "ecosystem_subtype": "Meadow",
  "habitat": "bulk soil",
  "lat_lon": {
    "has_raw_value": "38.917216053 -106.9559947",
    "latitude": 38.917216053,
    "longitude": -106.9559947
  },
  "env_broad_scale": {
    "has_raw_value": "ENVO:00000108",
    "term": {
      "id": "ENVO:00000108"
    }
  },
  "id": "igsn:IEWFS0001",
  "env_local_scale": {
    "has_raw_value": "ENVO:00000292",
    "term": {
      "id": "ENVO:00000292"
    }
  },
  "gold_biosample_identifiers": [
    "GOLD:Gb0191643"
  ],
  "mod_date": "2021-06-15",
  "specific_ecosystem": "Bulk soil",
  "ncbi_taxonomy_name": "soil metagenome",
  "geo_loc_name": {
    "has_raw_value": "USA: Colorado"
  },
  "community": "microbial communities",
  "collection_date": {
    "has_raw_value": "2017-03-07"
  }
}