Closed turbomam closed 1 year ago
# Lists every (subject, type, predicate, object) statement in the data graph
# whose predicate is declared, in the schema graph, to have a range that is
# linkml:Uriorcurie or a (transitive) subclass of it.
# NOTE(review): the rdf: prefix is declared but never referenced below
# (the `a` keyword is used instead of rdf:type).
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select
?s ?t ?p ?o
where {
# Schema graph: find predicates ?p whose rdfs:range ?r is Uriorcurie itself
# or reaches it through zero or more rdfs:subClassOf steps.
graph <https://w3id.org/nmdc/nmdc>
{
?p rdfs:range ?r.
?r rdfs:subClassOf* <https://w3id.org/linkml/Uriorcurie> .
}
# Data graph (named with a mongodb:// IRI): for each typed subject ?s, return
# the values ?o it holds for those predicates.
graph <mongodb://mongo-loadbalancer.nmdc.production.svc.spin.nersc.gov:27017> {
?s a ?t ;
?p ?o
}
}
This has uncovered several problems in the data. I think most of them predate me, but I could have made the situation worse by just silently making my own workarounds.
_This table shows prefixes and expansions for all nmdc-schema-defined MongoDB collections, except for `functional_annotation_agg` and `metaproteomics_analysis_activity_set`._
URI base | schema-asserted prefix | mapping count |
---|---|---|
http://identifiers.org/cas/ | CAS | 103212 |
https://example.org/kegg/ | | 83327 |
http://identifiers.org/gold/ | GOLD | 3060 |
https://example.org/img.taxon/ | img.taxon | 1890 |
https://example.org/biosample/ | BIOSAMPLE | 1091 |
urn:uuid: | | 134 |
https://example.org/gold/ | GOLD2 | 93 |
https://example.org/doi/ | | 21 |
https://bioregistry.io/gnps.task: | gnps.task | 4 |
https://bioregistry.io/jgi.proposal: | jgi.proposal | 1 |
http://example.com/ | | 1 |
https://bioregistry.io/reference/doi: | doi | 0 |
https://doi.org/ | doi3 | 0 |
http://identifiers.org/kegg.compound/ | KEGG.COMPOUND | 0 |
http://identifiers.org/kegg.orthology/ | KEGG.ORTHOLOGY | 0 |
http://identifiers.org/kegg.reaction/ | KEGG.REACTION | 0 |
http://identifiers.org/kegg.pathway/ | KEGG_PATHWAY | 0 |
http://identifiers.org/gold/ | GOLD | 0 |
<https://example.org/kegg/>
base is associated with (lower-case, overly-general) kegg
in assets/misc/extra_prefix_expansions.yaml
nmdc:MetaboliteQuantification
nmdc:cdee3cc8648d43fbbcfd488247ba1887
has alternative_identifiers
kegg:C00583
.nmdc_schema/anyuri_strings_to_iris.py
Biosample
nmdc:bsm-11-yj9yav68
emsl_biosample_identifiers
UUID:ANZA-CB-B-6bc68e7d-7383-4bbe-9789-2f897ab426ca
. This is a special case because the correct representation of UUIDs doesn't use a prefix.
<https://example.org/doi/>
in the RDF version of the data yet. There are two other expansions that are both better.
<https://example.org/gold/>
is associated with the prefix GOLD2
nmdc:Biosample
igsn:IEWFS0001
nmdc:alternative_identifiers
(lower-case) gold:Gb0191643
KEGG_PATHWAY
doesn't use .
as a separator, as all of the other KEGG prefixes do.
make gen-project
should include all prefixes asserted in any of the modules in the schema directory, but it doesn't seem to be picking up the prefixes from imported modules like GOLD
in external_identifiers.yaml
gen-prefix-map --output nmdc_merged_prefixes.json --mergeimports nmdc_schema/nmdc_schema_merged.yaml
gen-project
writes JSON content to a file with the .yaml extension
emit_prefixes
? "type" : "nmdc:MetabolomicsAnalysisActivity",
"has_input" : [
"emsl:output_499639"
],
"has_output" : [
"nmdc:1f52814294dda994bf2f864d945b1be8"
],
"id" : "nmdc:cdee3cc8648d43fbbcfd488247ba1887",
"ended_at_time" : "2021-01-08T10:14:51Z",
"execution_resource" : "EMSL-RZR",
"git_url" : "https://github.com/microbiomedata/metaMS",
"has_calibration" : "emsl:output_499616",
"started_at_time" : "2021-01-08T10:14:51Z",
"used" : "Agilent_GC_MS",
"was_informed_by" : "emsl:499639",
"has_metabolite_quantifications" : [
{
"highest_similarity_score" : 0.9534156546099186,
"metabolite_quantified" : "chebi:16997",
"alternative_identifiers" : [
"kegg:C00583",
"cas:57-55-6"
]
},
{
"geo_loc_name": {
"has_raw_value": "USA: California, Anza Borrego Desert State Park"
},
"growth_facil": {
"has_raw_value": "field",
"term": {
"name": "field",
"id": "ENVO:01000352"
}
},
"env_broad_scale": {
"has_raw_value": "__temperate shrubland biome [ENVO:01000215]",
"term": {
"id": "ENVO:01000215",
"name": "temperate shrubland biome "
}
},
"env_medium": {
"has_raw_value": "bare soil [ENVO:01001616]",
"term": {
"id": "ENVO:01001616",
"name": "bare soil "
}
},
"micro_biomass_meth": "Chloroform fumigation direct extraction",
"store_cond": {
"has_raw_value": "frozen"
},
"ecosystem_type": "Soil",
"name": "1000 soils - ANZA_CoreB_BTM",
"depth": {
"has_minimum_numeric_value": 0.2,
"has_maximum_numeric_value": 0.3
},
"env_package": {
"has_raw_value": "soil"
},
"ecosystem": "Environmental",
"lat_lon": {
"has_raw_value": "33.305325, -116.254011",
"latitude": 33.305325,
"longitude": -116.254011
},
"samp_name": "ANZA_CoreB_BTM",
"env_local_scale": {
"has_raw_value": "__dry valley [ENVO:00000128]",
"term": {
"id": "ENVO:00000128",
"name": "dry valley "
}
},
"collection_date": {
"has_raw_value": "2022-06-06T00:00:00"
},
"id": "nmdc:bsm-11-yj9yav68",
"ecosystem_subtype": "Unclassified",
"water_content": [
"0.99970423 FW/DW"
],
"elev": 191,
"samp_collec_method": "Kit 6",
"samp_store_temp": {
"has_raw_value": "-80 Celcius",
"has_unit": "Celcius",
"has_numeric_value": -80
},
"cur_vegetation": {
"has_raw_value": "Desert scrub"
},
"carb_nitro_ratio": {
"has_raw_value": "3.213"
},
"part_of": [
"nmdc:sty-11-28tm5d36"
],
"sieving": {
"has_raw_value": "4mm sieved"
},
"ecosystem_category": "Terrestrial",
"analysis_type": [
"metagenomics",
"natural organic matter"
],
"collection_time": "08:45 - 12:05",
"emsl_biosample_identifiers": [
"UUID:ANZA-CB-B-6bc68e7d-7383-4bbe-9789-2f897ab426ca"
],
"specific_ecosystem": "Unclassified",
"ph": 9.2,
"tot_phosp": {
"has_raw_value": "0.1 ppm",
"has_unit": "ppm",
"has_numeric_value": 0.1
}
}
{
"id": "nmdc:sty-11-8xdqsn54",
"description": "This User proposal ... from soils to aquatic systems.",
"type": "nmdc:Study",
"has_credit_associations": [
{
"applies_to_person": {
"name": "Michael SanClements",
"orcid": "orcid:0000-0002-1962-3561"
},
"applied_roles": [
"Principal Investigator"
]
},
{
"applies_to_person": {
"name": "Margaret Bowman",
"orcid": "orcid:0000-0001-8825-1098"
},
"applied_roles": [
"Investigation"
]
}
],
"name": "...",
"title": "...",
"funding_sources": [
"..."
],
"award_dois": [
"doi:10.46936/lser.proj.2019.50718/60006575"
],
"websites": [
"https://www.emsl.pnnl.gov/project/50718",
"https://iscn.fluxdata.org/a-case-study-in-collaborative-research-soil-organic-matter-mechanisms-of-stabilization-som-mos/",
"https://doi.org/10.6073/pasta/4d5f03a4619e834c031ab4a6a121de12"
],
"principal_investigator": {
"name": "Michael SanClements",
"orcid": "orcid:0000-0002-1962-3561",
"profile_image_url": "https://portal.nersc.gov/project/m3408/profile_images/SanClements_Michael.jpg",
"email": "sanclements@battelle.org"
}
{
"ecosystem": "Environmental",
"insdc_biosample_identifiers": [
"biosample:SAMN10864388"
],
"description": "Bulk soil microbial communities from the East River watershed near Crested Butte, Colorado, United States",
"type": "nmdc:Biosample",
"ecosystem_category": "Terrestrial",
"alternative_identifiers": [
"gold:Gb0191643",
"gold:Gb0205601",
"img.taxon:3300042813"
],
"samp_name": "ER_115",
"add_date": "2018-06-22",
"location": "The East River watershed near Crested Butte, Colorado, USA",
"name": "Bulk soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_115",
"part_of": [
"gold:Gs0135149"
],
"depth": {
"has_raw_value": "0.0",
"has_numeric_value": 0,
"has_unit": "meter"
},
"sample_collection_site": "soil",
"ecosystem_type": "Soil",
"env_medium": {
"has_raw_value": "ENVO:00005802",
"term": {
"id": "ENVO:00005802"
}
},
"ecosystem_subtype": "Meadow",
"habitat": "bulk soil",
"lat_lon": {
"has_raw_value": "38.917216053 -106.9559947",
"latitude": 38.917216053,
"longitude": -106.9559947
},
"env_broad_scale": {
"has_raw_value": "ENVO:00000108",
"term": {
"id": "ENVO:00000108"
}
},
"id": "igsn:IEWFS0001",
"env_local_scale": {
"has_raw_value": "ENVO:00000292",
"term": {
"id": "ENVO:00000292"
}
},
"gold_biosample_identifiers": [
"GOLD:Gb0191643"
],
"mod_date": "2021-06-15",
"specific_ecosystem": "Bulk soil",
"ncbi_taxonomy_name": "soil metagenome",
"geo_loc_name": {
"has_raw_value": "USA: Colorado"
},
"community": "microbial communities",
"collection_date": {
"has_raw_value": "2017-03-07"
}
}
see also
1111