gbif / model-tests

Exploration of sample models
2 stars 0 forks source link

Full query for sample specimen #18

Open MortenHofft opened 2 years ago

MortenHofft commented 2 years ago

In an attempt to get all data from https://arctos.database.museum/guid/DMNS:Mamm:11098 I ended up with the query below, which could get to most of it.

See screenshot for the parts I was not able to find.

DMNS-Mamm-11098-excluded

query {
  # the entity can have many IDs, so we need to ask for the entity through an entity identifiers table
  specimensIDs: allEntityIdentifiers(condition: {
    entityIdentifier: "https://arctos.database.museum/guid/DMNS:Mamm:11098"
  }) {
    # there should only be one entity with this ID
    nodes {
      entityId
      entityIdentifier
      entityIdentifierType
      specimen: entityByEntityId {
        entityId
        entityType

        # Identifications, reached via: entity -> material entity -> identificationMaterials -> identification
        materialEntityByMaterialEntityId {
          materialEntityType
          identificationMaterialsByMaterialEntityId {
            totalCount
            nodes {
              identificationByIdentificationId {
                # taxa named by this identification
                taxonIdentificationsByIdentificationId {
                  totalCount
                  nodes {
                    taxonByTaxonId {
                      scientificName
                      kingdom
                      phylum
                      class
                      order
                      family
                      subfamily
                      # what happened to genus?
                      genericName
                      specificEpithet
                      infraspecificEpithet
                      # NOTE(review): this field name is spelled "scientifc" (not "scientific") - confirm it matches the schema
                      scientifcNameAuthorship
                      parentTaxonId # this should be linked in the graph as well
                      # The UI has a longer classification, but I assume that is just because they use a different taxonomy?
                    }
                  }
                }

                verbatimIdentification
                # vernacular name: "Colorado chipmunk" is not in the data
                dateIdentified
                # who made the identification, and in which role
                identificationAgentRolesByIdentificationId {
                  totalCount
                  nodes {
                    agentId
                    identificationAgentRole
                    identificationAgentRoleBegan
                    identificationAgentRoleEnded
                    agentByAgentId {
                      preferredAgentName
                    }
                  }
                }

                identificationType
                identificationVerificationStatus
                identificationRemarks

                taxaFormula # value: A - I do not know what it means, but it is perhaps meaningful
                isAcceptedIdentification

                # Citations section. I'm surprised this is sitting on the Identification. I would have thought it was attached to the specimen
                identificationCitationsByIdentificationId {
                  nodes {
                    citationType
                    citationPageNumber
                    citationRemarks
                    # The citations reference the species name, but since they hang off the Identification it must come from there.
                    referenceByReferenceId {
                      referenceType
                      referenceDoi # I do not know enough about this, but are DOIs the only thing that is used to link?
                      bibliographicCitation # The UI shows something like "Bell et al. 2015" - but I assume that is just something that parses the full string?
                    }
                  }
                }
              }
            }
          }
        }

        # Agents linked to the specimen and the role each one holds (collector and preparator)
        entityAgentRolesByEntityId {
          totalCount
          nodes {
            agentId
            entityAgentRole
            entityAgentRoleBegan
            entityAgentRoleEnded
            entityAgentRoleOrder
            # follow the agent id to get a display name
            agentByAgentId {
              preferredAgentName
            }
          }
        }

        # facts and measurements about the specimen
        entityAssertionsByEntityId {
          totalCount
          nodes {
            entityAssertionType
            entityAssertionValue
            entityAssertionValueNumeric
            entityAssertionUnit
            entityAssertionDate
            entityAssertionProtocol
            entityAssertionRemarks
            # the agent recorded against the assertion
            agentByEntityAssertionByAgentId {
              preferredAgentName
            }
          }
        }

        # Identifiers section
        # Of the 5 columns shown in the UI, "relationship", "ID value" and "assignedBy" are missing here. But they might be inferred from the others?
        entityIdentifiersByEntityId {
          nodes {
            entityIdentifier
            entityIdentifierType
          }
        }
        # the identifiers above include the parasites
        # I can also get to those via the related-entities path, like below
        entityRelationshipsBySubjectEntityId {
          nodes {
            externalObjectEntityId
          }
        }

        # Get media linked to this Cataloged Item.
        # The condition type is currently just the arctos free text, but I assume it would have a vocab
        # Search for entities with a relationship to the current item, and filter on media of the cataloged item.
        # Aliased as mediaItems because the same field is selected again for the parts with a different condition.
        mediaItems: entityRelationshipsByObjectEntityId(condition: {
          entityRelationshipType: "shows cataloged_item"
        }) {
          list: nodes {
            entityBySubjectEntityId {
              digitalEntityByDigitalEntityId {
                accessUri
                format
                webStatement
                digitalEntityType
                # assertions attached to the digital entity (each field selected once)
                digitalEntityAssertionsByDigitalEntityId {
                  nodes {
                    digitalEntityAssertionType
                    digitalEntityAssertionValue
                    digitalEntityAssertionValueNumeric
                    digitalEntityAssertionUnit
                    digitalEntityAssertionDate
                    digitalEntityAssertionProtocol
                    digitalEntityAssertionRemarks
                  }
                }
              }
            }
          }
        }
        # media section END

        # get the event data
        entityEventsByEntityId {
          totalCount
          nodes {
            eventByEventId {
              eventType
              # I'm unable to find the agent that appears in the arctos site
              # verification status is left out it seems
              # collection source: wild is not to be found either
              eventDate
              verbatimEventDate

              # Location is selected once: place names, georeferences and elevation together.
              locationByLocationId {
                higherGeography
                locationAccordingTo # in the UI this is attached to the higher geography, which I think is also what the source is about
                locality
                georeferencesByLocationId { # I'm surprised to get back a list of georeferences for my location. How do I choose?
                  nodes {
                    decimalLatitude
                    decimalLongitude
                    preferredSpatialRepresentation # I guess this is primary_spatial_data: point-radius
                    geodeticDatum
                    coordinateUncertaintyInMeters
                    georeferenceSources # why is this plural?
                    georeferenceProtocol
                  }
                }
                minimumElevationInMeters
                maximumElevationInMeters
              }
              verbatimLocality
              # Associated Names - I do not see the data anywhere. Perhaps left out?

              verbatimLatitude # filled with collectionMethod
              verbatimLongitude # filled with habitat

              # collectionMethod not there # "verbatimLatitude": "Sherman trap" - I see it in latitude though
              habitat # not filled

              # list of images from event/place - those I cannot figure out how to get to. See https://github.com/timrobertson100/model-tests/issues/11

            }
          }
        }
        # event section END

        # Parts section
        # Entities whose relationship to this specimen is "part of".
        # This selection is unaliased; the media lookup on the same field uses the alias mediaItems.
        entityRelationshipsByObjectEntityId(condition: {
          entityRelationshipType: "part of"
        }) {
          nodes {
            entityBySubjectEntityId {
              entityType
              entityId
              digitalEntityByDigitalEntityId {
                digitalEntityId
              }
              materialEntityByMaterialEntityId {
                materialEntityType
                collectionByCollectionId {
                  institutionCode
                }
                # assertions recorded on the part itself
                materialEntityAssertionsByMaterialEntityId {
                  nodes {
                    materialEntityAssertionType
                    materialEntityAssertionValue
                    materialEntityAssertionValueNumeric
                    materialEntityAssertionUnit
                    materialEntityAssertionProtocol
                    materialEntityAssertionRemarks
                    materialEntityAssertionDate
                    agentByMaterialEntityAssertionByAgentId {
                      preferredAgentName
                    }
                  }
                }
              }
            }
          }
        }
        # Parts section END

      }
    }
  }
}
timrobertson100 commented 2 years ago

@tucotuco - can you please review the screenshot and confirm this is what you would expect?

tucotuco commented 2 years ago

Going through as top to bottom as possible...

Unexpected: We do have Darwin Core taxon classification fields in taxon.csv.

Expected: We do not have the external links that Arctos produces in the UI.

Expected: Event type assignment, date and Verification Status. Arctos gave it to us, and we could make an AgentRole for it, but I didn't think it had any bearing on science. In Arctos this is to track who to blame if something is wrong.

Expected: Darwin Core and our model do not separate HigherGeography from Location, and the source is for the HigherGeography.

Expected: We do not have the Expected Names from Arctos, nor do we model that.

Expected: We do not have images of anything except Entities in the model, and Locations are not modeled as Entities.

Expected: We do not have Accession information (in the sense of the act of taking legal title to material) from Arctos, nor is it in the model.

Expected: We do not have Usage information from Arctos, but this could be the basis for a VERY interesting use case, especially if combined with funding sources.

Unexpected: We do have all of the relationships shown in the full view, but we interpret "self" relationships as identifiers, not as relationships. The "full/split view" for that section is how both Arctos and the model treat those data.

Unexpected: We do have the ID Values for both the Identifiers and for the relationships in entityIdentifier.

Expected: We do not have the Assigned By information from Arctos, nor in the model.

Expected: Part IDs were not shared by Arctos. Part IDs from Arctos are advertised as not persistent.

Finally, the remaining details about parts (disposition, lot count, condition, and remarks) were not shared by Arctos originally, but I asked for that information and it is now available in material_entity_assertion.csv via https://github.com/timrobertson100/model-tests/commit/4811c1a0dca3d2eb2f21b7b478064a2e3a310433. I added the remarks to Assertions only temporarily, as I think there are some model changes that would make this better and I did not want to mess with the model for the demo without talking about it first.

MortenHofft commented 2 years ago

Thanks - it looks like it is all intended, except for a few that sound like misunderstandings on my part.

I'm just going to address those where I have a comment.


Unexpected: We do have Darwin Core taxon classification fields in taxon.csv.

The red line is just there, because it wasn't the exact same data, but a subset of the fields. But I understand that is on purpose. You answered this in https://github.com/timrobertson100/model-tests/issues/13#issuecomment-1156836671


Unexpected: We do have the ID Values for both the Identifiers and for the relationships in entityIdentifier.

What I mean is simply that in the entityIdentifier csv the link column and the ID-Value column are merged into one (as far as I can see).

line entityID entityIdentifier entityIdentifierType
20 21714980 JRD423 collector number
22 21714980 https://datadryad.org/stash/dataset/doi:10.5061/dryad.52mp1 Dryad DOI
23 21714980 http://www.ncbi.nlm.nih.gov/nuccore/KJ139497 GenBank

This is slightly different from the UI, but if one has a vocabulary I guess you can parse it. And secondly, it might not matter much. I'm just pointing out the obvious here: that the 2 columns are merged into one, and that it might be more natural to keep the IDs and then template the URL based on the type rather than parsing the URL to get the value. Either way, the UI and the csv disagree on what the ID is, as far as I can understand (is http://www.ncbi.nlm.nih.gov/nuccore/KJ139497 the ID or is KJ139497 the ID?)

Screenshot 2022-06-17 at 09 53 44
tucotuco commented 2 years ago

Unexpected: We do have the ID Values for both the Identifiers and for the relationships in entityIdentifier.

What I mean is simply that in the entityIdentifer csv the link column and the ID-Value columns are merged into one (as far as I can see).

line entityID entityIdentifier entityIdentifierType
20 21714980 JRD423 collector number
22 21714980 https://datadryad.org/stash/dataset/doi:10.5061/dryad.52mp1 Dryad DOI
23 21714980 http://www.ncbi.nlm.nih.gov/nuccore/KJ139497 GenBank

Which is slightly different from the UI, but if one has a vocabulary I guess you can parse it. And secondly it might not matter much. I'm just pointing out the obvious here: that the 2 columns are merged into one. And that it might be more natural to keep the IDs and then template the URL based in the type rather than parsing the url to get the value. Either way, the UI and the csv disagree on what is the ID as far as I can understand (is http://www.ncbi.nlm.nih.gov/nuccore/KJ139497 the id or is KJ139497 the ID?)

Screenshot 2022-06-17 at 09 53 44

@MortenHofft OK, I understand. Arctos uses the atomized (parsed) values as separate fields to aid in searching, but I opted not to map those separately as it does not add anything in the current context.

timrobertson100 commented 2 years ago

I think updating this for arctos_v2 becomes this, @MortenHofft :

query {
  # the entity can have many IDs, so we need to ask for the entity through an entity identifiers table
  specimensIDs: allEntityIdentifiers(condition: {
    entityIdentifier: "https://arctos.database.museum/guid/DMNS:Mamm:11098"
  }) {
    # there should only be one entity with this ID
    nodes {
      entityId
      entityIdentifier
      entityIdentifierType
      specimen: entityByEntityId {
        entityId
        entityType

        # Identifications, reached directly from the entity via identificationEntities
        identificationEntitiesByEntityId {
          totalCount
          nodes {
            identificationByIdentificationId {
              # taxa named by this identification
              taxonIdentificationsByIdentificationId {
                totalCount
                nodes {
                  taxonByTaxonId {
                    scientificName
                    kingdom
                    phylum
                    class
                    order
                    family
                    subfamily
                    # what happened to genus?
                    genericName
                    specificEpithet
                    infraspecificEpithet
                    # NOTE(review): this field name is spelled "scientifc" (not "scientific") - confirm it matches the schema
                    scientifcNameAuthorship
                    parentTaxonId # this should be linked in the graph as well
                    # The UI has a longer classification, but I assume that is just because they use a different taxonomy?
                  }
                }
              }

              verbatimIdentification
              # vernacular name: "Colorado chipmunk" is not in the data
              dateIdentified
              # who made the identification, and in which role
              identificationAgentRolesByIdentificationId {
                totalCount
                nodes {
                  agentId
                  identificationAgentRole
                  identificationAgentRoleBegan
                  identificationAgentRoleEnded
                  agentByAgentId {
                    preferredAgentName
                  }
                }
              }

              identificationType
              identificationVerificationStatus
              identificationRemarks

              taxaFormula # value: A - I do not know what it means, but it is perhaps meaningful
              isAcceptedIdentification

              # Citations section. I'm surprised this is sitting on the Identification. I would have thought it was attached to the specimen
              identificationCitationsByIdentificationId {
                nodes {
                  citationType
                  citationPageNumber
                  citationRemarks
                  # The citations reference the species name, but since they hang off the Identification it must come from there.
                  referenceByReferenceId {
                    referenceType
                    referenceDoi # I do not know enough about this, but are DOIs the only thing that is used to link?
                    bibliographicCitation # The UI shows something like "Bell et al. 2015" - but I assume that is just something that parses the full string?
                  }
                }
              }
            }
          }
        }

        # type-specific data for the material entity
        materialEntityByMaterialEntityId {
          materialEntityType
        }

        # Agents linked to the specimen and the role each one holds (collector and preparator)
        entityAgentRolesByEntityId {
          totalCount
          nodes {
            agentId
            entityAgentRole
            entityAgentRoleBegan
            entityAgentRoleEnded
            entityAgentRoleOrder
            # follow the agent id to get a display name
            agentByAgentId {
              preferredAgentName
            }
          }
        }

        # facts and measurements about the specimen
        entityAssertionsByEntityId {
          totalCount
          nodes {
            entityAssertionType
            entityAssertionValue
            entityAssertionValueNumeric
            entityAssertionUnit
            entityAssertionDate
            entityAssertionProtocol
            entityAssertionRemarks
            # the agent recorded against the assertion
            agentByEntityAssertionByAgentId {
              preferredAgentName
            }
          }
        }

        # Identifiers section
        # Of the 5 columns shown in the UI, "relationship", "ID value" and "assignedBy" are missing here. But they might be inferred from the others?
        entityIdentifiersByEntityId {
          nodes {
            entityIdentifier
            entityIdentifierType
          }
        }
        # the identifiers above include the parasites
        # I can also get to those via the related-entities path, like below
        entityRelationshipsBySubjectEntityId {
          nodes {
            externalObjectEntityId
          }
        }

        # Get media linked to this Cataloged Item.
        # The condition type is currently just the arctos free text, but I assume it would have a vocab
        # Search for entities with a relationship to the current item, and filter on media of the cataloged item.
        # Aliased as mediaItems because the same field is selected again for the parts with a different condition.
        mediaItems: entityRelationshipsByObjectEntityId(condition: {
          entityRelationshipType: "shows cataloged_item"
        }) {
          list: nodes {
            entityBySubjectEntityId {
              # assertions attached to the media entity (each field selected once)
              entityAssertionsByEntityId {
                nodes {
                  entityAssertionType
                  entityAssertionValue
                  entityAssertionValueNumeric
                  entityAssertionUnit
                  entityAssertionDate
                  entityAssertionProtocol
                  entityAssertionRemarks
                }
              }

              # typed data
              digitalEntityByDigitalEntityId {
                accessUri
                format
                webStatement
                digitalEntityType
              }
            }
          }
        }
        # media section END

        # get the event data
        entityEventsByEntityId {
          totalCount
          nodes {
            eventByEventId {
              eventType
              # I'm unable to find the agent that appears in the arctos site
              # verification status is left out it seems
              # collection source: wild is not to be found either
              eventDate
              verbatimEventDate

              # Location is selected once: place names, georeferences and elevation together.
              locationByLocationId {
                higherGeography
                locationAccordingTo # in the UI this is attached to the higher geography, which I think is also what the source is about
                locality
                georeferencesByLocationId { # I'm surprised to get back a list of georeferences for my location. How do I choose?
                  nodes {
                    decimalLatitude
                    decimalLongitude
                    preferredSpatialRepresentation # I guess this is primary_spatial_data: point-radius
                    geodeticDatum
                    coordinateUncertaintyInMeters
                    georeferenceSources # why is this plural?
                    georeferenceProtocol
                  }
                }
                minimumElevationInMeters
                maximumElevationInMeters
              }
              verbatimLocality
              # Associated Names - I do not see the data anywhere. Perhaps left out?

              verbatimLatitude # filled with collectionMethod
              verbatimLongitude # filled with habitat

              # collectionMethod not there # "verbatimLatitude": "Sherman trap" - I see it in latitude though
              habitat # not filled

              # list of images from event/place - those I cannot figure out how to get to. See https://github.com/timrobertson100/model-tests/issues/11

            }
          }
        }
        # event section END

        # Parts section
        # Entities whose relationship to this specimen is "part of".
        # This selection is unaliased; the media lookup on the same field uses the alias mediaItems.
        entityRelationshipsByObjectEntityId(condition: {
          entityRelationshipType: "part of"
        }) {
          nodes {
            entityBySubjectEntityId {
              entityType
              entityId
              digitalEntityByDigitalEntityId {
                digitalEntityId
              }
              collectionByCollectionId {
                institutionCode
              }

              # assertions recorded on the part itself
              entityAssertionsByEntityId {
                nodes {
                  entityAssertionType
                  entityAssertionValue
                  entityAssertionValueNumeric
                  entityAssertionUnit
                  entityAssertionProtocol
                  entityAssertionRemarks
                  entityAssertionDate
                  agentByEntityAssertionByAgentId {
                    preferredAgentName
                  }
                }
              }

              # typed data for the part
              materialEntityByMaterialEntityId {
                materialEntityType
              }
            }
          }
        }
        # Parts section END

      }
    }
  }
}