microbiomedata / nmdc-schema

National Microbiome Data Collaborative (NMDC) unified data model
https://microbiomedata.github.io/nmdc-schema/
Creative Commons Zero v1.0 Universal
27 stars 8 forks source link

reconcile `id_prefixes` and `pattern`s #1125

Closed turbomam closed 1 year ago

turbomam commented 1 year ago
import pprint

from linkml_runtime import SchemaView

def _record_constraints(element, key, registry):
    """Copy the non-empty `id_prefixes` / `pattern` of *element* into *registry*.

    Args:
        element: a schema element (or slot_usage entry) that supports
            ``in`` and ``[]`` access by metaslot name.
        key: label under which the constraints are stored in *registry*.
        registry: dict mapping labels to ``{'id_prefixes': ..., 'pattern': ...}``
            dicts; updated in place. An entry is only created when at least
            one of the two constraints is present and truthy.
    """
    if 'id_prefixes' in element and element['id_prefixes']:
        # sorted() returns a new list, so the list owned by the schema
        # element is left in its original order (list.sort() would reorder
        # the loaded schema itself).
        registry.setdefault(key, {})['id_prefixes'] = sorted(element['id_prefixes'])
    if 'pattern' in element and element['pattern']:
        registry.setdefault(key, {})['pattern'] = element['pattern']


# NOTE: relative path -- resolved against the current working directory.
schema_file = '../nmdc_schema/nmdc_schema_merged.yaml'

schema_view = SchemaView(schema_file)

schema_elements = schema_view.all_elements()

# Maps "<ElementType> <element name>" (for elements themselves) or
# "<slot name> used in <element name>" (for slot_usage entries) to the
# id_prefixes and/or pattern asserted there.
constraints_dict = {}
for ek, ev in schema_elements.items():
    # Elements whose name contains '_at_time' are deliberately left out of
    # the report (including their slot_usage).
    if '_at_time' in ek:
        continue

    et = type(ev).__name__
    _record_constraints(ev, f"{et} {ek}", constraints_dict)

    if 'slot_usage' in ev and ev['slot_usage']:
        for uk, uv in ev['slot_usage'].items():
            _record_constraints(uv, f"{uk} used in {ek}", constraints_dict)

pprint.pprint(constraints_dict)
turbomam commented 1 year ago
{'ClassDefinition ChemicalEntity': {'id_prefixes': ['CHEBI',
                                                    'CHEMBL.COMPOUND',
                                                    'DRUGBANK',
                                                    'HMDB',
                                                    'KEGG.COMPOUND',
                                                    'MESH',
                                                    'PUBCHEM.COMPOUND',
                                                    'cas']},
 'ClassDefinition GeneProduct': {'id_prefixes': ['PR', 'UniProtKB', 'gtpo']},
 'ClassDefinition OrthologyGroup': {'id_prefixes': ['CATH',
                                                    'EGGNOG',
                                                    'KEGG.ORTHOLOGY',
                                                    'PANTHER.FAMILY',
                                                    'PFAM',
                                                    'SUPFAM',
                                                    'TIGRFAM']},
 'ClassDefinition Pathway': {'id_prefixes': ['COG', 'KEGG_PATHWAY']},
 'ClassDefinition Reaction': {'id_prefixes': ['EC',
                                              'GO',
                                              'KEGG.REACTION',
                                              'MetaCyc',
                                              'MetaNetX',
                                              'RHEA',
                                              'RetroRules',
                                              'SEED']},
 'SlotDefinition alternative_identifiers': {'pattern': '^[a-zA-Z0-9][a-zA-Z0-9_\\.]+:[a-zA-Z0-9_][a-zA-Z0-9_\\-\\/\\.,]*$'},
 'SlotDefinition dna_cont_well': {'pattern': '^(?!A1|A12|H1|H12)(([A-H][1-9])|([A-H]1[0-2]))$'},
 'SlotDefinition dois': {'pattern': '^doi:10.\\d{2,9}/.*$'},
 'SlotDefinition external_database_identifiers': {'pattern': '^[a-zA-Z0-9][a-zA-Z0-9_\\.]+:[a-zA-Z0-9_][a-zA-Z0-9_\\-\\/\\.,]*$'},
 'SlotDefinition gnps_task_identifiers': {'pattern': '^gnps\\.task:[a-f0-9]+$'},
 'SlotDefinition gold_analysis_project_identifiers': {'pattern': '^gold:Ga[0-9]+$'},
 'SlotDefinition gold_biosample_identifiers': {'pattern': '^gold:Gb[0-9]+$'},
 'SlotDefinition gold_sequencing_project_identifiers': {'pattern': '^gold:Gp[0-9]+$'},
 'SlotDefinition gold_study_identifiers': {'pattern': '^gold:Gs[0-9]+$'},
 'SlotDefinition id': {'pattern': '^[a-zA-Z0-9][a-zA-Z0-9_\\.]+:[a-zA-Z0-9_][a-zA-Z0-9_\\-\\/\\.,]*$'},
 'SlotDefinition img_identifiers': {'pattern': '^img\\.taxon:[a-zA-Z0-9_][a-zA-Z0-9_\\/\\.]*$'},
 'SlotDefinition insdc_analysis_identifiers': {'pattern': '^insdc.sra:(E|D|S)RR[0-9]{6,}$'},
 'SlotDefinition insdc_assembly_identifiers': {'pattern': '^insdc.sra:[A-Z]+[0-9]+(\\.[0-9]+)?$'},
 'SlotDefinition insdc_bioproject_identifiers': {'pattern': '^bioproject:PRJ[DEN][A-Z][0-9]+$'},
 'SlotDefinition insdc_biosample_identifiers': {'pattern': '^biosample:SAM[NED]([A-Z])?[0-9]+$'},
 'SlotDefinition insdc_experiment_identifiers': {'pattern': '^insdc.sra:(E|D|S)RX[0-9]{6,}$'},
 'SlotDefinition insdc_secondary_sample_identifiers': {'pattern': '^biosample:(E|D|S)RS[0-9]{6,}$'},
 'SlotDefinition insdc_sra_ena_study_identifiers': {'pattern': '^insdc.sra:(E|D|S)RP[0-9]{6,}$'},
 'SlotDefinition jgi_portal_study_identifiers': {'id_prefixes': ['jgi.proposal'],
                                                 'pattern': '^jgi.proposal:\\d+$'},
 'SlotDefinition massive_study_identifiers': {'pattern': '^MASSIVE:'},
 'SlotDefinition mgnify_project_identifiers': {'pattern': '^mgnify.proj:[A-Z]+[0-9]+$'},
 'SlotDefinition pres_animal_insect': {'pattern': '^(cat|dog|rodent|snake|other);\\d+$'},
 'SlotDefinition rna_cont_well': {'pattern': '^(?!A1|A12|H1|H12)(([A-H][1-9])|([A-H]1[0-2]))$'},
 'TypeDefinition external_identifier': {'pattern': '^[a-zA-Z0-9][a-zA-Z0-9_\\.]+:[a-zA-Z0-9_][a-zA-Z0-9_\\-\\/\\.,]*$'},
 'fire used in Biosample': {'pattern': '^[12]\\d{3}(?:(?:-(?:0[1-9]|1[0-2]))(?:-(?:0[1-9]|[12]\\d|3[01]))?)?(\\s+to\\s+[12]\\d{3}(?:(?:-(?:0[1-9]|1[0-2]))(?:-(?:0[1-9]|[12]\\d|3[01]))?)?)?$'},
 'has_function used in FunctionalAnnotation': {'pattern': '^(KEGG_PATHWAY:\\w{2,4}\\d{5}|KEGG.REACTION:R\\d+|RHEA:\\d{5}|MetaCyc:[A-Za-z0-9+_.%-:]+|EC:\\d{1,2}(\\.\\d{0,3}){0,3}|GO:\\d{7}|MetaNetX:(MNXR\\d+|EMPTY)|SEED:\\w+|KEGG\\.ORTHOLOGY:K\\d+|EGGNOG:\\w+|PFAM:PF\\d{5}|TIGRFAM:TIGR\\d+|SUPFAM:\\w+|CATH:[1-6]\\.[0-9]+\\.[0-9]+\\.[0-9]+|PANTHER.FAMILY:PTHR\\d{5}(\\:SF\\d{1,3})?)$'},
 'id used in OntologyClass': {'pattern': '^[a-zA-Z0-9][a-zA-Z0-9_\\.]+:[a-zA-Z0-9_][a-zA-Z0-9_\\-\\/\\.,]*$'}}
turbomam commented 1 year ago

See `nmdc_schema/list_id_prefixes_and_patterns.py`.