cedadev / search-futures

Future Search Architecture
BSD 2-Clause "Simplified" License
0 stars 0 forks source link

Move to use collection search #181

Open rhysrevans3 opened 1 year ago

rhysrevans3 commented 1 year ago

With the switch from asset search to collection search, the structure of the collection description will need to change.

Current collection description:

# Path list for the current description (a single CMIP6 data root).
paths:
  - /badc/cmip6/data/

# Asset-level section of the current description.
asset:
  extraction_methods:
    # NOTE(review): presumably assigns these fixed values to every asset;
    # confirm against the generator.
    - name: defaults
      inputs:
        license: CC-BY-SA-4.0
        permitted_use:
          - academic
          - educational
    # The pattern captures four underscore-separated fields, in order:
    # var_id, table_id, source_id, experiment_id.
    - name: regex
      inputs:
        regex: '^(?P<var_id>[^_]+)_(?P<table_id>[^_]+)_(?P<source_id>[^_]+)_(?P<experiment_id>[^_]+)'
      pre_processors:
        # NOTE(review): presumably reduces the input to its filename before
        # the regex runs; confirm.
        - name: filename_reducer
  post_extraction_methods:
    # Runs the `vocab` method with the cmip6 vocabulary, non-strict,
    # over the terms listed below.
    - name: vocab
      inputs:
        vocab: cmip6
        strict: false
        terms:
          - var_id
          - table_id
          - source_id
          - experiment_id
          - permitted_use

# Item-level section of the current description.
item:
  # The id is produced by the `hash` method from the three terms listed.
  id:
    method: hash
    inputs:
      terms:
        - table_id
        - source_id
        - experiment_id
  extraction_methods:
    # Aggregation over four facets; `var_id` is listed here in addition to
    # the three terms used for the id.
    - name: elasticsearch_aggregator
      inputs:
        list:
          - table_id
          - source_id
          - experiment_id
          - var_id

# Collection-level section of the current description.
collection:
  id:
    # NOTE(review): the `item` id in this snippet selects its method with
    # `method:`, whereas this block uses `name:`; confirm which key is
    # expected.
    name: defaults
    inputs: cmip6
  extraction_methods:
    # Aggregation over the three listed facets.
    - name: elasticsearch_aggregator
      inputs:
        list:
          - table_id
          - source_id
          - experiment_id

Option 1: include `type` and `sub_type` keys to track the level and the STAC types:

# Path list for the option 1 description (a single CMIP6 data root).
paths:
  - /badc/cmip6/data/

# Option 1, lowest level: declared with `type: item`.
item:
  type: item
  extraction_methods:
    # NOTE(review): presumably assigns these fixed values to every item;
    # confirm against the generator.
    - name: defaults
      inputs:
        license: CC-BY-SA-4.0
        permitted_use:
          - academic
          - educational
    # The pattern captures four underscore-separated fields, in order:
    # var_id, table_id, source_id, experiment_id.
    - name: regex
      inputs:
        regex: '^(?P<var_id>[^_]+)_(?P<table_id>[^_]+)_(?P<source_id>[^_]+)_(?P<experiment_id>[^_]+)'
      pre_processors:
        # NOTE(review): presumably reduces the input to its filename before
        # the regex runs; confirm.
        - name: filename_reducer
  post_extraction_methods:
    # Runs the `vocab` method with the cmip6 vocabulary, non-strict,
    # over the terms listed below.
    - name: vocab
      inputs:
        vocab: cmip6
        strict: false
        terms:
          - var_id
          - table_id
          - source_id
          - experiment_id
          - permitted_use
  # Open question from the proposal, kept as a placeholder list entry.
  assets:
    - How to extract the assets?

# Option 1, middle level: a collection whose `sub_type` is `item`.
data_collection:
  type: collection
  sub_type: item
  # The id is produced by the `hash` method from the three terms listed.
  id:
    method: hash
    inputs:
      terms:
        - table_id
        - source_id
        - experiment_id
  extraction_methods:
    # Aggregation over four facets; `var_id` is listed here in addition to
    # the three terms used for the id.
    - name: elasticsearch_aggregator
      inputs:
        list:
          - table_id
          - source_id
          - experiment_id
          - var_id

# Option 1, top level: a collection whose `sub_type` is `data_collection`.
super_collection:
  type: collection
  sub_type: data_collection
  id:
    # NOTE(review): the `data_collection` id in this snippet selects its
    # method with `method:`, whereas this block uses `name:`; confirm which
    # key is expected.
    name: defaults
    inputs: cmip6
  extraction_methods:
    # Aggregation over the three listed facets.
    - name: elasticsearch_aggregator
      inputs:
        list:
          - table_id
          - source_id
          - experiment_id

Option 2: a nested structure, where each level defines its sub-types under `subs`:

# Path list for the option 2 description (a single CMIP6 data root).
paths:
  - /badc/cmip6/data/

# Option 2: top-down nesting. Each level lists its children under `subs`.
# Levels here: super_collection -> data_collection -> item.
collection:
  name: super_collection
  id:
    # NOTE(review): the nested data_collection id below selects its method
    # with `method:`, whereas this id uses `name:`; confirm which key is
    # expected.
    name: defaults
    inputs: cmip6
  extraction_methods:
    # Aggregation over the three listed facets.
    - name: elasticsearch_aggregator
      inputs:
        list:
          - table_id
          - source_id
          - experiment_id

  subs:
    collection:
      name: data_collection
      # The id is produced by the `hash` method from the three terms listed.
      id:
        method: hash
        inputs:
          terms:
            - table_id
            - source_id
            - experiment_id
      extraction_methods:
        # Aggregation over four facets; `var_id` is listed here in addition
        # to the three terms used for the id.
        - name: elasticsearch_aggregator
          inputs:
            list:
              - table_id
              - source_id
              - experiment_id
              - var_id

      subs:
        item:
          name: item
          extraction_methods:
            # NOTE(review): presumably assigns these fixed values to every
            # item; confirm against the generator.
            - name: defaults
              inputs:
                license: CC-BY-SA-4.0
                permitted_use:
                  - academic
                  - educational
            # The pattern captures four underscore-separated fields, in
            # order: var_id, table_id, source_id, experiment_id.
            - name: regex
              inputs:
                regex: '^(?P<var_id>[^_]+)_(?P<table_id>[^_]+)_(?P<source_id>[^_]+)_(?P<experiment_id>[^_]+)'
              pre_processors:
                # NOTE(review): presumably reduces the input to its filename
                # before the regex runs; confirm.
                - name: filename_reducer
          post_extraction_methods:
            # Runs the `vocab` method with the cmip6 vocabulary, non-strict,
            # over the terms listed below.
            - name: vocab
              inputs:
                vocab: cmip6
                strict: false
                terms:
                  - var_id
                  - table_id
                  - source_id
                  - experiment_id
                  - permitted_use
          # Open question from the proposal, kept as a placeholder list entry.
          assets:
            - How to extract the assets?

Option 3: reverse nesting, where each level declares its `parent`:

# Path list for the option 3 description (a single CMIP6 data root).
paths:
  - /badc/cmip6/data/

# Option 3: bottom-up nesting. Each level names its container under `parent`.
# Levels here: item -> data_collection -> super_collection.
item:
  name: item
  extraction_methods:
    # NOTE(review): presumably assigns these fixed values to every item;
    # confirm against the generator.
    - name: defaults
      inputs:
        license: CC-BY-SA-4.0
        permitted_use:
          - academic
          - educational
    # The pattern captures four underscore-separated fields, in order:
    # var_id, table_id, source_id, experiment_id.
    - name: regex
      inputs:
        regex: '^(?P<var_id>[^_]+)_(?P<table_id>[^_]+)_(?P<source_id>[^_]+)_(?P<experiment_id>[^_]+)'
      pre_processors:
        # NOTE(review): presumably reduces the input to its filename before
        # the regex runs; confirm.
        - name: filename_reducer
  post_extraction_methods:
    # Runs the `vocab` method with the cmip6 vocabulary, non-strict,
    # over the terms listed below.
    - name: vocab
      inputs:
        vocab: cmip6
        strict: false
        terms:
          - var_id
          - table_id
          - source_id
          - experiment_id
          - permitted_use
  # Open question from the proposal, kept as a placeholder list entry.
  assets:
    - How to extract the assets?

  parent:
    collection:
      name: data_collection
      # The id is produced by the `hash` method from the three terms listed.
      id:
        method: hash
        inputs:
          terms:
            - table_id
            - source_id
            - experiment_id
      extraction_methods:
        # Aggregation over four facets; `var_id` is listed here in addition
        # to the three terms used for the id.
        - name: elasticsearch_aggregator
          inputs:
            list:
              - table_id
              - source_id
              - experiment_id
              - var_id

      parent:
        collection:
          name: super_collection
          id:
            # NOTE(review): the data_collection id above selects its method
            # with `method:`, whereas this id uses `name:`; confirm which
            # key is expected.
            name: defaults
            inputs: cmip6
          extraction_methods:
            # Aggregation over the three listed facets.
            - name: elasticsearch_aggregator
              inputs:
                list:
                  - table_id
                  - source_id
                  - experiment_id