DataBiosphere / azul

Metadata indexer and query service used for AnVIL, HCA, LungMAP, and CGP
Apache License 2.0
7 stars 2 forks source link

Programatically ensure consistency of fields and facets #4852

Open nadove-ucsc opened 1 year ago

nadove-ucsc commented 1 year ago
theathorn commented 1 year ago

Spike to provide details.

hannes-ucsc commented 1 year ago

I don't remember what we had in mind here, or what, exactly, we thought the issue was.

nadove-ucsc commented 1 year ago

I think this was motivated by https://github.com/DataBiosphere/azul/pull/4849 ?

achave11-ucsc commented 1 year ago

@hannes-ucsc: "I remember now. Maybe we should consider explicitly listing fields for which we do not want a facet. With that we can then enforce that every field is mentioned in the facet config, either as a proper facet or as a field without a facet."

achave11-ucsc commented 1 year ago

@hannes-ucsc to decide

hannes-ucsc commented 1 year ago

Here's a sketch that eliminates .facets altogether

Index: test/service/test_response.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/test/service/test_response.py b/test/service/test_response.py
--- a/test/service/test_response.py (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/test/service/test_response.py (date 1673317000369)
@@ -60,6 +60,7 @@
     configure_test_logging,
 )
 from azul.plugins import (
+    Field,
     FieldPath,
 )
 from azul.plugins.metadata.hca.service.response import (
@@ -3444,11 +3445,7 @@
         super().tearDownClass()

     @property
-    def facets(self) -> Sequence[str]:
-        return self.app_module.app.metadata_plugin.facets
-
-    @property
-    def field_mapping(self) -> Mapping[str, FieldPath]:
+    def field_mapping(self) -> Mapping[str, Field]:
         return self.app_module.app.metadata_plugin.field_mapping

     def entity_types(self) -> list[str]:
@@ -3479,8 +3476,9 @@
                         'order': 'asc'
                     },
                     'termFacets': {
-                        facet: {'terms': [], 'total': 0, 'type': 'terms'}
-                        for facet in self.facets
+                        field_name: {'terms': [], 'total': 0, 'type': 'terms'}
+                        for field_name, field in self.field_mapping
+                        if field.is_facet
                     }
                 }
                 self.assertEqual(expected_response, response.json())
Index: src/azul/service/elasticsearch_service.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/service/elasticsearch_service.py b/src/azul/service/elasticsearch_service.py
--- a/src/azul/service/elasticsearch_service.py (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/src/azul/service/elasticsearch_service.py (date 1673316917088)
@@ -275,13 +275,12 @@
         return aggregation_stage.wrap(chain)

     def prepare_request(self, request: Search) -> Search:
-        field_mapping = self.plugin.field_mapping
-        for facet in self.plugin.facets:
-            # FIXME: Aggregation filters may be redundant when post_filter is false
-            #        https://github.com/DataBiosphere/azul/issues/3435
-            aggregate = self._prepare_aggregation(facet=facet,
-                                                  facet_path=field_mapping[facet])
-            request.aggs.bucket(facet, aggregate)
+        for field_name, field in self.plugin.field_mapping:
+            if field.is_facet:
+                # FIXME: Aggregation filters may be redundant when post_filter is false
+                #        https://github.com/DataBiosphere/azul/issues/3435
+                aggregate = self._prepare_aggregation(facet=field_name, facet_path=field.path)
+                request.aggs.bucket(field_name, aggregate)
         self._annotate_aggs_for_translation(request)
         return request

Index: src/azul/plugins/metadata/hca/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py
--- a/src/azul/plugins/metadata/hca/__init__.py (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/src/azul/plugins/metadata/hca/__init__.py (date 1673317044304)
@@ -247,46 +247,6 @@
     def source_id_field(self) -> str:
         return 'sourceId'

-    @property
-    def facets(self) -> Sequence[str]:
-        return [
-            'organ',
-            'organPart',
-            'modelOrgan',
-            'modelOrganPart',
-            'effectiveOrgan',
-            'specimenOrgan',
-            'specimenOrganPart',
-            'sampleEntityType',
-            'libraryConstructionApproach',
-            'nucleicAcidSource',
-            'genusSpecies',
-            'organismAge',
-            'biologicalSex',
-            'sampleDisease',
-            'specimenDisease',
-            'donorDisease',
-            'developmentStage',
-            'instrumentManufacturerModel',
-            'pairedEnd',
-            'workflow',
-            'assayType',
-            'project',
-            'fileFormat',
-            'fileSource',
-            'isIntermediate',
-            'contentDescription',
-            'laboratory',
-            'preservationMethod',
-            'projectTitle',
-            'cellLineType',
-            'selectedCellType',
-            'projectDescription',
-            'institution',
-            'contactName',
-            'publicationTitle'
-        ]
-
     @property
     def manifest(self) -> ManifestConfig:
         return {
Index: test/service/test_request_builder.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/test/service/test_request_builder.py b/test/service/test_request_builder.py
--- a/test/service/test_request_builder.py  (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/test/service/test_request_builder.py  (date 1673317067216)
@@ -86,10 +86,6 @@
         def manifest(self) -> ManifestConfig:
             return {}

-        @property
-        def facets(self) -> Sequence[str]:
-            return []
-
     sources_filter = {
         "constant_score": {
             "filter": {
Index: src/azul/plugins/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py
--- a/src/azul/plugins/__init__.py  (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/src/azul/plugins/__init__.py  (date 1673317089227)
@@ -41,6 +41,7 @@
 from azul.chalice import (
     Authentication,
 )
+from azul.collections import OrderedSet
 from azul.drs import (
     DRSClient,
 )
@@ -83,7 +84,14 @@
 FieldPathElement = str
 FieldPath = tuple[FieldPathElement, ...]

-FieldMapping = Mapping[FieldName, FieldPath]
+
+@attr.s(frozen=True, kw_only=False, auto_attribs=True)
+class Field:
+    path: FieldPath
+    is_facet: bool = True
+
+
+FieldMapping = Mapping[FieldName, Field]

 ColumnMapping = Mapping[FieldPathElement, FieldName]
 ManifestConfig = Mapping[FieldPath, ColumnMapping]
@@ -318,8 +326,13 @@
         """
         raise NotImplementedError

+    @attr.s(frozen=True, kw_only=False, auto_attribs=True)
+    class _Field:
+        name: str
+        is_facet: bool = True
+
     #: See :meth:`_field_mapping`
-    _FieldMapping2 = Mapping[FieldPathElement, FieldName]
+    _FieldMapping2 = Mapping[FieldPathElement, _Field]
     _FieldMapping1 = Mapping[FieldPathElement, Union[FieldName, _FieldMapping2]]
     _FieldMapping = Mapping[FieldPathElement, Union[FieldName, _FieldMapping1]]

@@ -332,21 +345,22 @@

         def invert(v: MetadataPlugin._FieldMapping,
                    *path: FieldPathElement
-                   ) -> Iterable[tuple[FieldName, FieldPath]]:
+                   ) -> Iterable[tuple[MetadataPlugin._Field, FieldPath]]:
             if isinstance(v, dict):
                 for k, v in v.items():
                     assert isinstance(k, FieldPathElement)
                     yield from invert(v, *path, k)
-            elif isinstance(v, FieldName):
+            elif isinstance(v, MetadataPlugin._Field):
                 yield v, path
             else:
                 assert False, v

         inversion = {}
-        for v, path in invert(self._field_mapping):
-            other_path = inversion.setdefault(v, path)
-            assert other_path == path, (
-                f'Field {v!r} has conflicting paths', path, other_path
+        for _field, path in invert(self._field_mapping):
+            field = Field(path, is_facet=_field.is_facet)
+            other_field = inversion.setdefault(_field.name, field)
+            assert other_field == field, (
+                f'Field {_field!r} has conflicts', field, other_field
             )
         return inversion

@@ -365,11 +379,6 @@
     @abstractmethod
     def source_id_field(self) -> str:
         raise NotImplementedError
-
-    @property
-    @abstractmethod
-    def facets(self) -> Sequence[str]:
-        raise NotImplementedError

     @property
     @abstractmethod
Index: src/azul/plugins/metadata/anvil/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py
--- a/src/azul/plugins/metadata/anvil/__init__.py   (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/src/azul/plugins/metadata/anvil/__init__.py   (date 1673317078835)
@@ -89,8 +89,9 @@
             'document_id',
             'source_datarepo_row_ids'
         ]
+        f = MetadataPlugin._Field
         return {
-            'entity_id': 'entryId',
+            'entity_id': f('entryId', is_facet=False),
             'bundles': {
                 'uuid': 'bundleUuid',
                 'version': 'bundleVersion'
@@ -180,27 +181,6 @@
     def source_id_field(self) -> str:
         return 'sourceId'

-    @property
-    def facets(self) -> Sequence[str]:
-        return [
-            'activities.activity_type',
-            'activities.assay_type',
-            'activities.data_modality',
-            'biosamples.anatomical_site',
-            'biosamples.biosample_type',
-            'biosamples.disease',
-            'datasets.consent_group',
-            'datasets.data_use_permission',
-            'datasets.registered_identifier',
-            'datasets.title',
-            'donors.organism_type',
-            'donors.phenotypic_sex',
-            'donors.reported_ethnicity',
-            'files.data_modality',
-            'files.file_format',
-            'files.reference_assembly',
-        ]
-
     @property
     def manifest(self) -> ManifestConfig:
         return {

What's missing is

1) updating all call sites of .field_mapping to reflect that the values in the returned dictionary are now Field instances

2) annotating the concrete _field_mapping implementations to reflect which fields are facets.