Open nadove-ucsc opened 1 year ago
Spike to provide details.
I don't remember what we had in mind here, or what, exactly, we thought the issue was.
I think this was motivated by https://github.com/DataBiosphere/azul/pull/4849 ?
@hannes-ucsc: "I remember now. Maybe we should consider explicitly listing fields for which we do not want a facet. With that we can then enforce that every field is mentioned in the facet config, either as a proper facet or as a field without a facet."
@hannes-ucsc to decide
Here's a sketch that eliminates .facets
altogether
Index: test/service/test_response.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/test/service/test_response.py b/test/service/test_response.py
--- a/test/service/test_response.py (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/test/service/test_response.py (date 1673317000369)
@@ -60,6 +60,7 @@
configure_test_logging,
)
from azul.plugins import (
+ Field,
FieldPath,
)
from azul.plugins.metadata.hca.service.response import (
@@ -3444,11 +3445,7 @@
super().tearDownClass()
@property
- def facets(self) -> Sequence[str]:
- return self.app_module.app.metadata_plugin.facets
-
- @property
- def field_mapping(self) -> Mapping[str, FieldPath]:
+ def field_mapping(self) -> Mapping[str, Field]:
return self.app_module.app.metadata_plugin.field_mapping
def entity_types(self) -> list[str]:
@@ -3479,8 +3476,9 @@
'order': 'asc'
},
'termFacets': {
- facet: {'terms': [], 'total': 0, 'type': 'terms'}
- for facet in self.facets
+ field_name: {'terms': [], 'total': 0, 'type': 'terms'}
+ for field_name, field in self.field_mapping
+ if field.is_facet
}
}
self.assertEqual(expected_response, response.json())
Index: src/azul/service/elasticsearch_service.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/service/elasticsearch_service.py b/src/azul/service/elasticsearch_service.py
--- a/src/azul/service/elasticsearch_service.py (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/src/azul/service/elasticsearch_service.py (date 1673316917088)
@@ -275,13 +275,12 @@
return aggregation_stage.wrap(chain)
def prepare_request(self, request: Search) -> Search:
- field_mapping = self.plugin.field_mapping
- for facet in self.plugin.facets:
- # FIXME: Aggregation filters may be redundant when post_filter is false
- # https://github.com/DataBiosphere/azul/issues/3435
- aggregate = self._prepare_aggregation(facet=facet,
- facet_path=field_mapping[facet])
- request.aggs.bucket(facet, aggregate)
+ for field_name, field in self.plugin.field_mapping:
+ if field.is_facet:
+ # FIXME: Aggregation filters may be redundant when post_filter is false
+ # https://github.com/DataBiosphere/azul/issues/3435
+ aggregate = self._prepare_aggregation(facet=field_name, facet_path=field.path)
+ request.aggs.bucket(field_name, aggregate)
self._annotate_aggs_for_translation(request)
return request
Index: src/azul/plugins/metadata/hca/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py
--- a/src/azul/plugins/metadata/hca/__init__.py (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/src/azul/plugins/metadata/hca/__init__.py (date 1673317044304)
@@ -247,46 +247,6 @@
def source_id_field(self) -> str:
return 'sourceId'
- @property
- def facets(self) -> Sequence[str]:
- return [
- 'organ',
- 'organPart',
- 'modelOrgan',
- 'modelOrganPart',
- 'effectiveOrgan',
- 'specimenOrgan',
- 'specimenOrganPart',
- 'sampleEntityType',
- 'libraryConstructionApproach',
- 'nucleicAcidSource',
- 'genusSpecies',
- 'organismAge',
- 'biologicalSex',
- 'sampleDisease',
- 'specimenDisease',
- 'donorDisease',
- 'developmentStage',
- 'instrumentManufacturerModel',
- 'pairedEnd',
- 'workflow',
- 'assayType',
- 'project',
- 'fileFormat',
- 'fileSource',
- 'isIntermediate',
- 'contentDescription',
- 'laboratory',
- 'preservationMethod',
- 'projectTitle',
- 'cellLineType',
- 'selectedCellType',
- 'projectDescription',
- 'institution',
- 'contactName',
- 'publicationTitle'
- ]
-
@property
def manifest(self) -> ManifestConfig:
return {
Index: test/service/test_request_builder.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/test/service/test_request_builder.py b/test/service/test_request_builder.py
--- a/test/service/test_request_builder.py (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/test/service/test_request_builder.py (date 1673317067216)
@@ -86,10 +86,6 @@
def manifest(self) -> ManifestConfig:
return {}
- @property
- def facets(self) -> Sequence[str]:
- return []
-
sources_filter = {
"constant_score": {
"filter": {
Index: src/azul/plugins/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py
--- a/src/azul/plugins/__init__.py (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/src/azul/plugins/__init__.py (date 1673317089227)
@@ -41,6 +41,7 @@
from azul.chalice import (
Authentication,
)
+from azul.collections import OrderedSet
from azul.drs import (
DRSClient,
)
@@ -83,7 +84,14 @@
FieldPathElement = str
FieldPath = tuple[FieldPathElement, ...]
-FieldMapping = Mapping[FieldName, FieldPath]
+
+@attr.s(frozen=True, kw_only=False, auto_attribs=True)
+class Field:
+ path: FieldPath
+ is_facet: bool = True
+
+
+FieldMapping = Mapping[FieldName, Field]
ColumnMapping = Mapping[FieldPathElement, FieldName]
ManifestConfig = Mapping[FieldPath, ColumnMapping]
@@ -318,8 +326,13 @@
"""
raise NotImplementedError
+ @attr.s(frozen=True, kw_only=False, auto_attribs=True)
+ class _Field:
+ name: str
+ is_facet: bool = True
+
#: See :meth:`_field_mapping`
- _FieldMapping2 = Mapping[FieldPathElement, FieldName]
+ _FieldMapping2 = Mapping[FieldPathElement, _Field]
_FieldMapping1 = Mapping[FieldPathElement, Union[FieldName, _FieldMapping2]]
_FieldMapping = Mapping[FieldPathElement, Union[FieldName, _FieldMapping1]]
@@ -332,21 +345,22 @@
def invert(v: MetadataPlugin._FieldMapping,
*path: FieldPathElement
- ) -> Iterable[tuple[FieldName, FieldPath]]:
+ ) -> Iterable[tuple[MetadataPlugin._Field, FieldPath]]:
if isinstance(v, dict):
for k, v in v.items():
assert isinstance(k, FieldPathElement)
yield from invert(v, *path, k)
- elif isinstance(v, FieldName):
+ elif isinstance(v, MetadataPlugin._Field):
yield v, path
else:
assert False, v
inversion = {}
- for v, path in invert(self._field_mapping):
- other_path = inversion.setdefault(v, path)
- assert other_path == path, (
- f'Field {v!r} has conflicting paths', path, other_path
+ for _field, path in invert(self._field_mapping):
+ field = Field(path, is_facet=_field.is_facet)
+ other_field = inversion.setdefault(_field.name, field)
+ assert other_field == field, (
+ f'Field {_field!r} has conflicts', field, other_field
)
return inversion
@@ -365,11 +379,6 @@
@abstractmethod
def source_id_field(self) -> str:
raise NotImplementedError
-
- @property
- @abstractmethod
- def facets(self) -> Sequence[str]:
- raise NotImplementedError
@property
@abstractmethod
Index: src/azul/plugins/metadata/anvil/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py
--- a/src/azul/plugins/metadata/anvil/__init__.py (revision 0b30d2f952abb368e2c220932291f5490ef64d2a)
+++ b/src/azul/plugins/metadata/anvil/__init__.py (date 1673317078835)
@@ -89,8 +89,9 @@
'document_id',
'source_datarepo_row_ids'
]
+ f = MetadataPlugin._Field
return {
- 'entity_id': 'entryId',
+ 'entity_id': f('entryId', is_facet=False),
'bundles': {
'uuid': 'bundleUuid',
'version': 'bundleVersion'
@@ -180,27 +181,6 @@
def source_id_field(self) -> str:
return 'sourceId'
- @property
- def facets(self) -> Sequence[str]:
- return [
- 'activities.activity_type',
- 'activities.assay_type',
- 'activities.data_modality',
- 'biosamples.anatomical_site',
- 'biosamples.biosample_type',
- 'biosamples.disease',
- 'datasets.consent_group',
- 'datasets.data_use_permission',
- 'datasets.registered_identifier',
- 'datasets.title',
- 'donors.organism_type',
- 'donors.phenotypic_sex',
- 'donors.reported_ethnicity',
- 'files.data_modality',
- 'files.file_format',
- 'files.reference_assembly',
- ]
-
@property
def manifest(self) -> ManifestConfig:
return {
What's missing is
1) updating all call sites of .field_mapping
to reflect that the values in the returned dictionary are now Field
instances
2) annotating the concrete _field_mapping
implementations to reflect which fields are facets.