Closed adam-collins closed 9 months ago
This is working to add a new count field:
Add a CloneFieldUpdateProcessorFactory processor and a CountFieldValuesUpdateProcessorFactory processor for each count field to solrconfig.xml:
<!--
  Default update chain that maintains a value count for a multivalued field.
  For every incoming document it:
    1. clones the values of imageIDs into imageIDsCount,
    2. replaces the cloned values with the number of values,
    3. falls back to 0 when the document had no imageIDs at all,
  and then hands the document to the standard log / distributed / run processors.
  Repeat steps 1 to 3 for each additional count field.
-->
<updateRequestProcessorChain name="multivalue-counts" default="true">
  <!-- Step 1: copy the source values into the count field. -->
  <processor class="solr.CloneFieldUpdateProcessorFactory">
    <str name="source">imageIDs</str>
    <str name="dest">imageIDsCount</str>
  </processor>
  <!-- Step 2: collapse the copied values into a single integer count. -->
  <processor class="solr.CountFieldValuesUpdateProcessorFactory">
    <str name="fieldName">imageIDsCount</str>
  </processor>
  <!--
    Step 3: when imageIDs is absent nothing is cloned, so there is nothing to
    count and imageIDsCount would be missing. Default it to 0 so that range
    queries and facets on the count field also cover records without images.
  -->
  <processor class="solr.DefaultValueUpdateProcessorFactory">
    <str name="fieldName">imageIDsCount</str>
    <int name="value">0</int>
  </processor>
  <processor class="solr.LogUpdateProcessorFactory" />
  <processor class="solr.DistributedUpdateProcessorFactory" />
  <processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
managed-schema
<!-- Single-valued count field; it is the dest of the clone step and the field counted by the multivalue-counts update chain. -->
<field name="imageIDsCount" type="int" docValues="true" multiValued="false" indexed="true" />
List of existing multivalue fields. Not all of these will require a count:
<!-- Existing multivalued fields: every one is indexed with docValues; all are strings except occurrenceYear (date). -->
<field name="speciesGroup" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="speciesSubgroup" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="speciesListUid" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="imageIDs" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="soundIDs" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="videoIDs" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="multimedia" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="multimediaLicense" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="assertions" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="geospatialIssues" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="taxonomicIssues" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="occurrenceYear" type="date" indexed="true" docValues="true" multiValued="true" />
<field name="datasetID" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="datasetName" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="identifiedBy" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="lifeStageLineage" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="otherCatalogNumbers" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="outlierLayer" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="preparations" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="recordedBy" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="samplingProtocol" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="typeStatus" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="isRepresentativeOf" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="isDuplicateOf" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="duplicateType" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="duplicateJustification" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="identifiedByID" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="recordedByID" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="dataHubUid" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="dataHubName" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="assertionUserId" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="contentTypes" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="machineTags" type="string" indexed="true" docValues="true" multiValued="true" />
Requires discussion about what multivalue fields should have a count.
I'll make a call:
<!-- Multivalued fields selected to receive a companion count field. -->
<field name="imageIDs" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="soundIDs" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="videoIDs" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="multimedia" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="assertions" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="geospatialIssues" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="taxonomicIssues" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="otherCatalogNumbers" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="typeStatus" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="isRepresentativeOf" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="isDuplicateOf" type="string" indexed="true" docValues="true" multiValued="true" />
<field name="contentTypes" type="string" indexed="true" docValues="true" multiValued="true" />
Could the requirement be achieved with the use of child documents for images?
The recent work on adding publications (yet to be merged into biocache-service) makes use of child documents for links to publications.
I'm wondering if the same approach for image metadata would support the count required.
The purpose of the Count field is for use in query filtering, `q=imageIDsCount:[2 TO *]`, and faceting, `facets=imageIDsCount`. It also has some use as a stored field to reduce the end user processing on a download to get the same information.
I do not think that a one-to-many child document relationship would support all of these requirements.
As a separate task I think storing the image metadata would be useful. This raises the question for downloads: do child documents get flattened and included in the single CSV, included as a separate file (e.g. DwCA style), excluded, or something else?
This raises the question for downloads, do child documents get flattened and included in the single csv, included as a separate file (e.g. DwCA style), excluded, or something else?
Probably the DwCA style makes the most sense. For downloads, I am thinking we can use a mechanism similar to what's in use for events, which is based on pipelines. This is more scalable.
in prod
as per https://atlaslivingaustralia.slack.com/archives/CBYEJ2G49/p1680152183456459
The images field in biocache stores zero or more image identifiers per record, with multiple identifiers separated by |. Can we generate a new biocache field that provides a count of the number of identifiers per record? The practical use case is to support a novel query in galah; right now we can easily return the number of records matching a query, or the number of species, but not the number of images. If there were a count of images per record, this would become much easier to calculate.
This would be useful also for assertions.
Keeping in mind the impact on SOLR performance.