DataBiosphere / azul

Metadata indexer and query service used for AnVIL, HCA, LungMAP, and CGP
Apache License 2.0
6 stars 2 forks source link

`fastavro.writer` heuristic for choosing record types can be defeated #3396

Closed amarjandu closed 2 years ago

amarjandu commented 3 years ago
import fastavro
import datetime
from typing import cast
from azul.plugins.metadata.hca.transform import FileTransformer
from azul.service.avro_pfb import pfb_schema_from_field_types

# Build the PFB Avro schema from the field types declared by the HCA
# FileTransformer, then parse it into the form fastavro's writer accepts.
field_types = FileTransformer.field_types()
pfb_schema = pfb_schema_from_field_types(field_types)
# cast() performs no runtime conversion; it only tells the type checker to
# treat the schema as a dict.
parsed_schema = fastavro.parse_schema(cast(dict, pfb_schema))

# A 'projects' entity that, according to the output reported with this script,
# is read back from the Avro file with only a few of its 'object' fields.
# Unlike good_entity, its 'accessions' list is non-empty and both dates are at
# midnight with no sub-second component.
bad_entity = {
    'id': 'projects_90bf705c-d891-5ce2-aa54-094488b445c6', 'name': 'projects',
    'object': {'document_id': ['90bf705c-d891-5ce2-aa54-094488b445c6'],
               'submission_date': datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc),
               'update_date': datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'project_title': [
            'The cellular immune response to COVID-19 deciphered by single cell multi-omics across three UK centres'],
               'project_short_name': ['Covid19PBMC'], 'laboratory': [''], 'institutions': ['Newcastle University'],
               'publication_titles': [
                   'The cellular immune response to COVID-19 deciphered by single cell multi-omics across three UK centres'],
               'accessions': [{'domain': 'array_express', 'value': 'E-MTAB-10026'},
                              {'domain': 'dbgap', 'value': 'phs001836'},
                              {'domain': 'dbgap', 'value': 'phs001997.v1.p1'},
                              {'domain': 'ega', 'value': 'EGAD00000000002'},
                              {'domain': 'ega', 'value': 'EGAS00000000001'}], 'insdc_project_accessions': [''],
               'geo_series_accessions': [''], 'array_express_accessions': ['E-MTAB-10026'],
               'insdc_study_accessions': [''], 'supplementary_links': [''], '_type': ['project'],
               'project_description': [''], 'contact_names': [''], 'contributors': [], 'publications': []},
    'relations': []}

# A second 'projects' entity with the same set of 'object' keys as bad_entity,
# used as the baseline for the comparison. Its 'accessions' list is empty.
good_entity = {
    'id': 'projects_6615efae-fca8-4dd2-a223-9cfcf30fe94d', 'name': 'projects',
    'object': {'document_id': ['6615efae-fca8-4dd2-a223-9cfcf30fe94d'],
               'submission_date': datetime.datetime(2018, 10, 10, 2, 23, 39, 569000, tzinfo=datetime.timezone.utc),
               'update_date': datetime.datetime(2018, 10, 10, 2, 23, 47, 926000, tzinfo=datetime.timezone.utc),
               'project_title': ['Q4_DEMO-Single cell RNA-seq of primary human glioblastomas'],
               'project_short_name': ['integration/Smart-seq2/2018-10-10T02:23:36Z'], 'laboratory': [''],
               'institutions': ['Fake Institution'], 'publication_titles': [''], 'accessions': [],
               'insdc_project_accessions': [''], 'geo_series_accessions': [''], 'array_express_accessions': [''],
               'insdc_study_accessions': [''], 'supplementary_links': [''], '_type': ['project'],
               'project_description': [''], 'contact_names': [''], 'contributors': [], 'publications': []},
    'relations': []}

# Write both entities (good first, bad second) to a single Avro file with
# validation enabled, then read them back and compare their 'object' fields.
with open('deleteThis', 'wb') as avro_out:
    fastavro.writer(avro_out, parsed_schema, [good_entity, bad_entity], validator=True)
with open('deleteThis', 'rb') as avro_in:
    # Materialize every record so both can be indexed below.
    records = list(fastavro.reader(avro_in))
    # Show the fields present in the first record read back but absent from
    # the second one.
    print(set(records[0]['object'].keys()) - set(records[1]['object'].keys()))
    # Both records were written as the same entity type, so they are expected
    # to come back with identical field names.
    assert set(records[0]['object'].keys()) == set(records[1]['object'].keys())
{'laboratory', 'contact_names', 'publication_titles', 'insdc_project_accessions', 'contributors', 'accessions', 'publications', 'array_express_accessions', 'geo_series_accessions', '_type', 'project_title', 'supplementary_links', 'insdc_study_accessions', 'institutions', 'project_description', 'project_short_name'}
Traceback (most recent call last):
  File "/Users/amar/Library/Application Support/JetBrains/PyCharm2021.2/scratches/scratch_8.py", line 52, in <module>
    assert set(records[0]['object'].keys()) == set(records[1]['object'].keys())
AssertionError

Found while working on #3369. The expectation is that, for records of the same entity type, the writer consistently uses the relevant schema.

hannes-ucsc commented 3 years ago

For demo, attempt to reproduce.