Closed amarjandu closed 2 years ago
# Reproduction script: write two `projects` entities with the same parsed PFB
# schema, read them back, and check that both records expose the same set of
# keys under `object`.
import datetime
from typing import cast

import fastavro

from azul.plugins.metadata.hca.transform import FileTransformer
from azul.service.avro_pfb import pfb_schema_from_field_types

# Derive the PFB schema from the file transformer's field types and parse it
# once, so that both entities below are serialized against the same schema.
field_types = FileTransformer.field_types()
pfb_schema = pfb_schema_from_field_types(field_types)
parsed_schema = fastavro.parse_schema(cast(dict, pfb_schema))

# Project entity with a populated `accessions` list and midnight timestamps.
bad_entity = {
    'id': 'projects_90bf705c-d891-5ce2-aa54-094488b445c6',
    'name': 'projects',
    'object': {
        'document_id': ['90bf705c-d891-5ce2-aa54-094488b445c6'],
        'submission_date': datetime.datetime(
            2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc
        ),
        'update_date': datetime.datetime(
            2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc
        ),
        'project_title': [
            'The cellular immune response to COVID-19 deciphered by '
            'single cell multi-omics across three UK centres'
        ],
        'project_short_name': ['Covid19PBMC'],
        'laboratory': [''],
        'institutions': ['Newcastle University'],
        'publication_titles': [
            'The cellular immune response to COVID-19 deciphered by '
            'single cell multi-omics across three UK centres'
        ],
        'accessions': [
            {'domain': 'array_express', 'value': 'E-MTAB-10026'},
            {'domain': 'dbgap', 'value': 'phs001836'},
            {'domain': 'dbgap', 'value': 'phs001997.v1.p1'},
            {'domain': 'ega', 'value': 'EGAD00000000002'},
            {'domain': 'ega', 'value': 'EGAS00000000001'},
        ],
        'insdc_project_accessions': [''],
        'geo_series_accessions': [''],
        'array_express_accessions': ['E-MTAB-10026'],
        'insdc_study_accessions': [''],
        'supplementary_links': [''],
        '_type': ['project'],
        'project_description': [''],
        'contact_names': [''],
        'contributors': [],
        'publications': [],
    },
    'relations': [],
}

# Project entity with an empty `accessions` list and sub-second timestamps.
good_entity = {
    'id': 'projects_6615efae-fca8-4dd2-a223-9cfcf30fe94d',
    'name': 'projects',
    'object': {
        'document_id': ['6615efae-fca8-4dd2-a223-9cfcf30fe94d'],
        'submission_date': datetime.datetime(
            2018, 10, 10, 2, 23, 39, 569000, tzinfo=datetime.timezone.utc
        ),
        'update_date': datetime.datetime(
            2018, 10, 10, 2, 23, 47, 926000, tzinfo=datetime.timezone.utc
        ),
        # NOTE(review): this literal was split by a line break in the pasted
        # script; rejoined here on the assumption that the break was a
        # wrapping artifact and not part of the title.
        'project_title': [
            'Q4_DEMO-Single cell RNA-seq of primary human glioblastomas'
        ],
        'project_short_name': ['integration/Smart-seq2/2018-10-10T02:23:36Z'],
        'laboratory': [''],
        'institutions': ['Fake Institution'],
        'publication_titles': [''],
        'accessions': [],
        'insdc_project_accessions': [''],
        'geo_series_accessions': [''],
        'array_express_accessions': [''],
        'insdc_study_accessions': [''],
        'supplementary_links': [''],
        '_type': ['project'],
        'project_description': [''],
        'contact_names': [''],
        'contributors': [],
        'publications': [],
    },
    'relations': [],
}

# Write both entities to a scratch file; `good_entity` becomes record 0 and
# `bad_entity` record 1. `validator=True` asks fastavro to validate each
# record against the schema while writing.
with open('deleteThis', 'wb') as fp:
    fastavro.writer(
        fp,
        parsed_schema,
        [good_entity, bad_entity],
        validator=True,
    )

# Read every record back from the scratch file.
with open('deleteThis', 'rb') as fp:
    records = list(fastavro.reader(fp))

# Show which `object` keys record 0 has that record 1 lacks, then require the
# two key sets to be identical.
print(set(records[0]['object'].keys()) - set(records[1]['object'].keys()))
assert set(records[0]['object'].keys()) == set(records[1]['object'].keys())
{'laboratory', 'contact_names', 'publication_titles', 'insdc_project_accessions', 'contributors', 'accessions', 'publications', 'array_express_accessions', 'geo_series_accessions', '_type', 'project_title', 'supplementary_links', 'insdc_study_accessions', 'institutions', 'project_description', 'project_short_name'}
Traceback (most recent call last):
  File "/Users/amar/Library/Application Support/JetBrains/PyCharm2021.2/scratches/scratch_8.py", line 52, in <module>
    assert set(records[0]['object'].keys()) == set(records[1]['object'].keys())
AssertionError
Found while working on #3369. The expectation is that, for the same entity type, the writer consistently uses the relevant schema.
For demo, attempt to reproduce.
Found while working on #3369. The expectation is that, for the same entity type, the writer consistently uses the relevant schema.