HDFGroup / hdf5-json

Specification and tools for representing HDF5 in JSON
https://hdf5-json.readthedocs.io
Other
72 stars 25 forks source link

Investigate the source of roundtrip JSON -> HDF5 -> JSON difference #37

Closed ghost closed 2 years ago

ghost commented 8 years ago

This is the input HDF5/JSON:

{
    "apiVersion": "1.0.0",
    "datasets": {
        "6497e74e-6e8e-4290-bee9-535f5a66f665": {
            "alias": [
                "/a"
            ],
            "attributes": [
                {
                    "creationProperties": {
                        "nameCharEncoding": "H5T_CSET_UTF8"
                    },
                    "description": "",
                    "id": "d2922e88-a7c1-4013-bce5-b6f2f78baa36",
                    "name": "DIMENSION_LIST",
                    "shape": {
                        "class": "H5S_SIMPLE",
                        "dims": [
                            1
                        ],
                        "maxdims": [
                            1
                        ]
                    },
                    "type": {
                        "base": {
                            "base": "H5T_STD_REF_OBJ",
                            "class": "H5T_REFERENCE"
                        },
                        "class": "H5T_VLEN"
                    },
                    "value": [
                        [
                            "datasets/edd5c2fe-db6d-4916-9e68-069c4df80005"
                        ]
                    ]
                }
            ],
            "creationProperties": {
                "layout": {
                    "class": "H5D_CONTIGUOUS"
                }
            },
            "description": "",
            "shape": {
                "class": "H5S_SIMPLE",
                "dims": [
                    0,
                    3
                ],
                "maxdims": [
                    "H5S_UNLIMITED",
                    3
                ]
            },
            "type": {
                "base": "H5T_IEEE_F32LE",
                "class": "H5T_FLOAT"
            }
        },
        "edd5c2fe-db6d-4916-9e68-069c4df80005": {
            "alias": [
                "/m"
            ],
            "attributes": [
                {
                    "creationProperties": {
                        "nameCharEncoding": "H5T_CSET_UTF8"
                    },
                    "description": "",
                    "id": "3d1ad24b-514c-41d8-b019-fe89933c7505",
                    "name": "CLASS",
                    "shape": {
                        "class": "H5S_SCALAR"
                    },
                    "type": {
                        "charSet": "H5T_CSET_ASCII",
                        "class": "H5T_STRING",
                        "length": 16,
                        "strPad": "H5T_STR_NULLTERM"
                    },
                    "value": "DIMENSION_SCALE"
                },
                {
                    "creationProperties": {
                        "nameCharEncoding": "H5T_CSET_UTF8"
                    },
                    "description": "",
                    "id": "dee3c381-0f35-4d84-8b34-71aa8c8deb21",
                    "name": "REFERENCE_LIST",
                    "shape": {
                        "class": "H5S_SIMPLE",
                        "dims": [
                            1
                        ],
                        "maxdims": [
                            1
                        ]
                    },
                    "type": {
                        "class": "H5T_COMPOUND",
                        "fields": [
                            {
                                "name": "dataset",
                                "type": {
                                    "base": "H5T_STD_REF_OBJ",
                                    "class": "H5T_REFERENCE"
                                }
                            },
                            {
                                "name": "index",
                                "type": {
                                    "base": "H5T_STD_I32LE",
                                    "class": "H5T_INTEGER"
                                }
                            }
                        ]
                    },
                    "value": [
                        [
                            "datasets/6497e74e-6e8e-4290-bee9-535f5a66f665",
                            0
                        ]
                    ]
                }
            ],
            "creationProperties": {
                "layout": {
                    "class": "H5D_CONTIGUOUS"
                }
            },
            "description": "",
            "shape": {
                "class": "H5S_SIMPLE",
                "dims": [
                    0
                ],
                "maxdims": [
                    "H5S_UNLIMITED"
                ]
            },
            "type": {
                "base": "H5T_IEEE_F32LE",
                "class": "H5T_FLOAT"
            }
        }
    },
    "groups": {
        "31c7d987-47ce-4a03-92f7-2d4b3f0e5fb5": {
            "alias": [
                "/"
            ],
            "attributes": [],
            "description": "Group: /",
            "links": [
                {
                    "class": "H5L_TYPE_HARD",
                    "collection": "datasets",
                    "id": "6497e74e-6e8e-4290-bee9-535f5a66f665",
                    "title": "a"
                },
                {
                    "class": "H5L_TYPE_HARD",
                    "collection": "datasets",
                    "id": "edd5c2fe-db6d-4916-9e68-069c4df80005",
                    "title": "m"
                }
            ]
        }
    },
    "id": "31c7d987-47ce-4a03-92f7-2d4b3f0e5fb5",
    "root": "31c7d987-47ce-4a03-92f7-2d4b3f0e5fb5"
}
jreadey commented 8 years ago

The "DIMENSION_LIST" attribute of dataset '/a' looks like the following in the input file above:

{
                    "creationProperties": {
                        "nameCharEncoding": "H5T_CSET_UTF8"
                    },
                    "description": "",
                    "id": "d2922e88-a7c1-4013-bce5-b6f2f78baa36",
                    "name": "DIMENSION_LIST",
                    "shape": {
                        "class": "H5S_SIMPLE",
                        "dims": [
                            1
                        ],
                        "maxdims": [
                            1
                        ]
                    },
                    "type": {
                        "base": {
                            "base": "H5T_STD_REF_OBJ",
                            "class": "H5T_REFERENCE"
                        },
                        "class": "H5T_VLEN"
                    },
                    "value": [
                        [
                            "datasets/edd5c2fe-db6d-4916-9e68-069c4df80005"
                        ]
                    ]
                }

The same attribute after converting to HDF5 and back to JSON:

{
                    "name": "DIMENSION_LIST", 
                    "shape": {
                        "class": "H5S_SIMPLE", 
                        "dims": [
                            2
                        ]
                    }, 
                    "type": {
                        "base": {
                            "base": "H5T_STD_REF_OBJ", 
                            "class": "H5T_REFERENCE"
                        }, 
                        "class": "H5T_VLEN"
                    }, 
                    "value": [
                        [
                            "datasets/bb44ec80-5fbd-11e5-a1db-3c15c2da029e"
                        ], 
                        []
                    ]
                }

It looks like the dataspace got extended by 1 and a null value added to the end. This is happening on conversion to hdf5. From output of h5dump:

ATTRIBUTE "DIMENSION_LIST" {
         DATATYPE  H5T_VLEN { H5T_REFERENCE { H5T_STD_REF_OBJECT }}
         DATASPACE  SIMPLE { ( 2 ) / ( 2 ) }
         DATA {
         (0): (DATASET 2438 /m ), ()
         }
      }

I'll look into this.

What's up with the 'description' key? That is not in the JSON spec.

Also, I have a question about creationProperties. Attribute creationProperties are not supported currently in jsontoh5.py. I'll open a separate issue about this.

ghost commented 8 years ago

> What's up with the 'description' key? That is not in the JSON spec.

It is an extra key added for Product Designer. A user can add textual description to any element.

jreadey commented 8 years ago

I think this is working as designed... Since the dataset the dimension scale is attached to is 2D, the dimension list has two placeholders, one per dataset dimension. If only one scale is attached, the other spot remains empty.

If you run the following Python sample:

import h5py
import numpy as np

f = h5py.File("mydimscale2d.h5", "w")
dset = f.create_dataset('temperatures', (10,10), dtype='f')
f.create_dataset('scale_x', data=np.arange(10)*10e3)
dset.dims.create_scale(f['scale_x'], "x axis")
dset.dims[0].attach_scale(f['scale_x'])
f.close()

and look at the h5dump output, you'll see there's an empty element in the DIMENSION_LIST.

@ajelenak-thg - what do you think?

ghost commented 8 years ago

I agree. It makes sense to have a dimension list placeholder for each dataset dimension even when it is not used.

jreadey commented 8 years ago

Were there any other roundtrip differences in the JSON above that should be investigated?

ghost commented 8 years ago

Not sure, perhaps Joe Lee can chime in. (Cannot @mention him in this repo?)

hyoklee commented 8 years ago

I don't see any other difference. Can you fix jsontoh5 to allow extra [] in DIMENSION_LIST value so that HPD server doesn't throw the following error message?

hpdws.publish():500
{"link": [{"href": "https://hpd-ws.herokuapp.com/template/ec06b81c-4538-48ad-b152-e905adc7f633/publish", "rel": "retry", "title": "Template file"}, {"href": "https://hpd-ws.herokuapp.com/entity/ec06b81c-4538-48ad-b152-e905adc7f633", "rel": "version", "title": "test_unlmited3 entity version HEAD"}, {"href": "https://hpd-ws.herokuapp.com/project/Joe", "rel": "project", "title": "Project Joe"}], "message": "[Errno 22] Invalid dimension list value"}
hyoklee commented 8 years ago

Also, should HPD server throw an error if DIMENSION_LIST size doesn't match the rank of dataset?

jreadey commented 8 years ago

@hyoklee - I'm not sure what change you are asking for in jsontoh5... currently the h5 output of DIMENSION_LIST will have n elements (where n is the rank of the parent dataset), regardless of the number of values in the dimension list.

ghost commented 8 years ago

For the record, this is the HDF5/JSON that causes the error reported in Joe's comment.

{
    "apiVersion": "1.0.0",
    "datasets": {
        "fd252e22-6231-11e5-b806-ec814d819d13": {
            "alias": [
                "/a"
            ],
            "attributes": [
                {
                    "creationProperties": {
                        "nameCharEncoding": "H5T_CSET_UTF8"
                    },
                    "description": "",
                    "id": "410885b7-7bef-47a3-9c32-8bfcb1aa7340",
                    "name": "DIMENSION_LIST",
                    "shape": {
                        "class": "H5S_SIMPLE",
                        "dims": [
                            2
                        ],
                        "maxdims": [
                            2
                        ]
                    },
                    "type": {
                        "base": {
                            "base": "H5T_STD_REF_OBJ",
                            "class": "H5T_REFERENCE"
                        },
                        "class": "H5T_VLEN"
                    },
                    "value": [
                        [
                            "datasets/fd252e24-6231-11e5-8986-ec814d819d13"
                        ],
                        []
                    ]
                }
            ],
            "creationProperties": {
                "fillValue": 0.0,
                "layout": {
                    "class": "H5D_CHUNKED",
                    "dims": [
                        512,
                        3
                    ]
                }
            },
            "description": "",
            "shape": {
                "class": "H5S_SIMPLE",
                "dims": [
                    0,
                    3
                ],
                "maxdims": [
                    "H5S_UNLIMITED",
                    3
                ]
            },
            "type": {
                "base": "H5T_IEEE_F32LE",
                "class": "H5T_FLOAT"
            }
        },
        "fd252e23-6231-11e5-8cdb-ec814d819d13": {
            "alias": [
                "/b"
            ],
            "attributes": [
                {
                    "creationProperties": {
                        "nameCharEncoding": "H5T_CSET_UTF8"
                    },
                    "description": "",
                    "id": "f7e49a91-5480-4849-ac46-c67a342aaec7",
                    "name": "DIMENSION_LIST",
                    "shape": {
                        "class": "H5S_SIMPLE",
                        "dims": [
                            2
                        ],
                        "maxdims": [
                            2
                        ]
                    },
                    "type": {
                        "base": {
                            "base": "H5T_STD_REF_OBJ",
                            "class": "H5T_REFERENCE"
                        },
                        "class": "H5T_VLEN"
                    },
                    "value": [
                        [
                            "datasets/fd252e25-6231-11e5-8996-ec814d819d13"
                        ],
                        []
                    ]
                }
            ],
            "creationProperties": {
                "fillValue": 0.0,
                "layout": {
                    "class": "H5D_CHUNKED",
                    "dims": [
                        128,
                        32
                    ]
                }
            },
            "description": "",
            "shape": {
                "class": "H5S_SIMPLE",
                "dims": [
                    0,
                    128
                ],
                "maxdims": [
                    "H5S_UNLIMITED",
                    128
                ]
            },
            "type": {
                "base": "H5T_IEEE_F32LE",
                "class": "H5T_FLOAT"
            }
        },
        "fd252e24-6231-11e5-8986-ec814d819d13": {
            "alias": [
                "/m"
            ],
            "attributes": [
                {
                    "creationProperties": {
                        "nameCharEncoding": "H5T_CSET_UTF8"
                    },
                    "description": "",
                    "id": "c3b3d62d-432b-47de-862b-70b68e5499ea",
                    "name": "CLASS",
                    "shape": {
                        "class": "H5S_SCALAR"
                    },
                    "type": {
                        "charSet": "H5T_CSET_ASCII",
                        "class": "H5T_STRING",
                        "length": 16,
                        "strPad": "H5T_STR_NULLTERM"
                    },
                    "value": "DIMENSION_SCALE"
                },
                {
                    "creationProperties": {
                        "nameCharEncoding": "H5T_CSET_UTF8"
                    },
                    "description": "",
                    "id": "e5c31c94-d1b4-440f-ae8d-e6de15d80dd8",
                    "name": "REFERENCE_LIST",
                    "shape": {
                        "class": "H5S_SIMPLE",
                        "dims": [
                            1
                        ],
                        "maxdims": [
                            1
                        ]
                    },
                    "type": {
                        "class": "H5T_COMPOUND",
                        "fields": [
                            {
                                "name": "dataset",
                                "type": {
                                    "base": "H5T_STD_REF_OBJ",
                                    "class": "H5T_REFERENCE"
                                }
                            },
                            {
                                "name": "index",
                                "type": {
                                    "base": "H5T_STD_I32LE",
                                    "class": "H5T_INTEGER"
                                }
                            }
                        ]
                    },
                    "value": [
                        [
                            "datasets/fd252e22-6231-11e5-b806-ec814d819d13",
                            0
                        ]
                    ]
                }
            ],
            "creationProperties": {
                "fillValue": 0.0,
                "layout": {
                    "class": "H5D_CHUNKED",
                    "dims": [
                        1024
                    ]
                }
            },
            "description": "",
            "shape": {
                "class": "H5S_SIMPLE",
                "dims": [
                    0
                ],
                "maxdims": [
                    "H5S_UNLIMITED"
                ]
            },
            "type": {
                "base": "H5T_IEEE_F32LE",
                "class": "H5T_FLOAT"
            }
        },
        "fd252e25-6231-11e5-8996-ec814d819d13": {
            "alias": [
                "/n"
            ],
            "attributes": [
                {
                    "creationProperties": {
                        "nameCharEncoding": "H5T_CSET_UTF8"
                    },
                    "description": "",
                    "id": "0589b03a-9d61-45e5-bc36-4e8a45337e0f",
                    "name": "CLASS",
                    "shape": {
                        "class": "H5S_SCALAR"
                    },
                    "type": {
                        "charSet": "H5T_CSET_ASCII",
                        "class": "H5T_STRING",
                        "length": 16,
                        "strPad": "H5T_STR_NULLTERM"
                    },
                    "value": "DIMENSION_SCALE"
                },
                {
                    "creationProperties": {
                        "nameCharEncoding": "H5T_CSET_UTF8"
                    },
                    "description": "",
                    "id": "41866844-f20c-4131-846b-aeb4bc6e5846",
                    "name": "REFERENCE_LIST",
                    "shape": {
                        "class": "H5S_SIMPLE",
                        "dims": [
                            1
                        ],
                        "maxdims": [
                            1
                        ]
                    },
                    "type": {
                        "class": "H5T_COMPOUND",
                        "fields": [
                            {
                                "name": "dataset",
                                "type": {
                                    "base": "H5T_STD_REF_OBJ",
                                    "class": "H5T_REFERENCE"
                                }
                            },
                            {
                                "name": "index",
                                "type": {
                                    "base": "H5T_STD_I32LE",
                                    "class": "H5T_INTEGER"
                                }
                            }
                        ]
                    },
                    "value": [
                        [
                            "datasets/fd252e23-6231-11e5-8cdb-ec814d819d13",
                            0
                        ]
                    ]
                }
            ],
            "creationProperties": {
                "fillValue": 0.0,
                "layout": {
                    "class": "H5D_CHUNKED",
                    "dims": [
                        1024
                    ]
                }
            },
            "description": "",
            "shape": {
                "class": "H5S_SIMPLE",
                "dims": [
                    0
                ],
                "maxdims": [
                    "H5S_UNLIMITED"
                ]
            },
            "type": {
                "base": "H5T_IEEE_F32LE",
                "class": "H5T_FLOAT"
            }
        }
    },
    "groups": {
        "ec06b81c-4538-48ad-b152-e905adc7f633": {
            "alias": [
                "/"
            ],
            "attributes": [],
            "description": "Group: /",
            "links": [
                {
                    "class": "H5L_TYPE_HARD",
                    "collection": "datasets",
                    "id": "fd252e22-6231-11e5-b806-ec814d819d13",
                    "title": "a"
                },
                {
                    "class": "H5L_TYPE_HARD",
                    "collection": "datasets",
                    "id": "fd252e23-6231-11e5-8cdb-ec814d819d13",
                    "title": "b"
                },
                {
                    "class": "H5L_TYPE_HARD",
                    "collection": "datasets",
                    "id": "fd252e24-6231-11e5-8986-ec814d819d13",
                    "title": "m"
                },
                {
                    "class": "H5L_TYPE_HARD",
                    "collection": "datasets",
                    "id": "fd252e25-6231-11e5-8996-ec814d819d13",
                    "title": "n"
                }
            ]
        }
    },
    "id": "ec06b81c-4538-48ad-b152-e905adc7f633",
    "root": "ec06b81c-4538-48ad-b152-e905adc7f633"
}
ghost commented 8 years ago

@hyoklee

> should HPD server throw an error if DIMENSION_LIST size doesn't match the rank of dataset?

What is the JSON the app sends to the server when the DIMENSION_LIST attribute is created? What JSON is sent when an additional dimension scale is attached to the dataset?

hyoklee commented 8 years ago

created: "value": [[ "datasets/fd252e24-6231-11e5-8986-ec814d819d13"]]

attached: "value": [[ "datasets/fd252e24-6231-11e5-8986-ec814d819d13"], []]

If I remove [] manually using HPD, I can create the same template file.

ajelenak commented 2 years ago

Closing as not relevant any more.