usnistgov / h5wasm

A WebAssembly HDF5 reader/writer library
Other
87 stars 12 forks source link

Add `strpad` to string dtype metadata #72

Closed axelboc closed 7 months ago

axelboc commented 7 months ago

We have a request to expose the padding information for string dtypes in H5Web: https://github.com/silx-kit/h5web/discussions/1612

bmaranville commented 7 months ago

Agreed that strings are tricky - this seems like it will be easy to do and I'm happy to implement.

bmaranville commented 7 months ago

How did I not notice that you already implemented this? I created another PR, which I will delete... On the plus side, our implementations were essentially the same so I guess we agree on the approach.

bmaranville commented 7 months ago

@axelboc - would you be ok with making cset and strpad optional in the Metadata interface, and then omitting those properties for non-string dtypes? Including them is cluttering up the objects for non-string dtypes for no good reason I can think of.

diff --git a/src/hdf5_util.cc b/src/hdf5_util.cc
index 59a8698..34e9672 100644
--- a/src/hdf5_util.cc
+++ b/src/hdf5_util.cc
@@ -263,11 +263,6 @@ val get_dtype_metadata(hid_t dtype)
         attr.set("cset", (int)(H5Tget_cset(dtype)));
         attr.set("strpad", (int)(H5Tget_strpad(dtype)));
     }
-    else
-    {
-        attr.set("cset", -1);
-        attr.set("strpad", -1);
-    }

     if (dtype_class == H5T_COMPOUND)
     {
@@ -1320,6 +1315,12 @@ EMSCRIPTEN_BINDINGS(hdf5)
         .value("H5T_VLEN", H5T_VLEN)           //      = 9,  /**< variable-Length types                   */
         .value("H5T_ARRAY", H5T_ARRAY)         //     = 10, /**< array types                             */
         ;
+    enum_<H5T_str_t>("H5T_str_t")
+        .value("H5T_STR_ERROR", H5T_STR_ERROR) // = -1, /**<error                                      */
+        .value("H5T_STR_NULLTERM", H5T_STR_NULLTERM) // = 0, /**<null-terminated                           */
+        .value("H5T_STR_NULLPAD", H5T_STR_NULLPAD) // = 1, /**<pad with nulls                            */
+        .value("H5T_STR_SPACEPAD", H5T_STR_SPACEPAD) // = 2, /**<pad with spaces                           */
+        ;

     //constant("H5L_type_t", H5L_type_t);
     // FILE ACCESS
diff --git a/src/hdf5_util_helpers.d.ts b/src/hdf5_util_helpers.d.ts
index 822b4a5..78d9aa8 100644
--- a/src/hdf5_util_helpers.d.ts
+++ b/src/hdf5_util_helpers.d.ts
@@ -17,12 +17,19 @@ export interface H5T_class_t {
     H5T_ARRAY: {value: 10}         // = 10, /**< array types                             */
 }

+export interface H5T_str_t {
+    H5T_STR_ERROR: {value: -1},    // = -1, /**< error */
+    H5T_STR_NULLTERM: {value: 0},  // = 0, /**< null-terminated, null-padding */
+    H5T_STR_NULLPAD: {value: 1},   // = 1, /**< null-terminated, null-padding */
+    H5T_STR_SPACEPAD: {value: 2},  // = 2, /**< space-padding */
+    H5T_STR_RESERVED_3: {value: 3} // = 3  /**< reserved for future use */
+}
+
 export interface Metadata {
     array_type?: Metadata,
     chunks: number[] | null,
     compound_type?: CompoundTypeMetadata,
-    cset: number,
-    strpad: number,
+    cset?: number,
     enum_type?: EnumTypeMetadata,
     littleEndian: boolean,
     maxshape: number[] | null,
@@ -30,6 +37,7 @@ export interface Metadata {
     shape: number[] | null,
     signed: boolean,
     size: number,
+    strpad?: H5T_str_t,
     total_size: number,
     type: number,
     vlen: boolean,
diff --git a/test/bool_test.mjs b/test/bool_test.mjs
index 955bda8..ef27ba6 100644
--- a/test/bool_test.mjs
+++ b/test/bool_test.mjs
@@ -14,8 +14,6 @@ async function bool_test() {

   assert.deepEqual(f.get('bool').metadata, {
     chunks: null,
-    cset: -1,
-    strpad: -1,
     enum_type: {
       type: 0,
       nmembers: 2,
diff --git a/test/compound_and_array_test.mjs b/test/compound_and_array_test.mjs
index 913bbf3..c728531 100644
--- a/test/compound_and_array_test.mjs
+++ b/test/compound_and_array_test.mjs
@@ -67,8 +67,6 @@ async function compound_array_test() {
       members: [
         {
           array_type: {
-            cset: -1,
-            strpad: -1,
             shape: [2, 2],
             littleEndian: true,
             signed: false,
@@ -77,8 +75,6 @@ async function compound_array_test() {
             type: 1,
             vlen: false,
           },
-          cset: -1,
-          strpad: -1,
           littleEndian: true,
           name: 'floaty',
           offset: 0,
@@ -91,17 +87,15 @@ async function compound_array_test() {
         {
           array_type: {
             cset: 0,
-            strpad: 1,
             shape: [2, 2],
             littleEndian: false,
             signed: false,
             size: 5,
+            strpad: 1,
             total_size: 4,
             type: 3,
             vlen: false,
           },
-          cset: -1,
-          strpad: -1,
           littleEndian: false,
           name: 'stringy',
           offset: 32,
@@ -114,8 +108,6 @@ async function compound_array_test() {
       ],
       nmembers: 2
     },
-    cset: -1,
-    strpad: -1,
     littleEndian: true,
     maxshape: [2],
     shape: [2],
axelboc commented 7 months ago

How did I not notice that you already implemented this? I created another PR, which I will delete...

Haha I did wonder. The PR description was formulated more like an issue, sorry. :sweat_smile: You could have kept your PR and closed this one, I would not have been offended :wink:

I'll push the diff above and maybe add an enum for cset. Totally agree with the approach of setting cset only when relevant. :100:

axelboc commented 7 months ago

In the end, I decided not to add the two enums for the time being. They're not technically needed in hdf5_util.cc (unlike H5T_class_t), and it didn't feel right having to redeclare them manually in hdf5_util_helpers.d.ts as strange enum-like interfaces. Would be good if this declaration file could be generated automatically with emcc --embind-emit-tsd, but this feels like a can of worms. :sweat_smile:

bmaranville commented 7 months ago

I want the end-user to have some way of interpreting the enums for cset and strpad, but I don't know what the most straightforward way to do that is. Using a single source of truth (the exported values from the C++ lib) seemed like a good idea, but the implementation is awkward. I agree that for the moment, just getting the correct functionality into the library is the most important.

bmaranville commented 7 months ago

I also haven't figured out how to get --embind-emit-tsd to work... it's a reasonably new feature, maybe it will get easier.