HDFGroup / hdf5

Official HDF5® Library Repository
Other
577 stars 236 forks source link

Segmentation violation with H5Lget_info with HDF5 1.14.4.3 #4644

Open abhibaruah opened 1 month ago

abhibaruah commented 1 month ago

I have a dataset named 'OBJECT_REFERENCES' of type H5T_REFERENCE. I have a workflow (derived from one of our tests) where I read the reference dataset and then call H5Rdereference and H5Lget_info on a subset of the reference dataset.

The code below works with HDF5 1.10.11 but with 1.14.4.3, I see a segmentation violation during the call to H5Lget_info. I see the segV for both H5Lget_info1 and H5Lget_info2 (in v1.14.4.3).

Can someone take a look at it and let me know if this is a bug, and if there is a workaround to revert to the non segV behavior?

Link to the h5 file for reproduction: https://mathworks-my.sharepoint.com/:u:/p/abaruah/EdkeRkHv9SZMgEDGfvULYR4BcaBMX8bI9Q-UfylNeqMyog?e=ulkR7K

#include "hdf5.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include "string.h"

#define FILE1           "h5treferences.h5"
#define DATASET1        "/OBJECT_REFERENCES"
#define DIM10            4
#define DIM11            8

int main() {
    hid_t           file, space, dset, dcpl;    /* Handles */
    herr_t          status;

    uint8_t* ref = new uint8_t[DIM10 * DIM11];
    /*
     * Create a new file using the default properties.
     */
    file = H5Fopen(FILE1, H5F_ACC_RDONLY, H5P_DEFAULT);

    /*
     * Create the chunked dataset.
     */
    dset = H5Dopen(file, DATASET1, H5P_DEFAULT);

    hid_t  h_type = H5Dget_type(dset);
    hid_t my_file_space_id = H5Dget_space(dset);
    hid_t my_mem_space_id = H5Dget_space(dset);
    hid_t xfer_plist_id = H5Pcreate(H5P_DATASET_XFER);

    //dset = H5Dcreate1 (file, DATASET, H5T_NATIVE_DOUBLE, space, dcpl);

    /*
     * Write the data to the dataset.
     */
    status = H5Dread(dset, h_type, my_mem_space_id, my_file_space_id, xfer_plist_id,
        ref);
    std::cout << "Read Status: " << status <<std::endl;

    //for (int i = 0; i<32; i++){
    //  std::cout << (int)*(ref + i) <<std::endl;
    //} 
    uint8_t* ref_subset = &ref[3*DIM11];

    //for (int i = 0; i<8; i++){
    //  std::cout << (int)*(ref_subset + i) <<std::endl;
    //}

    H5R_type_t ref_type = H5R_OBJECT;

    ssize_t name_len = H5Rget_name(file,ref_type,ref_subset,NULL,0);

    if ( name_len < 0 ) {
        std::cout << "Error 1" << std::endl;
    }

    std::string name;
    name.resize(name_len+1);
    name_len = H5Rget_name(file,ref_type,ref_subset,&name[0],name_len+1);
    name.resize(name_len);

    H5T_cset_t cset = H5T_CSET_ASCII;

    if (name != "/") {
        // Get the object's ID to determine its encoding
        hid_t obj_id = H5Rdereference1(file, ref_type, ref_subset);
        if ( obj_id >= 0 ) {
            std::cout << "H5Rdereference1 success!" <<std::endl;
        }

        // Get link info to know how to interpret name
        H5L_info_t linfo;
        herr_t status = H5Lget_info(obj_id, name.c_str(), &linfo, H5P_DEFAULT);
        if ( status >= 0 ) {
            std::cout << "H5Lget_info success!" <<std::endl;
        }
        cset = linfo.cset;

        // Don't leak resource
        status = H5Oclose(obj_id);
        if ( status >= 0 ) {
            std::cout << "H5Oclose success!" <<std::endl; // Shouldn't happen
        }
    }

    status = H5Dclose(dset);
    status = H5Fclose(file);

    std::cout << "Success!!" <<std::endl;
    delete [] ref;

    return 0;
}
mattjala commented 1 month ago

I'm able to replicate this crash. The root cause likely has something to do with trying to use the old file format, as running it with the debug library and valgrind shows that H5Lget_info2() fails at H5VL_set_vol_wrapper()'s assertion that vol_obj->data exists.

Adaptation of the test script for the C API:


#include "hdf5.h"
#include <stdio.h>
#include <stdlib.h>
#include "string.h"

#define FILE1 "h5treferences.h5"
#define DATASET1 "/OBJECT_REFERENCES"
#define DIM10 4
#define DIM11 8

int main() {
  hid_t file, space, dset, dcpl; /* Handles */
  herr_t status;

  uint8_t ref[DIM10 * DIM11];

  /*
   * Create a new file using the default properties.
   */
  file = H5Fopen(FILE1, H5F_ACC_RDONLY, H5P_DEFAULT);

  /*
   * Create the chunked dataset.
   */
  dset = H5Dopen(file, DATASET1, H5P_DEFAULT);

  hid_t h_type = H5Dget_type(dset);
  hid_t my_file_space_id = H5Dget_space(dset);
  hid_t my_mem_space_id = H5Dget_space(dset);
  hid_t xfer_plist_id = H5Pcreate(H5P_DATASET_XFER);

  /*
   * Write the data to the dataset.
   */
  status = H5Dread(dset, h_type, my_mem_space_id, my_file_space_id, xfer_plist_id, ref);

  size_t type_size = H5Tget_size(h_type);

  printf("Type size: %zu\n", type_size);
  printf("Read Status: %d\n", status);

  uint8_t *ref_subset = &ref[3 * DIM11];

  H5R_type_t ref_type = H5R_OBJECT;
  ssize_t name_len = H5Rget_name(file, ref_type, ref_subset, NULL, 0);

  if (name_len < 0) {
    printf("Error 1\n");
  }

  char *name = NULL;
  name_len = H5Rget_name(file, ref_type, ref_subset, &name[0], name_len + 1);

  printf("Allocating %zu bytes for name\n", name_len + 1);
  name = malloc(name_len + 1);
  name_len = H5Rget_name(file, ref_type, ref_subset, &name[0], name_len + 1);
  printf("name = %s\n", name);

  H5T_cset_t cset = H5T_CSET_ASCII;

  if (name != "/") {
    // Get the object's ID to determine its encoding
    hid_t obj_id = H5Rdereference(file, H5P_DEFAULT, ref_type, ref_subset);
    if (obj_id >= 0) {
      printf("H5Rdereference1 success! \n");
    }

    // Get link info to know how to interpret name
    H5L_info_t linfo;
    herr_t status = H5Lget_info2(obj_id, name, &linfo, H5P_DEFAULT);
    if (status >= 0) {
      printf("H5Lget_info success! \n");
    }
    cset = linfo.cset;

    // Don't leak resource
    status = H5Oclose(obj_id);
    if (status >= 0) {
      printf("H5Oclose success! (Shouldn't happen) \n");
    }
  }

  status = H5Dclose(dset);
  status = H5Fclose(file);

  printf("Success!! \n");

  free(name);
  return 0;
}
bmribler commented 1 month ago

@abhibaruah I believe you need to use "ref" like this: hobj_ref_t ref_obj; <- I just used a different name for distinction ref_obj = new hobj_ref_t[sizeof(hobj_ref_t) DIM10];

instead of this: uint8_t ref = new uint8_t[DIM10 DIM11];

Then you pass ref_obj into H5Dread and pass &ref_obj[0] (or 1 or 2) into H5Rget_name. Please let us know if that works.

abhibaruah commented 1 month ago

Thanks @bmribler. I replaced uint8_t ref = new uint8_t[DIM10 DIM11];

with

hobj_ref_t ref; ref = new hobj_ref_t[sizeof(hobj_ref_t) DIM10];

and

uint8_t ref_subset = &ref[3DIM11]; with hobj_ref_t* ref_subset = &ref[3];

However, I am still seeing the segmentation fault.

My code:

#include "hdf5.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include "string.h"

#define FILE1           "h5treferences.h5"
#define DATASET1        "/OBJECT_REFERENCES"
#define DIM10            4
#define DIM11            8

int main() {
    hid_t           file, space, dset, dcpl;    /* Handles */
    herr_t          status;

    //uint8_t* ref = new uint8_t[DIM10 * DIM11];
    hobj_ref_t *ref;
    ref = new hobj_ref_t[sizeof(hobj_ref_t) * DIM10];

    /*
     * Create a new file using the default properties.
     */
    file = H5Fopen(FILE1, H5F_ACC_RDONLY, H5P_DEFAULT);

    /*
     * Create the chunked dataset.
     */
    dset = H5Dopen(file, DATASET1, H5P_DEFAULT);

    hid_t  h_type = H5Dget_type(dset);
    hid_t my_file_space_id = H5Dget_space(dset);
    hid_t my_mem_space_id = H5Dget_space(dset);
    hid_t xfer_plist_id = H5Pcreate(H5P_DATASET_XFER);

    //dset = H5Dcreate1 (file, DATASET, H5T_NATIVE_DOUBLE, space, dcpl);

    /*
     * Write the data to the dataset.
     */
    status = H5Dread(dset, h_type, my_mem_space_id, my_file_space_id, xfer_plist_id,
        ref);
    std::cout << "Read Status: " << status <<std::endl;

    //for (int i = 0; i<32; i++){
    //  std::cout << (int)*(ref + i) <<std::endl;
    //} 
    //uint8_t* ref_subset = &ref[2*DIM11];
    hobj_ref_t* ref_subset = &ref[3];

    //for (int i = 0; i<8; i++){
    //  std::cout << (int)*(ref_subset + i) <<std::endl;
    //}

    H5R_type_t ref_type = H5R_OBJECT;

    ssize_t name_len = H5Rget_name(file,ref_type,ref_subset,NULL,0);

    if ( name_len < 0 ) {
        std::cout << "Error 1" << std::endl;
    }

    std::string name;
    name.resize(name_len+1);
    name_len = H5Rget_name(file,ref_type,ref_subset,&name[0],name_len+1);
    name.resize(name_len);

    H5T_cset_t cset = H5T_CSET_ASCII;

    if (name != "/") {
        // Get the object's ID to determine its encoding
        hid_t obj_id = H5Rdereference1(file, ref_type, ref_subset);
        if ( obj_id >= 0 ) {
            std::cout << "H5Rdereference1 success!" <<std::endl;
        }
        std::cout << "Name: " << name <<std::endl;
        // Get link info to know how to interpret name
        H5L_info_t linfo;
        herr_t status = H5Lget_info(obj_id, name.c_str(), &linfo, H5P_DEFAULT);
        if ( status >= 0 ) {
            std::cout << "H5Lget_info success!" <<std::endl;
        }
        cset = linfo.cset;

        // Don't leak resource
        status = H5Oclose(obj_id);
        if ( status >= 0 ) {
            std::cout << "H5Oclose success!" <<std::endl; // Shouldn't happen
        }
    }

    status = H5Dclose(dset);
    status = H5Fclose(file);

    std::cout << "Success!!" <<std::endl;
    delete [] ref;

    return 0;
}
bmribler commented 1 month ago

Hi @abhibaruah, It appeared that the segfault happened here: herr_t status = H5Lget_info(obj_id, name.c_str(), &linfo, H5P_DEFAULT); when obj_id, which is a datatype ID in this case, was passed in.

I'm aware that the documentation says: "The identifier may be that of a file, group, dataset, named datatype, or attribute." However, it doesn't seem to work properly. While I'm getting advice from the developers, you can by pass this segfault by passing in the file ID for loc_id instead. Please note that the obj_id (aka datatype ID) can still be passed in to other APIs such as H5Tget_class, H5Tget_size, H5Oget_info2, etc...

abhibaruah commented 1 month ago

Thanks @bmribler. Replacing obj_id withthe file ID helped to resolve the segV and work around it.

bmribler commented 3 days ago

FYI, @abhibaruah, this issue has been fixed (PR #4733)