tbeu / matio

MATLAB MAT File I/O Library
https://matio.sourceforge.io
BSD 2-Clause "Simplified" License
330 stars 97 forks source link

A problem with corrupted files #216

Open Ri0n opened 10 months ago

Ri0n commented 10 months ago

There is another thing I discovered with corrupted files.

matvar->data = calloc(nelems_x_nfields, matvar->data_size);

code like this may try to allocate enormous amount of memory. And I have an corrupted files which causes just this.

I thought to quickly fix it by adding the file size in mat_t struct and change all the allocations to check it. But since the file or some variable can be compressed it's hard to say what's the max size. Of course it's possible to use a multiplier to find approximate max of the max, but still it's sort of ugly. Another idea would be to introduce a sort of compile-time constant which will be checked on every allocation, but probably this option is even worse.

more complex solution would be

Ri0n commented 10 months ago

ah I just realized the line above allocates nelems_x_nfields of pointers to variables and for the actual data. Then it's definitely can be limited, most likely even in runtime.

Ri0n commented 10 months ago

Just an example which works for me.

diff --git a/src/mat.c b/src/mat.c
index 9c48578..20370db 100644
--- a/src/mat.c
+++ b/src/mat.c
@@ -506,6 +506,7 @@ Mat_Open(const char *matname, int mode)
     mat->version = 0;
     mat->byteswap = 0;
     mat->num_datasets = 0;
+    mat->struct_vars_limit = 0;
 #if defined(MAT73) && MAT73
     mat->refs_id = -1;
 #endif
@@ -2967,3 +2968,17 @@ Mat_VarWriteAppend(mat_t *mat, matvar_t *matvar, enum matio_compression compress

     return err;
 }
+
+/** @brief Limit max number of variables to allocate memory for
+ *
+ * This sets a limit on maximum number of variables which may be allocated for a struct.
+ * max number of variables is number of struct members * number of elements per member.
+ * By default there is not limit.
+ * @ingroup MAT
+ * @param mat MAT file
+ * @param limit max number of variables in the struct
+ */
+void Mat_VarSetSrtuctVarsLimit(mat_t *mat, size_t limit)
+{
+    mat->struct_vars_limit = limit;
+}
diff --git a/src/mat5.c b/src/mat5.c
index eb4b384..258b0d7 100644
--- a/src/mat5.c
+++ b/src/mat5.c
@@ -1468,6 +1468,10 @@ ReadNextStructField(mat_t *mat, matvar_t *matvar)
         if ( !matvar->nbytes )
             return bytesread;

+        if (mat->struct_vars_limit && nelems_x_nfields > mat->struct_vars_limit) {
+            Mat_Critical("Struct vars limit exceeded. Allocation failed");
+            return bytesread;
+        }
         matvar->data = calloc(nelems_x_nfields, matvar->data_size);
         if ( NULL == matvar->data ) {
             Mat_Critical("Couldn't allocate memory for the data");
@@ -1739,6 +1743,10 @@ ReadNextStructField(mat_t *mat, matvar_t *matvar)
         if ( !matvar->nbytes )
             return bytesread;

+        if (mat->struct_vars_limit && nelems_x_nfields > mat->struct_vars_limit) {
+            Mat_Critical("Struct vars limit exceeded. Allocation failed");
+            return bytesread;
+        }
         matvar->data = calloc(nelems_x_nfields, matvar->data_size);
         if ( NULL == matvar->data ) {
             Mat_Critical("Couldn't allocate memory for the data");
diff --git a/src/matio.h b/src/matio.h
index 8a4caeb..78e7168 100644
--- a/src/matio.h
+++ b/src/matio.h
@@ -347,6 +347,7 @@ EXTERN int Mat_VarWriteAppend(mat_t *mat, matvar_t *matvar, enum matio_compressi
 EXTERN int Mat_VarWriteInfo(mat_t *mat, matvar_t *matvar);
 EXTERN int Mat_VarWriteData(mat_t *mat, matvar_t *matvar, void *data, int *start, int *stride,
                             int *edge);
+EXTERN void Mat_VarSetSrtuctVarsLimit(mat_t *mat, size_t limit);

 /* Other functions */
 EXTERN int Mat_CalcSingleSubscript(int rank, int *dims, int *subs);
diff --git a/src/matio_private.h b/src/matio_private.h
index d6d22b6..51f9594 100644
--- a/src/matio_private.h
+++ b/src/matio_private.h
@@ -113,6 +113,7 @@ struct _mat_t
     mat_off_t bof;       /**< Beginning of file not including any header */
     size_t next_index;   /**< Index/File position of next variable to read */
     size_t num_datasets; /**< Number of datasets in the file */
+    size_t struct_vars_limit; /**< A limit to max number of variable in a struct. n_members * n_elems_per_member */
 #if defined(MAT73) && MAT73
     hid_t refs_id; /**< Id of the /#refs# group in HDF5 */
 #endif
tbeu commented 10 months ago

I wonder where you get those corrupted MAT files. I doubt it can be created by MATLAB save command.

Ri0n commented 10 months ago

I don't really know. Those come to me from other people. Maybe they have special software to generate all sort of invalid stuff just to check software stability.

tbeu commented 8 months ago

@Ri0n Do you think somethink like https://github.com/tbeu/matio/commit/0adfda6e129c2cedcae89a95918792e62fcd9bbd could also work here?

Ri0n commented 8 months ago

@tbeu I'm just worried about compressed stuff inside of mat files, which may trigger false positives for failure. But I'm not an expert to say where it's the case for this commit

vladimir1899 commented 8 months ago

Hello, @tbeu , I continue the Ri0n's mission. So, 0adfda6 does not work for us, since the problem arises later in the code, when we invoke function:

if ( matvar->class_type == MAT_C_STRUCT ) (void)ReadNextStructField(mat, matvar);

mat5.c:5479 in this commit

In ReadNextStructField we have matvar->data = calloc(nelems_x_nfields, matvar->data_size); were nelems_x_nfields comes from our corrupted file and is unrealistically big.

I tried to open this file in the Matlab and it immediately throw the error kinda: "you need 45 GB of memory to open the file, but only 32 GB are available".

May be it is good idea to follow the Matlab way and implement such check, something like that ?

// Get total memory ram in bytes
#if ( defined(_WIN64) || defined(_WIN32) ) && !defined(__CYGWIN__)
#include <windows.h>
long long getPhysMemSize()
{
    MEMORYSTATUSEX status;
    status.dwLength = sizeof(status);
    GlobalMemoryStatusEx(&status);
    return status.ullTotalPhys;
}
#define physRamVolumeBytes true;
#else
#include <sys/sysinfo.h>
long long getPhysMemSize()
{
    struct sysinfo info;
    sysinfo(&info);
    return info.totalram;
}
#define physRamVolumeBytes true;
#endif

// check before allocation
#ifdef physRamVolumeBytes
        if ( getPhysMemSize() < nelems_x_nfields * matvar->data_size )
        {
            Mat_Critical("Couldn't allocate memory for the data. Not enough memory");
            return bytesread;
        }
#endif
        matvar->data = calloc(nelems_x_nfields, matvar->data_size);