# H5 attempt:

import numpy as np
import h5py
import os
from tqdm import tqdm
import pandas as pd

def save_dict_to_hdf5(dic, filename):
    """
    Write a (possibly nested) dictionary into a freshly created HDF5 store.

    The file is opened through ``pd.HDFStore`` in ``'w'`` mode and the
    dictionary is handed to ``recursively_save_dict_contents_to_group``
    starting at the root path ``'/'``.

    :param dic: dictionary to persist; nested dictionaries are traversed by
        the recursive helper.
    :param filename: path of the HDF5 file to write.
    """
    with pd.HDFStore(filename, mode='w') as store:
        recursively_save_dict_contents_to_group(store, '/', dic)

def recursively_save_dict_contents_to_group(h5file, path, dic):
    """
    Store every non-dict value of ``dic`` in ``h5file`` under a slash-joined key.

    Nested dictionaries are walked depth-first; each nesting level appends
    ``key + '/'`` to the path prefix, so a value at ``dic['a']['b']`` ends up
    under ``path + 'a/b'``.

    :param h5file: object supporting ``h5file[key] = value``; the caller in
        this file passes an open ``pd.HDFStore``.
    :param path: key prefix for this level (``'/'`` at the root).
    :param dic: dictionary whose entries are written; keys are concatenated
        onto ``path`` and therefore have to be strings.
    """
    for name, value in dic.items():
        target = path + name
        if not isinstance(value, dict):
            # Leaf value: hand it to the store as-is.
            h5file[target] = value
            continue
        # Sub-dictionary: descend one level with an extended prefix.
        recursively_save_dict_contents_to_group(h5file, target + '/', value)

def load_dict_from_hdf5(filename):
    """
    Load the contents of an HDF5 store into a nested dictionary.

    The file is opened through ``pd.HDFStore`` and the nested dictionary is
    rebuilt by ``recursively_load_dict_contents_from_group`` starting at the
    root path ``'/'``.

    :param filename: path of an existing HDF5 file.
    :return: nested dictionary built from the keys found in the store.
    """
    # Loading never writes, so open read-only ('r') instead of 'r+': this
    # works on read-only files and cannot modify the file by accident.
    with pd.HDFStore(filename, mode='r') as store:
        return recursively_load_dict_contents_from_group(store, '/')

def recursively_load_dict_contents_from_group(h5file, path):
    """
    Rebuild a nested dictionary from the flat, slash-separated keys of a store.

    Each key returned by ``h5file.keys()`` is split on ``"/"`` and the first
    (empty, for keys starting with ``"/"``) component is dropped. The
    remaining components except the last become nested dictionaries, and the
    last component maps to ``h5file.get(key)``.

    :param h5file: object exposing ``keys()`` and ``get(key)``; the caller in
        this file passes an open ``pd.HDFStore``.
    :param path: currently unused; every key of the store is loaded
        regardless of this value.
    :return: nested dictionary mirroring the key hierarchy of the store.
    """
    tree = {}

    for key in h5file.keys():
        parts = key.split("/")[1:]
        if not parts:
            # Nothing left after dropping the leading component.
            continue

        # Descend from the current node (not from the root) so that keys
        # nested three or more levels deep land in the correct branch.
        node = tree
        for part in parts[:-1]:
            node = node.setdefault(part, {})

        node[parts[-1]] = h5file.get(key)

    return tree

def dummy_dict(n_entries=1000, n_series=25, n_frames=200, frame_shape=(14, 20)):
    """
    Build a synthetic nested dictionary of pandas objects for experiments.

    The result maps ``str(k)`` for ``k`` in ``range(n_entries)`` to a
    dictionary with two fixed keys:

    * ``"1"``: ``n_series`` entries, ``str(i) -> pd.Series(i)``
    * ``"2"``: ``n_frames`` entries, ``str(j) -> pd.DataFrame`` filled with
      ``np.random.randn`` values of shape ``frame_shape``

    The defaults reproduce the previously hard-coded sizes.

    :param n_entries: number of top-level entries.
    :param n_series: number of Series per entry.
    :param n_frames: number of DataFrames per entry.
    :param frame_shape: dimensions passed to ``np.random.randn`` for each
        DataFrame.
    :return: the nested dictionary described above.
    """
    payload = {}

    for entry in range(n_entries):
        series_group = {str(i): pd.Series(i) for i in range(n_series)}
        frame_group = {
            str(j): pd.DataFrame(np.random.randn(*frame_shape))
            for j in range(n_frames)
        }
        payload[str(entry)] = {"1": series_group, "2": frame_group}

    return payload

if __name__ == '__main__':

    # Generate the synthetic payload and report how many top-level entries it has.
    data = dummy_dict()
    print(len(data))

    # Alternative small payloads kept for manual experiments:
    #
    # data = {'x': 10,
    #         'y': np.arange(10),
    #         'd': {'x': np.ones((2, 3)),
    #               'b': 10.5}}
    #
    # data = {"x": pd.DataFrame({"hallo": [10, 12, 14], "jo": [423, 5436, 2]}),
    #         "y": {"a": pd.Series(), "b": pd.Series()}}
    #
    filename = 'test.h5'
    print("writing")
    # NOTE: the save step is currently disabled, so 'test.h5' has to exist
    # already for the load below to succeed.
    # save_dict_to_hdf5(data, filename)
    dd = load_dict_from_hdf5(filename)
    # print(dd)
# lucasfbn / Reddit-Sentiment-Reinforcement-Learning
#
# Change data format and implement a data loader (yield) such that multiple runs can be run in parallel easily #158
#
# H5 attempt: