engarde-dev / engarde

A library for defensive data analysis.
MIT License
500 stars 40 forks source link

caching decorator? #3

Closed jankatins closed 2 years ago

jankatins commented 9 years ago

Would you take a caching decorator? I'm now using this during my "data cleanup" phase so that I can run my whole stack with one function call in the end, without spending unnecessary time redoing computations which could potentially take a few hours.

I ended up with this thing here:

def make_hash_single(item):
    """Return a hash for a single argument value.

    DataFrames and Series are not hashable, so they are first flattened
    into nested tuples (index, [columns,] values) and the tuple is hashed.
    Any other unhashable object is retried as ``tuple(item)``.

    Note: the builtin ``hash`` of ``str``/``bytes`` objects is salted per
    interpreter process unless ``PYTHONHASHSEED`` is fixed, so hashes
    involving such values are only guaranteed stable within one process.

    item: object
        The value to hash.

    Returns the integer hash. Raises TypeError if the value (or its
    tuple form) contains something unhashable.
    """
    if isinstance(item, pd.DataFrame):
        index = tuple(item.index)
        columns = tuple(item.columns)
        # 2-D values: one inner tuple per row
        values = tuple(tuple(row) for row in item.values)
        item = (index, columns, values)
    elif isinstance(item, pd.Series):
        index = tuple(item.index)
        # A Series is one-dimensional, so its values are scalars. Wrapping
        # each one in tuple() raises TypeError for numbers and splits
        # strings into characters; take the values as they are.
        values = tuple(item.values)
        item = (index, values)

    try:
        return hash(item)
    except TypeError:
        try:
            # this might act funny if a thing is convertible to tuple but the
            # tuple is not a proper representation for the item (e.g. a dict
            # becomes the tuple of its keys only)
            return hash(tuple(item))
        except TypeError:
            print("Unhashable type: %s, %s" % (item, [type(t) for t in tuple(item)]))
            # bare raise keeps the original traceback intact
            raise

def make_hash(*args, **kwargs):
    """Combine the hashes of all call arguments into one non-negative int.

    Positional arguments are hashed in order, so ``make_hash(a, b)`` and
    ``make_hash(b, a)`` differ. Keyword arguments are hashed as
    (name, value-hash) pairs sorted by name, so the order in which
    keywords are written does not matter.

    The per-argument hashes are combined by hashing a tuple rather than by
    XOR-folding: XOR is commutative and self-cancelling (``h ^ x ^ x == h``),
    which would make calls with swapped or repeated arguments collide.

    Returns a non-negative integer. Raises TypeError (from
    ``make_hash_single``) if any argument cannot be hashed.
    """
    positional = tuple(make_hash_single(arg) for arg in args)
    # keyword names are always strings, so they can be sorted safely
    keyword = tuple(
        (key, make_hash_single(kwargs[key])) for key in sorted(kwargs)
    )
    return abs(hash((positional, keyword)))

class memoized_DataFrame(object):
    """Decorator that caches a function's returned DataFrame in a HDF store."""

    def __init__(self, cache_dir=".",  store_name=None, prefix=None):
        """Cache the returned DataFrame from a function.

        The decorator will cache a returned DataFrame in a HDF store
        and if called with the same arguments will return the cached
        DataFrame.

        Values are stored as <prefix>+'_'+hash(<inputs>) in the HDF
        store with the name <cache_dir>/<store_name>.

        The inputs have to be hashable. DataFrames and Series are
        converted to tuples to get a runtime hash of the content.
        If the inputs cannot be hashed, a warning is issued and the
        function is called without caching.

        If the function does not return a DataFrame, a warning is issued
        and the result is returned without being cached.

        If you change the function you have to delete the value from
        the store by yourself!

        cache_dir: str (default: ".")
            Name of the cache dir, where the HDF store should be placed

        store_name: str (default: name of the function + '.hdf5')
            Name of the HDF store where values will be stored

        prefix: str (default: name of the function)
            prefix for the lookup in the HDF store
        """
        self.store_name = store_name
        self.prefix = prefix
        self.cache_dir = cache_dir

    def __call__(self, func):
        """Wrap ``func`` so its DataFrame result is read from / written to the store."""
        import functools

        # wraps() copies __name__, __doc__, __module__ etc. to the wrapper,
        # not only the docstring.
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            import os
            import warnings
            import pandas as pd
            try:
                h = make_hash(*args, **kwargs)
            except TypeError:
                # unhashable: fall back to an uncached call
                args_t = [type(a) for a in args]
                kwargs_t = [type(kwargs[k]) for k in kwargs]
                warnings.warn("unhashable type: %s, %s" % (args_t, kwargs_t))
                return func(*args, **kwargs)

            func_name = func.__name__
            lookup_name = (self.prefix or func_name) + "_" + str(h)
            store_name = os.path.join(self.cache_dir, (self.store_name or func_name + ".hdf5"))

            # lookup cache value; pd.HDFStore is used as the context manager
            # because pandas.io.pytables.get_store is no longer available in
            # current pandas releases
            try:
                with pd.HDFStore(store_name) as store:
                    df = store[lookup_name]
                # found, so just return
                print("Using cached value '%s' from store '%s'." % (lookup_name, store_name))
                return df
            except KeyError:
                # not found, so compute, cache, and return
                ret = func(*args, **kwargs)
                if not isinstance(ret, pd.DataFrame):
                    warnings.warn("Function '%s' did not produce a DataFrame" % func_name)
                    return ret

                with pd.HDFStore(store_name) as store:
                    store[lookup_name] = ret
                print("Caching value as '%s' in store '%s'" % (lookup_name, store_name))
                return ret

        return wrapped

Usage:

# With no store_name given, the store is "<cache_dir>/<function name>.hdf5",
# i.e. "temp/function2.hdf5" here.
@memoized_DataFrame(cache_dir="temp")
def function2(whatever="whatever"):
    # NOTE(review): this returns its argument unchanged (a str by default),
    # not a DataFrame, so the decorator warns "did not produce a DataFrame"
    # and returns the value without caching it.
    return whatever

function2()
TomAugspurger commented 9 years ago

Thanks. I'll look at this later. Have you seen joblib's caching stuff and do you know how your code compares with that?

jankatins commented 9 years ago

@TomAugspurger actually not, thanks! I will have a look and report back.