Would you take a caching decorator? I'm now using this during my "data cleanup" phase so that I can run my whole stack with one function call in the end but don't spend unnecessary time redoing computations which could potentially take a few hours.
I ended up with this thing here:
def make_hash_single(item):
    """Return a hash for a single argument value.

    DataFrames and Series are not hashable themselves, so they are first
    converted to nested tuples of their index, (columns) and values.  Any
    other value is hashed directly; if that fails, a hash of ``tuple(item)``
    is attempted as a fallback (e.g. for lists).

    NOTE: the result comes from the builtin ``hash``.  For ``str``/``bytes``
    content Python salts that hash per interpreter process unless
    ``PYTHONHASHSEED`` is fixed, so such hashes are only stable within one
    process.

    item: any object
        The value to hash.

    Returns the integer hash.  Raises ``TypeError`` if neither the item nor
    its tuple conversion is hashable.
    """
    if isinstance(item, pd.DataFrame):
        # 2-D values: every row becomes its own tuple
        item = (
            tuple(item.index),
            tuple(item.columns),
            tuple(tuple(row) for row in item.values),
        )
    elif isinstance(item, pd.Series):
        # 1-D values: the elements are scalars, so they must not be wrapped
        # in tuple() individually (that raises TypeError for every scalar)
        item = (tuple(item.index), tuple(item.values))

    try:
        return hash(item)
    except TypeError:
        pass

    # this might act funny if a thing is convertible to tuple but the tuple
    # is not a proper representation for the item
    try:
        as_tuple = tuple(item)
    except TypeError:
        # not even iterable, so there are no element types to report
        print("Unhashable type: %s, %s" % (item, type(item)))
        raise

    try:
        return hash(as_tuple)
    except TypeError:
        print("Unhashable type: %s, %s" % (item, [type(t) for t in as_tuple]))
        raise
def make_hash(*args, **kwargs):
    """Return a non-negative hash for a complete call signature.

    Every positional and keyword argument is hashed with
    ``make_hash_single`` and the results are combined by hashing a tuple.
    Unlike XOR-folding the individual hashes, this keeps the positional
    order significant (``f(a, b)`` differs from ``f(b, a)``) and does not
    let repeated arguments cancel each other out (``f(a, a)`` differs from
    ``f(b, b)``).  Keyword arguments are combined in sorted key order so
    the order in which they are passed does not matter.

    Returns a non-negative int.  Raises ``TypeError`` (propagated from
    ``make_hash_single``) if any argument is unhashable.
    """
    positional = tuple(make_hash_single(arg) for arg in args)
    keyword = tuple(
        (make_hash_single(key), make_hash_single(kwargs[key]))
        for key in sorted(kwargs)
    )
    # abs() keeps the value usable as a plain numeric suffix in a key name
    return abs(hash((positional, keyword)))
class memoized_DataFrame(object):
    """Decorator that caches the DataFrame returned by a function in an HDF store."""

    def __init__(self, cache_dir=".", store_name=None, prefix=None):
        """Cache the returned DataFrame from a function.

        The decorator will cache a returned DataFrame in an HDF store
        and, if called with the same arguments, will return the cached
        DataFrame.

        Values are stored as <prefix>+'_'+hash(<inputs>) in the HDF
        store with the name <cache_dir>/<store_name>.

        The inputs have to be hashable. DataFrames and Series are
        converted to tuples to get a runtime hash of the content.
        If an input is unhashable, a warning is issued and the function
        is called without caching.

        If you change the function you have to delete the value from
        the store by yourself!

        cache_dir: str (default: ".")
            Name of the cache dir, where the HDF store should be placed
        store_name: str (default: name of the function + '.hdf5')
            Name of the HDF store where values will be stored
        prefix: str (default: name of the function)
            prefix for the lookup in the HDF store
        """
        self.store_name = store_name
        self.prefix = prefix
        self.cache_dir = cache_dir

    def __call__(self, func):
        """Return a wrapper around *func* that consults the HDF cache first."""
        import functools
        import os
        import warnings

        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            import pandas as pd

            try:
                h = make_hash(*args, **kwargs)
            except TypeError:
                # unhashable input: fall back to calling the function uncached
                args_t = [type(a) for a in args]
                kwargs_t = [type(kwargs[k]) for k in kwargs]
                warnings.warn("unhashable type: %s, %s" % (args_t, kwargs_t))
                return func(*args, **kwargs)

            func_name = func.__name__
            lookup_name = (self.prefix or func_name) + "_" + str(h)
            store_name = os.path.join(
                self.cache_dir, (self.store_name or func_name + ".hdf5")
            )

            # lookup cache value; HDFStore is used directly as a context
            # manager because the old ``get_store`` helper no longer exists
            # in current pandas
            try:
                with pd.HDFStore(store_name) as store:
                    df = store[lookup_name]
            except KeyError:
                # not found; compute below, outside of this handler, so that
                # errors raised by func are not chained onto the KeyError
                pass
            else:
                # found, so just return
                print("Using cached value '%s' from store '%s'." % (lookup_name, store_name))
                return df

            # not found, so compute, cache, and return
            ret = func(*args, **kwargs)
            if not isinstance(ret, pd.DataFrame):
                # only DataFrames are stored; anything else is passed through
                warnings.warn("Function '%s' did not produce a DataFrame" % func_name)
                return ret
            with pd.HDFStore(store_name) as store:
                store[lookup_name] = ret
            print("Caching value as '%s' in store '%s'" % (lookup_name, store_name))
            return ret

        return wrapped
Would you take a caching decorator? I'm now using this during my "data cleanup" phase so that I can run my whole stack with one function call in the end but don't spend unnecessary time redoing computations which could potentially take a few hours.
I ended up with this thing here:
Usage: