Open jimfulton opened 4 years ago
Hi,
I hit the same performance problem, I needed to run the treeinterpreter for 25k records on a RandomForest of 500 trees, which was taking 12s each.
I have profiled the code and it became clear that much of the time is spent on building all the paths of the trees (see https://github.com/andosa/treeinterpreter/blob/master/treeinterpreter/treeinterpreter.py#L12), which obviously is needed for the first predict()
run of the given model, but then it is a waste to redo it again for the same model just because we are predicting for a different record (as long as the model is the same).
As a quick-fix, I have monkey-patched the library by adding LRU cache to the _get_tree_paths
method and my per record prediction speed went from 12s to 2s.
Obviously the first record's prediction takes longer (23s instead of 12s); only consecutive predictions are faster, because the paths are cached.
This isn't a nice solution - for that I would need to take the library apart and cache the paths for the trees - but more of a quick fix which can easily be applied via a monkey patch. See below.
from treeinterpreter import treeinterpreter as ti
from sklearn.tree import _tree
from functools import lru_cache
@lru_cache(maxsize=999999)
def _cached_tree_paths(tree, node_id, depth=0):
    """
    Returns all paths through the tree as list of node_ids (leaf first).

    Results are memoized per (tree, node_id, depth).  The recursion copies
    child paths before extending them, so cached entries are never mutated.
    """
    if node_id == _tree.TREE_LEAF:
        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
    left_child = tree.children_left[node_id]
    right_child = tree.children_right[node_id]
    if left_child != _tree.TREE_LEAF:
        # Internal node: extend every path of both subtrees with this node.
        # `path + [node_id]` builds a new list instead of appending in place;
        # in-place appends would corrupt the cached results of the child calls.
        paths = [path + [node_id]
                 for child in (left_child, right_child)
                 for path in _cached_tree_paths(tree, child, depth=depth + 1)]
    else:
        paths = [[node_id]]
    return paths


def _get_tree_paths(tree, node_id, depth=0):
    """
    Drop-in cached replacement for treeinterpreter's _get_tree_paths.

    Returns fresh copies of the cached paths, because ti._predict_tree
    reverses each path in place; handing out the cached lists directly
    would flip them on every subsequent call and corrupt the cache.
    """
    return [list(path) for path in _cached_tree_paths(tree, node_id, depth)]
# monkey-patch treeinterpreter so ti.predict() uses the cached path builder
ti._get_tree_paths = _get_tree_paths
# now you can make your predictions normally:
ti_prediction, ti_bias, ti_contributions = ti.predict(model, record1) # this will be slow, as it will cache the trees' paths
ti_prediction, ti_bias, ti_contributions = ti.predict(model, record2) # this will be fast using the cached paths
ti_prediction, ti_bias, ti_contributions = ti.predict(model, record3) # this will be fast using the cached paths
The above will cause issues as in the _predict_tree()
method the paths is being mutated - which is a lists of lists, so it does mutate causing problems with recurring calls.
For this reason you might do better implementing the caching yourself and modifying the _predict_tree()
by moving the paths' reversal up before caching, so that object doesn't need to be mutated later, so its cached version doesn't mutate:
import numpy as np
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, _tree

from treeinterpreter import treeinterpreter as ti
# Cache of reversed tree paths, keyed by (tree, node_id).  Keying on the
# tree object itself (identity hash) avoids building a key from repr(tree),
# whose embedded memory address could be reused by a different tree after
# garbage collection.  NOTE: the cache grows without bound and keeps every
# cached tree alive; clear tree_paths when you are done with a model.
tree_paths = {}


def _get_tree_paths(tree, node_id, depth=0):
    """
    Returns all paths through the tree as list of node_ids, already
    reversed (root first), caching the result per (tree, node_id).
    """
    key = (tree, node_id)
    paths = tree_paths.get(key)
    if paths is None:
        paths = _get_tree_paths2(tree, node_id, depth=0)
        # reverse before caching so _predict_tree never has to mutate
        # the cached lists afterwards
        for path in paths:
            path.reverse()
        tree_paths[key] = paths
    return paths
def _get_tree_paths2(tree, node_id, depth=0):
    """
    Returns all paths through the tree as list of node_ids
    (each path runs leaf first, ending at `node_id`).
    """
    if node_id == _tree.TREE_LEAF:
        raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
    left = tree.children_left[node_id]
    # a node with no left child is a leaf: its only path is itself
    if left == _tree.TREE_LEAF:
        return [[node_id]]
    right = tree.children_right[node_id]
    result = []
    # collect the subtree paths (left subtree first, matching the
    # original concatenation order) and tack this node onto each
    for child in (left, right):
        for path in _get_tree_paths2(tree, child, depth=depth + 1):
            path.append(node_id)
            result.append(path)
    return result
def _predict_tree(model, X, joint_contribution=False):
    """
    For a given DecisionTreeRegressor, DecisionTreeClassifier,
    ExtraTreeRegressor, or ExtraTreeClassifier,
    returns a triple of [prediction, bias and feature_contributions], such
    that prediction ≈ bias + feature_contributions.
    """
    leaves = model.apply(X)
    # paths come back from the cache already reversed (root first); do NOT
    # reverse them here -- mutating them would corrupt the cached copies
    paths = _get_tree_paths(model.tree_, 0)

    # map each leaf node id (last element of a path) to its full path
    leaf_to_path = {path[-1]: path for path in paths}

    # remove the single-dimensional inner arrays
    values = model.tree_.value.squeeze(axis=1)
    # reshape if squeezed into a single float
    if len(values.shape) == 0:
        values = np.array([values])
    if isinstance(model, DecisionTreeRegressor):
        # we require the values to be the same shape as the biases
        values = values.squeeze(axis=1)
        biases = np.full(X.shape[0], values[paths[0][0]])
        line_shape = X.shape[1]
    elif isinstance(model, DecisionTreeClassifier):
        # scikit stores category counts, we turn them into probabilities
        normalizer = values.sum(axis=1)[:, np.newaxis]
        normalizer[normalizer == 0.0] = 1.0
        values /= normalizer
        biases = np.tile(values[paths[0][0]], (X.shape[0], 1))
        line_shape = (X.shape[1], model.n_classes_)
    direct_prediction = values[leaves]

    # make into python list, accessing values will be faster
    values_list = list(values)
    feature_index = list(model.tree_.feature)

    contributions = []
    if joint_contribution:
        for row, leaf in enumerate(leaves):
            path = leaf_to_path[leaf]
            path_features = set()
            contributions.append({})
            for i in range(len(path) - 1):
                path_features.add(feature_index[path[i]])
                contrib = values_list[path[i + 1]] - values_list[path[i]]
                # compute the (sorted) feature-set key once per step
                key = tuple(sorted(path_features))
                contributions[row][key] = contributions[row].get(key, 0) + contrib
        return direct_prediction, biases, contributions
    else:
        unique_contributions = {}
        for leaf in np.unique(leaves):
            # direct dict lookup replaces the original linear scan over
            # all paths per unique leaf (O(leaves * paths) -> O(leaves))
            path = leaf_to_path[leaf]
            contribs = np.zeros(line_shape)
            for i in range(len(path) - 1):
                contrib = values_list[path[i + 1]] - values_list[path[i]]
                contribs[feature_index[path[i]]] += contrib
            unique_contributions[leaf] = contribs
        for leaf in leaves:
            contributions.append(unique_contributions[leaf])
        return direct_prediction, biases, np.array(contributions)
# monkey-patch treeinterpreter so ti.predict() uses the cached paths and
# the lookup-based _predict_tree defined above
ti._get_tree_paths = _get_tree_paths
ti._predict_tree = _predict_tree
# now you can make your predictions normally:
ti_prediction, ti_bias, ti_contributions = ti.predict(model, record1) # this will be slow, as it will cache the trees' paths
ti_prediction, ti_bias, ti_contributions = ti.predict(model, record2) # this will be fast using the cached paths
ti_prediction, ti_bias, ti_contributions = ti.predict(model, record3) # this will be fast using the cached paths
You could also cache the leaf_to_path
, that should shave off some more time.
Hi,
I'm working on a project where treeinterpreter is taking ~2 minutes per prediction.
I suppose this is because the implementation is in pure Python.
Do you know if anyone has looked at porting this to C (or cython or whatever) to make it go faster?
Jim