Closed liuguiyangnwpu closed 4 years ago
and another question: why the rrcf model is so slow to create a tree? I use feature is [5000, 3] to run 100 times. each created-tree spend 0.82 second.
Greetings,
With regards to your second question, I'd recommend keeping the tree size small. As the size of the tree increases, it becomes more inefficient to insert points, especially when the paths become deep. Notice how in the batch anomaly detection example, we are using trees containing 256 points each to sample a dataset of 2010 points. In other words, you don't need to have all the points in every tree.
With regards to your first question, consider serializing with a well-known and human-readable format like json. Here's an example. If this is useful, I can add these methods to RCTree
.
# Load modules
import numpy as np
import pandas as pd
import json
import rrcf
# Set parameters
np.random.seed(0)
n = 4
d = 3
# Generate data
X = np.random.randn(n, d)
# Construct tree
tree = rrcf.RCTree(X)
print(tree)
>>>
─+
├───+
│ ├──(3)
│ └───+
│ ├──(2)
│ └──(0)
└──(1)
def serialize(node, obj):
if isinstance(node, rrcf.Branch):
obj['type'] = 'Branch'
obj['q'] = int(node.q)
obj['p'] = float(node.p)
obj['n'] = int(node.n)
obj['b'] = node.b.tolist()
obj['l'] = {}
obj['r'] = {}
if node.l:
serialize(node.l, obj['l'])
if node.r:
serialize(node.r, obj['r'])
elif isinstance(node, rrcf.Leaf):
obj['type'] = 'Leaf'
obj['i'] = node.i
obj['x'] = node.x.tolist()
obj['d'] = int(node.d)
obj['n'] = int(node.n)
else:
raise TypeError('`node` must be Branch or Leaf instance')
# Create empty dict
obj = {}
# Serialize tree to dict
serialize(tree.root, obj)
# Dump to json file
with open('tree.json', 'w') as outfile:
json.dump(obj, outfile)
# Read json file
with open('tree.json', 'r') as infile:
obj = json.load(infile)
print(obj)
>>>
{'b': [[0.410598502, -0.151357208, -0.97727788],
[2.240893199, 1.86755799, 1.454273507]],
'l': {'b': [[0.410598502, -0.151357208, -0.103218852],
[1.764052346, 0.400157208, 1.454273507]],
'l': {'d': 2,
'i': 3,
'n': 1,
'type': 'Leaf',
'x': [0.410598502, 0.144043571, 1.454273507]},
'n': 3,
'p': 0.5285239876060783,
'q': 0,
'r': {'b': [[0.950088418, -0.151357208, -0.103218852],
[1.764052346, 0.400157208, 0.978737984]],
'l': {'d': 3,
'i': 2,
'n': 1,
'type': 'Leaf',
'x': [0.950088418, -0.151357208, -0.103218852]},
'n': 2,
'p': 1.6278109380129528,
'q': 0,
'r': {'d': 3,
'i': 0,
'n': 1,
'type': 'Leaf',
'x': [1.764052346, 0.400157208, 0.978737984]},
'type': 'Branch'},
'type': 'Branch'},
'n': 4,
'p': 1.7173439122667618,
'q': 1,
'r': {'d': 1,
'i': 1,
'n': 1,
'type': 'Leaf',
'x': [2.240893199, 1.86755799, -0.97727788]},
'type': 'Branch'}
def deserialize(obj, node, side='l'):
if obj['type'] == 'Branch':
q = obj['q']
p = obj['p']
n = np.int64(obj['n'])
b = np.asarray(obj['b'])
branch = rrcf.Branch(q=q, p=p, n=n, b=b, u=node)
setattr(node, side, branch)
if 'l' in obj:
deserialize(obj['l'], branch, side='l')
if 'r' in obj:
deserialize(obj['r'], branch, side='r')
elif obj['type'] == 'Leaf':
i = obj['i']
x = np.asarray(obj['x'])
d = obj['d']
n = np.int64(obj['n'])
leaf = rrcf.Leaf(i=i, x=x, d=d, n=n, u=node)
setattr(node, side, leaf)
else:
raise TypeError('`type` must be Branch or Leaf')
# Create anchor node
anchor = rrcf.Branch(q=None, p=None)
# Deserialize json object
deserialize(obj, anchor)
# Get root node
root = anchor.l
root.u = None
# Create tree and assign root node
tree = rrcf.RCTree()
tree.root = root
# Fill in leaves dict
tree.map_leaves(tree.root, op=(lambda x, leaves: leaves.update({x.i : x})),
leaves=tree.leaves)
# Set number of dimensions based on first leaf
tree.ndim = len(next(iter(tree.leaves.values())).x)
print(tree)
>>>
─+
├───+
│ ├──(3)
│ └───+
│ ├──(2)
│ └──(0)
└──(1)
Greetings,
With regards to your second question, I'd recommend keeping the tree size small. As the size of the tree increases, it becomes more inefficient to insert points, especially when the paths become deep. Notice how in the batch anomaly detection example, we are using trees containing 256 points each to sample a dataset of 2010 points. In other words, you don't need to have all the points in every tree.
Right, I use the max_depth to limit the each tree depth, it much more faster. thank you for your reply.
With regards to your first question, consider serializing with a well-known and human-readable format like json. Here's an example. If this is useful, I can add these methods to
RCTree
.Create example tree
# Load modules import numpy as np import pandas as pd import json import rrcf # Set parameters np.random.seed(0) n = 4 d = 3 # Generate data X = np.random.randn(n, d) # Construct tree tree = rrcf.RCTree(X)
print(tree) >>> ─+ ├───+ │ ├──(3) │ └───+ │ ├──(2) │ └──(0) └──(1)
Serialize to json
def serialize(node, obj): if isinstance(node, rrcf.Branch): obj['type'] = 'Branch' obj['q'] = int(node.q) obj['p'] = float(node.p) obj['n'] = int(node.n) obj['b'] = node.b.tolist() obj['l'] = {} obj['r'] = {} if node.l: serialize(node.l, obj['l']) if node.r: serialize(node.r, obj['r']) elif isinstance(node, rrcf.Leaf): obj['type'] = 'Leaf' obj['i'] = node.i obj['x'] = node.x.tolist() obj['d'] = int(node.d) obj['n'] = int(node.n) else: raise TypeError('`node` must be Branch or Leaf instance')
# Create empty dict obj = {} # Serialize tree to dict serialize(tree.root, obj) # Dump to json file with open('tree.json', 'w') as outfile: json.dump(obj, outfile) # Read json file with open('tree.json', 'r') as infile: obj = json.load(infile)
print(obj) >>> {'b': [[0.410598502, -0.151357208, -0.97727788], [2.240893199, 1.86755799, 1.454273507]], 'l': {'b': [[0.410598502, -0.151357208, -0.103218852], [1.764052346, 0.400157208, 1.454273507]], 'l': {'d': 2, 'i': 3, 'n': 1, 'type': 'Leaf', 'x': [0.410598502, 0.144043571, 1.454273507]}, 'n': 3, 'p': 0.5285239876060783, 'q': 0, 'r': {'b': [[0.950088418, -0.151357208, -0.103218852], [1.764052346, 0.400157208, 0.978737984]], 'l': {'d': 3, 'i': 2, 'n': 1, 'type': 'Leaf', 'x': [0.950088418, -0.151357208, -0.103218852]}, 'n': 2, 'p': 1.6278109380129528, 'q': 0, 'r': {'d': 3, 'i': 0, 'n': 1, 'type': 'Leaf', 'x': [1.764052346, 0.400157208, 0.978737984]}, 'type': 'Branch'}, 'type': 'Branch'}, 'n': 4, 'p': 1.7173439122667618, 'q': 1, 'r': {'d': 1, 'i': 1, 'n': 1, 'type': 'Leaf', 'x': [2.240893199, 1.86755799, -0.97727788]}, 'type': 'Branch'}
Deserialize and load into empty tree
def deserialize(obj, node, side='l'): if obj['type'] == 'Branch': q = obj['q'] p = obj['p'] n = np.int64(obj['n']) b = np.asarray(obj['b']) branch = rrcf.Branch(q=q, p=p, n=n, b=b, u=node) setattr(node, side, branch) if 'l' in obj: deserialize(obj['l'], branch, side='l') if 'r' in obj: deserialize(obj['r'], branch, side='r') elif obj['type'] == 'Leaf': i = obj['i'] x = np.asarray(obj['x']) d = obj['d'] n = np.int64(obj['n']) leaf = rrcf.Leaf(i=i, x=x, d=d, n=n, u=node) setattr(node, side, leaf) else: raise TypeError('`type` must be Branch or Leaf')
# Create anchor node anchor = rrcf.Branch(q=None, p=None) # Deserialize json object deserialize(obj, anchor) # Get root node root = anchor.l root.u = None # Create tree and assign root node tree = rrcf.RCTree() tree.root = root # Fill in leaves dict tree.map_leaves(tree.root, op=(lambda x, leaves: leaves.update({x.i : x})), leaves=tree.leaves) # Set number of dimensions based on first leaf tree.ndim = len(next(iter(tree.leaves.values())).x)
print(tree) >>> ─+ ├───+ │ ├──(3) │ └───+ │ ├──(2) │ └──(0) └──(1)
thank you for your json Serialize/Deserialize method, I will create large forest to benchmark the serialize efficient.
Dill, Pickle, Json
I want to know about the reason why the pickle dumps method not working. When I use from multiprocessing import Pool, Manager, the Thread Manager use Pickle to send bytes. But it's always failing send the dumps objects.
Try running pickle.dumps(tree.root)
instead of pickle.dumps(tree)
.
Note that when you load the pickle, you will need to add the root of the loaded object to an empty tree. So, similar to the above code:
# Load pickle file
with open('tree.p', 'rb') as p:
root = pickle.load(p)
# Create tree and assign root node
tree = rrcf.RCTree()
tree.root = root
# Fill in leaves dict
tree.map_leaves(tree.root, op=(lambda x, leaves: leaves.update({x.i : x})),
leaves=tree.leaves)
# Set number of dimensions based on first leaf
tree.ndim = len(next(iter(tree.leaves.values())).x)
Hi, In isolate forest, there is a parameter named max_depth, which to control the depth about the isolate tree (also, It can speed up the tree created). Why not provide that parameter ?
Hi mdb, when I use pickle to dump the tree, it was throw the exception about error message. So, I use dill package to dumps. How to fix it ?