A current pain point is that for parallel queries of faces/boxes/edges, the tree is traversed twice: once to count and allocate, and again to actually store the results. This is primarily because we have no thread-safe list or similar structure to append to across threads.
A simpler scheme, however, is to chunk by the number of threads, create a (C++-like) vector for each, and append to that.
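The "vector" here is nothing more than an array plus a fill count, doubling its capacity whenever it runs out of space; since every thread owns its own buffer, no locking is needed. A minimal sketch of just that append pattern (the names are illustrative, not part of the actual code):

```python
import numba as nb
import numpy as np


@nb.njit(inline="always")
def append(buffer, size, value):
    # Double the capacity when full: amortized O(1) appends,
    # no locks needed because each thread owns its own buffer.
    if size >= buffer.shape[0]:
        new = np.empty(buffer.shape[0] * 2, dtype=buffer.dtype)
        new[:size] = buffer[:size]
        buffer = new
    buffer[size] = value
    return buffer, size + 1
```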
I've set up an implementation:
In `query.py`:
```python
# Relies on the helpers (as_box, boxes_intersect, allocate_stack, push, pop)
# and type aliases (Box, CellTreeData, IntArray, FloatArray, IntDType) already
# available in query.py.
@nb.njit(inline="always")
def locate_box_parallel(
    box: Box,
    tree: CellTreeData,
    indices: IntArray,
    indices_size: int,
    index: int,
) -> Tuple[int, IntArray, int]:
    """
    Collect all tree bboxes intersecting ``box``.

    Appends (index, bbox_index) pairs to ``indices``, doubling its capacity
    when it fills up, and returns the match count, the (possibly reallocated)
    array, and the new fill size.
    """
    tree_bbox = as_box(tree.bbox)
    if not boxes_intersect(box, tree_bbox):
        return 0, indices, indices_size
    stack = allocate_stack()
    stack[0] = 0
    size = 1
    count = 0
    while size > 0:
        node_index, size = pop(stack, size)
        node = tree.nodes[node_index]
        # Check if it's a leaf
        if node["child"] == -1:
            # Iterate over the bboxes in the leaf
            for i in range(node["ptr"], node["ptr"] + node["size"]):
                bbox_index = tree.bb_indices[i]
                # As a named tuple: saves about 15% runtime
                leaf_box = as_box(tree.bb_coords[bbox_index])
                if boxes_intersect(box, leaf_box):
                    # Re-allocate if needed: double the capacity and copy.
                    indices_length = indices.shape[0]
                    if indices_size >= indices_length:
                        new = np.empty((indices_length * 2, 2), dtype=IntDType)
                        new[:indices_length, :] = indices[:, :]
                        indices = new
                    indices[indices_size, 0] = index
                    indices[indices_size, 1] = bbox_index
                    indices_size += 1
                    count += 1
        else:
            dim = 1 if node["dim"] else 0
            minimum = 2 * dim
            maximum = 2 * dim + 1
            left = box[minimum] <= node["Lmax"]
            right = box[maximum] >= node["Rmin"]
            left_child = node["child"]
            right_child = left_child + 1
            if left and right:
                stack, size = push(stack, left_child, size)
                stack, size = push(stack, right_child, size)
            elif left:
                stack, size = push(stack, left_child, size)
            elif right:
                stack, size = push(stack, right_child, size)
    return count, indices, indices_size


@nb.njit(cache=True)
def locate_boxes_parallel(
    box_coords: FloatArray,
    tree: CellTreeData,
) -> IntArray:
    """Query every box in this chunk, growing a single result buffer."""
    n_box = box_coords.shape[0]
    indices = np.empty((n_box, 2), dtype=IntDType)
    total_count = 0
    indices_size = 0
    for box_index in range(n_box):
        box = as_box(box_coords[box_index])
        count, indices, indices_size = locate_box_parallel(
            box, tree, indices, indices_size, box_index
        )
        total_count += count
    return np.ascontiguousarray(indices[:total_count, :])


@nb.njit(cache=True, parallel=PARALLEL)
def locate_boxes_chunked(
    box_coords: FloatArray,
    tree: CellTreeData,
    n_chunks: int,
):
    chunks = np.array_split(box_coords, n_chunks)
    indices = [np.empty((0, 2), dtype=IntDType) for _ in range(n_chunks)]
    for i in nb.prange(n_chunks):
        indices[i] = locate_boxes_parallel(chunks[i], tree)

    # numba seems unhappy with stack or concatenate on a list of arrays,
    # so copy the per-chunk results into the output arrays manually.
    sizes = np.empty(n_chunks, dtype=IntDType)
    total_size = 0
    for i in range(n_chunks):
        size = len(indices[i])
        sizes[i] = size
        total_size += size

    ii = np.empty(total_size, dtype=IntDType)
    jj = np.empty(total_size, dtype=IntDType)
    start = 0
    for i in range(n_chunks):
        end = start + sizes[i]
        ii[start:end] = indices[i][:, 0]
        jj[start:end] = indices[i][:, 1]
        start = end
    return ii, jj
```
And calling it in `celltree.py`:
```python
def locate_boxes_new(self, bbox_coords: FloatArray) -> Tuple[IntArray, IntArray]:
    bbox_coords = cast_bboxes(bbox_coords)
    n_chunks = nb.get_num_threads()
    return locate_boxes_chunked(bbox_coords, self.celltree_data, n_chunks)
```
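For reference, usage would then look roughly like this; `tree` stands for a CellTree2d instance built elsewhere, and the (xmin, xmax, ymin, ymax) box layout follows the convention used in the traversal above:

```python
import numpy as np

# Hypothetical query boxes as (xmin, xmax, ymin, ymax) rows.
bbox_coords = np.array(
    [
        [0.0, 1.0, 0.0, 1.0],
        [0.5, 2.5, 0.5, 2.5],
    ]
)
# ii: index into bbox_coords; jj: the matching bbox_index from the tree.
ii, jj = tree.locate_boxes_new(bbox_coords)
```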
Then benchmarking it:
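(The benchmark script itself isn't reproduced here. Presumably it times the new chunked path against the existing two-pass implementation; a sketch of such a comparison, assuming that method is called `locate_boxes` and that `tree` and a large `bbox_coords` array have been set up beforehand:)

```python
import timeit

# Assumed setup: `tree` is a built CellTree2d and `bbox_coords` is a large
# array of query boxes; `locate_boxes` is assumed to be the existing
# two-pass implementation. Call both once first to exclude JIT compilation.
tree.locate_boxes(bbox_coords)
tree.locate_boxes_new(bbox_coords)

two_pass = timeit.timeit(lambda: tree.locate_boxes(bbox_coords), number=10)
chunked = timeit.timeit(lambda: tree.locate_boxes_new(bbox_coords), number=10)
print(f"two-pass: {two_pass / 10:.3f} s, chunked: {chunked / 10:.3f} s")
```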
Unfortunately, results are rather disappointing: