Parallelization: try explicit chunking with numba prange iterating over chunks

A current pain point is that for parallel queries of faces/boxes/edges, the tree is traversed twice: once to count and allocate, and again to actually store results. This is primarily because we have no threadsafe list or something to append to, across threads.

A simpler scheme, however, is to chunk by the number of threads, create a (C++-like) vector for each, and append to that.

I've setup a implementation:

In `query.py`: ```python @nb.njit(inline="always") def locate_box_parallel(box: Box, tree: CellTreeData, indices: IntArray, indices_size: int, index: int) -> Tuple[int, int]: tree_bbox = as_box(tree.bbox) if not boxes_intersect(box, tree_bbox): return 0, indices, indices_size stack = allocate_stack() stack[0] = 0 size = 1 count = 0 while size > 0: node_index, size = pop(stack, size) node = tree.nodes[node_index] # Check if it's a leaf if node["child"] == -1: # Iterate over the bboxes in the leaf for i in range(node["ptr"], node["ptr"] + node["size"]): bbox_index = tree.bb_indices[i] # As a named tuple: saves about 15% runtime leaf_box = as_box(tree.bb_coords[bbox_index]) if boxes_intersect(box, leaf_box): # Re-allocate if needed. indices_length = indices.shape[0] if indices_size >= indices_length: new = np.empty((indices_length * 2, 2), dtype=IntDType) new[: indices_length, :] = indices[:, :] indices = new indices[indices_size, 0] = index indices[indices_size, 1] = bbox_index indices_size += 1 count += 1 else: dim = 1 if node["dim"] else 0 minimum = 2 * dim maximum = 2 * dim + 1 left = box[minimum] <= node["Lmax"] right = box[maximum] >= node["Rmin"] left_child = node["child"] right_child = left_child + 1 if left and right: stack, size = push(stack, left_child, size) stack, size = push(stack, right_child, size) elif left: stack, size = push(stack, left_child, size) elif right: stack, size = push(stack, right_child, size) return count, indices, indices_size @nb.njit(cache=True) def locate_boxes_parallel( box_coords: FloatArray, tree: CellTreeData, ) -> IntArray: """Mutates box_indices and tree_indices.""" n_box = box_coords.shape[0] indices = np.empty((n_box, 2), dtype=IntDType) total_count = 0 indices_size = 0 for box_index in range(n_box): box = as_box(box_coords[box_index]) count, indices, indices_size = locate_box_parallel(box, tree, indices, indices_size, box_index) total_count += count return np.ascontiguousarray(indices[:total_count, :]) @nb.njit(cache=True, parallel=PARALLEL) def locate_boxes_chunked( box_coords: FloatArray, tree: CellTreeData, n_chunks: int ): chunks = np.array_split(box_coords, n_chunks) indices = [np.empty((0, 2), dtype=IntDType) for _ in range(n_chunks)] for i in nb.prange(n_chunks): indices[i] = locate_boxes_parallel(chunks[i], tree) # numba seems unhappy with stack or concatenate. sizes = np.empty(n_chunks, dtype=IntDType) total_size = 0 for i in range(n_chunks): size = len(indices[i]) sizes[i] = size total_size += size ii = np.empty(total_size, dtype=IntDType) jj = np.empty(total_size, dtype=IntDType) start = 0 for i in range(n_chunks): end = start + sizes[i] ii[start: end] = indices[i][:, 0] jj[start: end] = indices[i][:, 1] start = end return ii, jj ``` And calling it in `celltree.py`: ```python def locate_boxes_new(self, bbox_coords: FloatArray) -> Tuple[IntArray, IntArray]: bbox_coords = cast_bboxes(bbox_coords) n_chunks = nb.get_num_threads() return locate_boxes_chunked(bbox_coords, self.celltree_data, n_chunks) ```

Then benchmarking it:

import os
# os.environ["NUMBA_NUM_THREADS"] = "24"

import numpy as np
import numba_celltree

triangles = np.fromfile("c:/tmp/triangles.bin", dtype=np.int64).reshape((-1, 3))
vertices = np.fromfile("c:/tmp/vertices.bin", dtype=np.float64).reshape((-1, 2))

tree = numba_celltree.CellTree2d(vertices, triangles, -1)
%timeit results = tree.locate_boxes(tree.bb_coords)
%timeit results = tree.locate_boxes_new(tree.bb_coords)

Unfortunately, results are rather disappointing:

404 ms ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
470 ms ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Deltares / numba_celltree

Parallelization: try explicit chunking with numba prange iterating over chunks #24