Open AndreVallestero opened 1 month ago
Here's the SIMD version. It's 3x faster than my solution above, but it'll require the entire engine to be rewritten to use an ECS so the data is all contiguous
int* test_anglecheck_cross_simd(World* world) {
int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);
#pragma omp parallel for
for (int i = 0; i < NUM_AGENTS; i++) {
int closest_idx = -1;
float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
__m256 closest_dist_sq_v = _mm256_set1_ps(closest_dist_sq);
// Create left and right boundary vectors
float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);
// Iterate over agents in chunks of 8 (assuming AVX which handles 8 floats at once)
for (int j = 0; j < NUM_AGENTS; j += 8) {
// Load target agent positions (x and y)
__m256 target_x = _mm256_loadu_ps(&world->agent_x[j]); // Load 8 floats starting from agents[j].x
__m256 target_y = _mm256_loadu_ps(&world->agent_y[j]); // Load 8 floats starting from agents[j].y
// Compute relative vector: tx = target.x - agent.x, ty = target.y - agent.y
__m256 tx = _mm256_sub_ps(target_x, _mm256_set1_ps(world->agent_x[i]));
__m256 ty = _mm256_sub_ps(target_y, _mm256_set1_ps(world->agent_y[i]));
// Compute squared distance: t_dist_sq = tx * tx + ty * ty
__m256 t_dist_sq = _mm256_fmadd_ps(tx, tx, _mm256_mul_ps(ty, ty)); // fused multiply add
// Mask for agents within range
__m256 range_mask = _mm256_cmp_ps(t_dist_sq, _mm256_set1_ps(AGENT_RANGE * AGENT_RANGE), _CMP_LE_OQ);
// Compute cross products for boundary checks
__m256 cross_1 = _mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(left_bound_x), ty),
_mm256_mul_ps(_mm256_set1_ps(left_bound_y), tx));
__m256 cross_2 = _mm256_sub_ps(_mm256_mul_ps(tx, _mm256_set1_ps(right_bound_y)),
_mm256_mul_ps(ty, _mm256_set1_ps(right_bound_x)));
// Mask for agents between the rays (cross_1 > 0 and cross_2 > 0)
__m256 ray_mask = _mm256_and_ps(_mm256_cmp_ps(cross_1, _mm256_setzero_ps(), _CMP_GT_OQ),
_mm256_cmp_ps(cross_2, _mm256_setzero_ps(), _CMP_GT_OQ));
// Combine range_mask and ray_mask to get the final valid targets
__m256 valid_mask = _mm256_and_ps(range_mask, ray_mask);
// Store the valid_mask and t_dist_sq results to memory to extract individual elements
float valid_mask_arr[8], t_dist_sq_arr[8];
_mm256_storeu_ps(valid_mask_arr, valid_mask);
_mm256_storeu_ps(t_dist_sq_arr, t_dist_sq);
// Extract results and check closest distance
for (int k = 0; k < 8; ++k) {
int candidate_idx = j + k;
if (valid_mask_arr[k] && t_dist_sq_arr[k] < closest_dist_sq && candidate_idx != i) {
closest_dist_sq = t_dist_sq_arr[k];
closest_idx = candidate_idx;
}
}
}
results[i] = closest_idx;
}
return results;
}
I've written a benchmark for various methods of agent vision https://gist.github.com/AndreVallestero/2f64a124efb9dbc9c061c9b54a93c7b5
I propose using the optimal solution in AnymaEngine
On average, this should perform 10% faster than the old raycast method. The implementation will need to re-add chunking, and you'll need to use unit vectors to track orientation (which should probably be done anyways for better performance and the option of using SIMD in the future).