Optimize agent vision - Githubissues

I've written a benchmark for various methods of agent vision https://gist.github.com/AndreVallestero/2f64a124efb9dbc9c061c9b54a93c7b5

I propose using the optimal solution in AnymaEngine

int* test_anglecheck_cross(Agent* agents) {
    int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);

    for (int i = 0; i < NUM_AGENTS; i++) {
        Agent agent = agents[i];
        float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
        int closest_idx = -1;

        // Create left and right
        float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
        rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
        rotate_unit_vector(agent.angle_x, agent.angle_y, RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);

        for (int j = 0; j < NUM_AGENTS; j++) {
            if (i == j) continue;

            // Get relative vector from agent to target
            Agent target = agents[j];
            float tx = target.x - agent.x;
            float ty = target.y - agent.y;

            // Skip if out of range, or could not be the closest
            float t_dist_sq = tx * tx + ty * ty;
            if (t_dist_sq > AGENT_RANGE * AGENT_RANGE || t_dist_sq >= closest_dist_sq) continue; 

            // Skip if to the left of the left ray
            float cross_1 = left_bound_x * ty - left_bound_y * tx;
            if (cross_1 < 0) continue;

            // Skip if to the right of the right ray
            float cross_2 = tx * right_bound_y - ty * right_bound_x;
            if (cross_2 < 0) continue;

            // Update closest
            closest_dist_sq = t_dist_sq;
            closest_idx = j;
        }

        results[i] = closest_idx;
    }
    return results;
}

On average, this should perform 10% faster than the old raycast method. The implementation will need to re-add chunking, and you'll need to use unit vectors to track orientation (which should probably be done anyways for better performance and the option of using SIMD in the future).

Here's the SIMD version. It's 3x faster than my solution above, but it'll require the entire engine to be rewritten to use an ECS so the data is all contiguous

int* test_anglecheck_cross_simd(World* world) {
    int* results = (int*)malloc(sizeof(int) * NUM_AGENTS);

    #pragma omp parallel for 
    for (int i = 0; i < NUM_AGENTS; i++) {
        int closest_idx = -1;
        float closest_dist_sq = AGENT_RANGE * AGENT_RANGE; // Use squared distance for comparison
        __m256 closest_dist_sq_v = _mm256_set1_ps(closest_dist_sq);

        // Create left and right boundary vectors
        float left_bound_x, left_bound_y, right_bound_x, right_bound_y;
        rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[0][0], RAYS[0][1], &left_bound_x, &left_bound_y);
        rotate_unit_vector(world->agent_angle_x[i], world->agent_angle_y[i], RAYS[3][0], RAYS[3][1], &right_bound_x, &right_bound_y);

        // Iterate over agents in chunks of 8 (assuming AVX which handles 8 floats at once)
        for (int j = 0; j < NUM_AGENTS; j += 8) {
            // Load target agent positions (x and y)
            __m256 target_x = _mm256_loadu_ps(&world->agent_x[j]);  // Load 8 floats starting from agents[j].x
            __m256 target_y = _mm256_loadu_ps(&world->agent_y[j]);  // Load 8 floats starting from agents[j].y

            // Compute relative vector: tx = target.x - agent.x, ty = target.y - agent.y
            __m256 tx = _mm256_sub_ps(target_x, _mm256_set1_ps(world->agent_x[i]));
            __m256 ty = _mm256_sub_ps(target_y, _mm256_set1_ps(world->agent_y[i]));

            // Compute squared distance: t_dist_sq = tx * tx + ty * ty
            __m256 t_dist_sq = _mm256_fmadd_ps(tx, tx, _mm256_mul_ps(ty, ty)); // fused multiply add

            // Mask for agents within range
            __m256 range_mask = _mm256_cmp_ps(t_dist_sq, _mm256_set1_ps(AGENT_RANGE * AGENT_RANGE), _CMP_LE_OQ);

            // Compute cross products for boundary checks
            __m256 cross_1 = _mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(left_bound_x), ty),
                                           _mm256_mul_ps(_mm256_set1_ps(left_bound_y), tx));
            __m256 cross_2 = _mm256_sub_ps(_mm256_mul_ps(tx, _mm256_set1_ps(right_bound_y)),
                                           _mm256_mul_ps(ty, _mm256_set1_ps(right_bound_x)));

            // Mask for agents between the rays (cross_1 > 0 and cross_2 > 0)
            __m256 ray_mask = _mm256_and_ps(_mm256_cmp_ps(cross_1, _mm256_setzero_ps(), _CMP_GT_OQ),
                                            _mm256_cmp_ps(cross_2, _mm256_setzero_ps(), _CMP_GT_OQ));

            // Combine range_mask and ray_mask to get the final valid targets
            __m256 valid_mask = _mm256_and_ps(range_mask, ray_mask);

            // Store the valid_mask and t_dist_sq results to memory to extract individual elements
            float valid_mask_arr[8], t_dist_sq_arr[8];
            _mm256_storeu_ps(valid_mask_arr, valid_mask);
            _mm256_storeu_ps(t_dist_sq_arr, t_dist_sq);

            // Extract results and check closest distance
            for (int k = 0; k < 8; ++k) {
                int candidate_idx = j + k;
                if (valid_mask_arr[k] && t_dist_sq_arr[k] < closest_dist_sq && candidate_idx != i) {
                    closest_dist_sq = t_dist_sq_arr[k];
                    closest_idx = candidate_idx;
                }
            }
        }
        results[i] = closest_idx;
    }

    return results;
}

buddhaman / AnymaEngine

Optimize agent vision #1