servo / euclid

Geometry primitives (basic linear algebra) for Rust
Other
459 stars 102 forks source link

Make Box2d::contains and Box2d intersects faster #526

Closed nical closed 4 months ago

nical commented 4 months ago

After stumbling upon https://www.romainguy.dev/posts/2024/down-a-rabbit-hole/ which applies the same optimization to equivalent Kotlin code I gave it a try in euclid and it indeed makes for quite a nice speedup:

Chaining simple && conditions produces branch instructions in some cases. Replacing the logical && with bitwise & to ensure the compiler does not produce branches makes a big (70~80%) performance difference.

Generated assembly

intersects before:

.section .text.euclid::_intersects,"ax",@progbits
    .globl  euclid::_intersects
    .p2align    4, 0x90
    .type   euclid::_intersects,@function
euclid::_intersects:
    .cfi_startproc
    movss xmm0, dword ptr [rsi + 8]
    ucomiss xmm0, dword ptr [rdi]
    jbe .LBB0_4
    movss xmm0, dword ptr [rdi + 8]
    ucomiss xmm0, dword ptr [rsi]
    jbe .LBB0_4
    movss xmm0, dword ptr [rsi + 12]
    ucomiss xmm0, dword ptr [rdi + 4]
    jbe .LBB0_4
    movss xmm0, dword ptr [rdi + 12]
    ucomiss xmm0, dword ptr [rsi + 4]
    seta al
    ret
.LBB0_4:
    xor eax, eax
    ret

intersects after:

.section .text.euclid::_intersects,"ax",@progbits
    .globl  euclid::_intersects
    .p2align    4, 0x90
    .type   euclid::_intersects,@function
euclid::_intersects:
    .cfi_startproc
    movsd xmm0, qword ptr [rdi + 8]
    movhps xmm0, qword ptr [rsi + 8]
    movups xmm1, xmmword ptr [rsi]
    movhps xmm1, qword ptr [rdi]
    cmpnltps xmm1, xmm0
    movmskps eax, xmm1
    test eax, eax
    sete al
    ret

contains before:

.section .text.euclid::box2d_contains_f32,"ax",@progbits
    .globl  euclid::box2d_contains_f32
    .p2align    4, 0x90
    .type   euclid::box2d_contains_f32,@function
euclid::box2d_contains_f32:
    .cfi_startproc
    xor eax, eax
    ucomiss xmm0, dword ptr [rdi]
    jb .LBB3_4
    movss xmm2, dword ptr [rdi + 8]
    ucomiss xmm2, xmm0
    jbe .LBB3_4
    ucomiss xmm1, dword ptr [rdi + 4]
    jb .LBB3_4
    movss xmm0, dword ptr [rdi + 12]
    ucomiss xmm0, xmm1
    seta al
.LBB3_4:
    ret

contains after:

.section .text.euclid::box2d_contains_f32,"ax",@progbits
    .globl  euclid::box2d_contains_f32
    .p2align    4, 0x90
    .type   euclid::box2d_contains_f32,@function
euclid::box2d_contains_f32:

    .cfi_startproc
    ucomiss xmm1, dword ptr [rdi + 4]
    setae cl
    movss xmm2, dword ptr [rdi + 12]
    ucomiss xmm2, xmm1
    seta al
    movss xmm1, dword ptr [rdi]
    movss xmm2, dword ptr [rdi + 8]
    cmpleps xmm1, xmm0
    cmpltps xmm0, xmm2
    andps xmm0, xmm1
    movd edx, xmm0
    and al, cl
    and al, dl
    ret

Benchmark code

Results in code comments.

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use euclid::default::*;

fn pseudo_rand(state: &mut f32) -> f32 {
    let seed = 7.0;
    let x = *state;
    *state += 1.0;

    f32::fract(f32::tan((x * 1.61803398874989484820459 - x).abs() * seed) * x)
}

// Before:
//   box2d intersects f32    time:   [246.45 ms 246.60 ms 246.75 ms]
// After:
//   box2d intersects f32    time:   [70.162 ms 70.189 ms 70.218 ms]                                 
//   change: [-71.558% -71.537% -71.516%] (p = 0.00 < 0.05)
pub fn box2d_intersects_f32(c: &mut Criterion) {
    let mut boxes = Vec::with_capacity(10_000);

    let mut rand_state = 1.0;
    for _ in 0..10_000 {
        let x0 = pseudo_rand(&mut rand_state) * 2000.0;
        let x1 = pseudo_rand(&mut rand_state) * 2000.0;
        let y0 = pseudo_rand(&mut rand_state) * 2000.0;
        let y1 = pseudo_rand(&mut rand_state) * 2000.0;
        boxes.push(Box2D {
            min: Point2D::new(x0.min(x1), y0.min(y1)),
            max: Point2D::new(x0.max(x1), y0.max(y1)),
        });
    }

    c.bench_function("box2d intersects f32", |b| b.iter(|| {
        for a in boxes.iter() {
            for b in boxes.iter() {
                black_box(a.intersects(b));
            }
        }
    }));
}

// Before:
//   box2d contains f32      time:   [367.26 ms 367.50 ms 367.76 ms]
// After:
//   box2d contains f32      time:   [52.898 ms 52.940 ms 52.986 ms]                               
//   change: [-85.609% -85.595% -85.579%] (p = 0.00 < 0.05)
pub fn box2d_contains_f32(c: &mut Criterion) {
    let mut boxes = Vec::with_capacity(10_000);
    let mut points = Vec::with_capacity(10_000);

    let mut rand_state = 1.0;
    for _ in 0..10_000 {
        let x0 = pseudo_rand(&mut rand_state) * 2000.0;
        let x1 = pseudo_rand(&mut rand_state) * 2000.0;
        let y0 = pseudo_rand(&mut rand_state) * 2000.0;
        let y1 = pseudo_rand(&mut rand_state) * 2000.0;
        boxes.push(Box2D {
            min: Point2D::new(x0.min(x1), y0.min(y1)),
            max: Point2D::new(x0.max(x1), y0.max(y1)),
        });
        let x2 = pseudo_rand(&mut rand_state) * 2000.0;
        let y2 = pseudo_rand(&mut rand_state) * 2000.0;
        points.push(Point2D::new(x2, y2));
    }

    c.bench_function("box2d contains f32", |b| b.iter(|| {
        for a in boxes.iter() {
            for b in points.iter() {
                black_box(a.contains(*b));
            }
        }
    }));
}

criterion_group!(box2d, box2d_intersects_f32, box2d_contains_f32);
criterion_main!(box2d);
nical commented 4 months ago

The speedup for contains_box:

Before:

box2d contains_box f32  time:   [436.69 ms 436.97 ms 437.26 ms]

After:

box2d contains_box f32  time:   [244.31 ms 244.51 ms 244.72 ms]                                   
change: [-44.102% -44.044% -43.981%] (p = 0.00 < 0.05)