Fix coalescing of pcmpestri/pcmpestrm instructions

ghost commented 6 years ago

I used cargo-asm to check the generated assembly for the following program:

#[macro_use] extern crate jetscii;

fn main() {
    let part_number = "                                                         86-J52:rev1";
    let first = ascii_chars!('-', ':').find(part_number);
    assert_eq!(first, Some(2));
}

After making PackedCompare<T>::cmpestri and PackedCompare<T>::cmpestrm inline never so that viewing the generated assembly would be easier, I found that the two pcmpestri/pcmpestrm instructions (one to check if a match was found, and the other to handle the found index) were not coalesced.

cargo asm '<jetscii::simd::PackedCompare<T>>::cmpestri' --rust generated:

<jetscii::simd::PackedCompare<T>>::cmpestri (/Users/dtrebbien/Documents/Projects/jetscii/src/simd.rs:148):
 push    rbp
 mov     rbp, rsp
 push    r15
 push    r14
 push    rbx
 sub     rsp, 56
 mov     r14d, edx
 mov     rbx, rdi
 movups  xmm0, xmmword, ptr, [rsi]
 movaps  xmmword, ptr, [rbp, -, 80], xmm0
 lea     rdi, [rbp, -, 64]
 mov     rsi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle
 movaps  xmm0, xmmword, ptr, [rbp, -, 64]
 movaps  xmmword, ptr, [rbp, -, 48], xmm0
 mov     rdi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle_len
 xor     r15d, r15d
 mov     edx, r14d
 movdqa  xmm0, xmmword, ptr, [rbp, -, 48]
 pcmpestri xmm0, xmmword, ptr, [rbp, -, 80], 0
 setb    r15b
 lea     rdi, [rbp, -, 64]
 mov     rsi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle
 movaps  xmm0, xmmword, ptr, [rbp, -, 64]
 movaps  xmmword, ptr, [rbp, -, 48], xmm0
 mov     rdi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle_len
 mov     edx, r14d
 movdqa  xmm0, xmmword, ptr, [rbp, -, 48]
 pcmpestri xmm0, xmmword, ptr, [rbp, -, 80], 0
 movsxd  rdx, ecx
 mov     rax, r15
 add     rsp, 56
 pop     rbx
 pop     r14
 pop     r15
 pop     rbp
 ret

cargo asm '<jetscii::simd::PackedCompare<T>>::cmpestrm' --rust generated:

<jetscii::simd::PackedCompare<T>>::cmpestrm (/Users/dtrebbien/Documents/Projects/jetscii/src/simd.rs:101):
 push    rbp
 mov     rbp, rsp
 push    r15
 push    r14
 push    rbx
 sub     rsp, 72
 mov     r14, rdx
 mov     rbx, rdi
 movups  xmm0, xmmword, ptr, [rsi]
 movaps  xmmword, ptr, [rbp, -, 96], xmm0
 lea     rdi, [rbp, -, 48]
 mov     rsi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle
 movaps  xmm0, xmmword, ptr, [rbp, -, 48]
 movaps  xmmword, ptr, [rbp, -, 80], xmm0
 mov     rdi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle_len
 mov     r15d, eax
 lea     rdi, [rbp, -, 48]
 mov     rsi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle
 movaps  xmm0, xmmword, ptr, [rbp, -, 48]
 movaps  xmmword, ptr, [rbp, -, 64], xmm0
 mov     rdi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle_len
 movdqa  xmm0, xmmword, ptr, [rbp, -, 96]
 mov     esi, eax
 mov     edx, 16
 mov     eax, r15d
 movdqa  xmm1, xmmword, ptr, [rbp, -, 80]
 pcmpestri xmm1, xmm0, 0
 jae     LBB6_1
 mov     edx, 16
 mov     eax, esi
 movdqa  xmm1, xmmword, ptr, [rbp, -, 64]
 pcmpestrm xmm1, xmm0, 0
 movq    rax, xmm0
 mov     ecx, r14d
 shr     rax, cl
 test    rax, rax
 je      LBB6_1
 je      LBB6_4
 bsf     rdx, rax
 jmp     LBB6_6
LBB6_1:
 xor     eax, eax
 jmp     LBB6_7
LBB6_4:
 mov     edx, 64
LBB6_6:
 mov     eax, 1
LBB6_7:
 add     rsp, 72
 pop     rbx
 pop     r14
 pop     r15
 pop     rbp
 ret

With this PR applied, one additional pcmpestri instruction is eliminated:

<jetscii::simd::PackedCompare<T>>::cmpestri (/Users/dtrebbien/Documents/Projects/jetscii/src/simd.rs:139):
 push    rbp
 mov     rbp, rsp
 push    r14
 push    rbx
 sub     rsp, 48
 mov     r14d, edx
 mov     rbx, rdi
 movups  xmm0, xmmword, ptr, [rsi]
 movaps  xmmword, ptr, [rbp, -, 48], xmm0
 lea     rdi, [rbp, -, 64]
 mov     rsi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle
 movaps  xmm0, xmmword, ptr, [rbp, -, 64]
 movaps  xmmword, ptr, [rbp, -, 32], xmm0
 mov     rdi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle_len
 mov     edx, r14d
 movdqa  xmm0, xmmword, ptr, [rbp, -, 32]
 pcmpestri xmm0, xmmword, ptr, [rbp, -, 48], 0
 xor     eax, eax
 cmp     ecx, 16
 setl    al
 movsxd  rdx, ecx
 add     rsp, 48
 pop     rbx
 pop     r14
 pop     rbp
 ret

<jetscii::simd::PackedCompare<T>>::cmpestrm (/Users/dtrebbien/Documents/Projects/jetscii/src/simd.rs:100):
 push    rbp
 mov     rbp, rsp
 push    r14
 push    rbx
 sub     rsp, 48
 mov     r14, rdx
 mov     rbx, rdi
 movups  xmm0, xmmword, ptr, [rsi]
 movaps  xmmword, ptr, [rbp, -, 48], xmm0
 lea     rdi, [rbp, -, 64]
 mov     rsi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle
 movaps  xmm0, xmmword, ptr, [rbp, -, 64]
 movaps  xmmword, ptr, [rbp, -, 32], xmm0
 mov     rdi, rbx
 call    <&'b jetscii::simd::Bytes as jetscii::simd::PackedCompareControl>::needle_len
 mov     edx, 16
 movdqa  xmm0, xmmword, ptr, [rbp, -, 32]
 pcmpestrm xmm0, xmmword, ptr, [rbp, -, 48], 0
 movd    eax, xmm0
 test    ax, ax
 je      LBB6_1
 bsf     cx, ax
 movzx   ecx, cx
 cmp     ecx, 15
 jbe     LBB6_5
 jmp     LBB6_4
LBB6_1:
 mov     cx, 16
 movzx   ecx, cx
 cmp     ecx, 15
 ja      LBB6_4
LBB6_5:
 movzx   eax, ax
 and     r14b, 15
 mov     ecx, r14d
 shr     eax, cl
 test    ax, ax
 je      LBB6_4
 je      LBB6_7
 bsf     ax, ax
 jmp     LBB6_9
LBB6_4:
 xor     eax, eax
 jmp     LBB6_10
LBB6_7:
 mov     ax, 16
LBB6_9:
 movzx   edx, ax
 mov     eax, 1
LBB6_10:
 add     rsp, 48
 pop     rbx
 pop     r14
 pop     rbp
 ret

I ran the benchmarks using rustc 1.30.0-nightly (73c78734b 2018-08-05) on macOS 10.13.6 with a 2.3 GHz Intel Core i7, and the Jetscii benchmarks improve slightly:

Before:

test bench::space_ascii_chars ... bench: 793,830 ns/iter (+/- 38,661) = 6604 MB/s
test bench::space_stdlib_find_char ... bench: 537,295 ns/iter (+/- 119,061) = 9757 MB/s
test bench::space_stdlib_find_char_set ... bench: 7,074,271 ns/iter (+/- 163,336) = 741 MB/s
test bench::space_stdlib_find_closure ... bench: 7,105,007 ns/iter (+/- 354,860) = 737 MB/s
test bench::space_stdlib_find_string ... bench: 3,190,072 ns/iter (+/- 108,691) = 1643 MB/s
test bench::space_stdlib_iterator_position ... bench: 3,162,603 ns/iter (+/- 75,241) = 1657 MB/s
test bench::substring_stdlib_find ... bench: 989,340 ns/iter (+/- 58,210) = 5299 MB/s
test bench::substring_with_created_searcher ... bench: 799,166 ns/iter (+/- 38,097) = 6560 MB/s
test bench::xml_delim_3_ascii_chars ... bench: 795,045 ns/iter (+/- 38,551) = 6594 MB/s
test bench::xml_delim_3_stdlib_find_char_closure ... bench: 7,080,469 ns/iter (+/- 173,507) = 740 MB/s
test bench::xml_delim_3_stdlib_find_char_set ... bench: 7,351,264 ns/iter (+/- 264,844) = 713 MB/s
test bench::xml_delim_3_stdlib_iterator_position ... bench: 4,337,162 ns/iter (+/- 139,146) = 1208 MB/s
test bench::xml_delim_5_ascii_chars ... bench: 804,849 ns/iter (+/- 62,880) = 6514 MB/s
test bench::xml_delim_5_stdlib_find_char_closure ... bench: 7,058,137 ns/iter (+/- 158,827) = 742 MB/s
test bench::xml_delim_5_stdlib_find_char_set ... bench: 12,977,299 ns/iter (+/- 1,896,773) = 404 MB/s
test bench::xml_delim_5_stdlib_iterator_position ... bench: 8,827,361 ns/iter (+/- 297,306) = 593 MB/s

After:

test bench::space_ascii_chars ... bench: 777,229 ns/iter (+/- 35,709) = 6745 MB/s
test bench::space_stdlib_find_char ... bench: 548,953 ns/iter (+/- 123,592) = 9550 MB/s
test bench::space_stdlib_find_char_set ... bench: 7,080,518 ns/iter (+/- 226,482) = 740 MB/s
test bench::space_stdlib_find_closure ... bench: 7,049,567 ns/iter (+/- 181,421) = 743 MB/s
test bench::space_stdlib_find_string ... bench: 3,174,222 ns/iter (+/- 103,375) = 1651 MB/s
test bench::space_stdlib_iterator_position ... bench: 3,154,285 ns/iter (+/- 85,194) = 1662 MB/s
test bench::substring_stdlib_find ... bench: 973,902 ns/iter (+/- 67,721) = 5383 MB/s
test bench::substring_with_created_searcher ... bench: 792,693 ns/iter (+/- 38,922) = 6614 MB/s
test bench::xml_delim_3_ascii_chars ... bench: 794,039 ns/iter (+/- 31,857) = 6602 MB/s
test bench::xml_delim_3_stdlib_find_char_closure ... bench: 7,044,328 ns/iter (+/- 176,813) = 744 MB/s
test bench::xml_delim_3_stdlib_find_char_set ... bench: 7,125,839 ns/iter (+/- 345,773) = 735 MB/s
test bench::xml_delim_3_stdlib_iterator_position ... bench: 4,360,698 ns/iter (+/- 359,530) = 1202 MB/s
test bench::xml_delim_5_ascii_chars ... bench: 794,298 ns/iter (+/- 39,496) = 6600 MB/s
test bench::xml_delim_5_stdlib_find_char_closure ... bench: 7,054,105 ns/iter (+/- 157,787) = 743 MB/s
test bench::xml_delim_5_stdlib_find_char_set ... bench: 13,108,595 ns/iter (+/- 885,809) = 399 MB/s
test bench::xml_delim_5_stdlib_iterator_position ... bench: 8,662,344 ns/iter (+/- 192,292) = 605 MB/s

shepmaster commented 6 years ago

Thanks! This certainly seems better, so I'm definitely going to accept this. However, I swear I checked this (comments to the opposite notwithstanding...). I definitely had some discussion about it.

When executing the PCMPxSTRx instruction, it already sets various CPU flags indicating extra outputs. Ideally, these flags are what the if statement would be using, which should avoid even doing a second comparison.

Any thoughts on such a path?

ghost commented 6 years ago

As PackedCompare<T>::cmpestri seems pretty much the same to me in the "before" vs. "after" assembly (except for one fewer PCMPESTRI instruction), I think that it's best to focus on PackedCompare<T>::cmpestrm, because that's where I see an unfortunate side effect of this change.

As I understand it, the PCMPESTRM instruction sets the CFlag to 0 if there wasn't a match, and the _mm_cmpestrc() intrinsic represents the value of the CFlag after a PCMPESTRM instruction. We can see this in the "before" assembly of PackedCompare<T>::cmpestrm; this code:

        let found = _mm_cmpestrc(
            self.0.needle(),
            self.0.needle_len(),
            haystack,
            BYTES_PER_OPERATION as i32,
            T::CONTROL_BYTE,
        );
        //...

        if found != 0 {
            //...
        } else {
            None
        }

.. was compiled to:

 pcmpestri xmm1, xmm0, 0 ; this computes `found` by way of the CFlag
 jae     LBB6_1
 ;...
LBB6_1:
 ;...

The JAE instruction causes control flow to jump over the second PCMPESTRM instruction (which computes mask) in the event that CFlag is 0.

In the "after" assembly, the PCMPESTRI & JAE instructions became:

 pcmpestrm xmm0, xmmword, ptr, [rbp, -, 48], 0
 movd    eax, xmm0
 test    ax, ax
 je      LBB6_1

In the event that there wasn't a match, the MOVD instruction is somewhat wasteful because it copies data from an XMM register to EAX, as opposed to just checking the CFlag. If there were a way to access the CFlag and output XMM register of the PCMPESTRM instruction through an instrinsic, that would be ideal.

In trying to find out how to access the EFLAGS register in Rust stdsimd, I found rust-lang-nursery/stdsimd#485, which concerns the now-deprecated __readeflags() and __writeeflags() intrinsics. @rkruppe makes the cogent point that it's possible for LLVM to move code before the PUSHFQ & POPQ instructions generated by a __readeflags() intrinsic; therefore, LLVM might move EFLAGS-clobbering code between the PCMPESTRM instruction generated by an _mm_cmpestrm() instrinsic and the PUSHFQ & POPQ instructions generated by a __readeflags() intrinsic.

However, even if __readeflags() were safe to use, I think that the PUSHFQ & POPQ instructions are also less than ideal.

I think that the only way to get a PCMPESTRM instruction immediately followed by a JAE instruction is to use inline assembly, but this is not allowed on Rust stable.

Fortunately, it looks like PackedCompare<T>::cmpestrm is called at most once per find operation, and only when the haystack is not aligned.

ghost commented 6 years ago

By the way, with respect to the re-licensing issue (#12), I hereby license this contribution under the dual MIT/Apache-2.0 license, allowing licensees to choose either at their option.

shepmaster commented 6 years ago

By the way, with respect to the re-licensing issue (#12),

Haha, I had totally forgotten about that. Thank you for reminding me 😅

shepmaster commented 6 years ago

Thank you for that wonderfully deep exploration!

get a PCMPESTRM instruction immediately followed by a JAE instruction is to use inline assembly

The only reason I know about this is because that's what the code used to do, but your benchmarks show that this PR is definitely faster than what we have now, so forward!

shepmaster commented 6 years ago

I wonder if it is worth raising a stdsimd issue that back-to-back calls of the same instruction weren't coalesced...

shepmaster commented 6 years ago

I'm going to ping the people on the relicense issue to try and update that for the next release, but if I don't hear from them in a few days or so, I'll release a new version with the current license. Please feel free to poke me if I forget (a common occurrence)!

ghost commented 6 years ago

Thank you, Jake.

I wonder if it is worth raising a stdsimd issue that back-to-back calls of the same instruction weren't coalesced...

I think so. I'll try to put together a minimal test case with the output from rustc stable and nightly, as well as what Clang and maybe GCC do in similar code.

ghost commented 6 years ago

Okay, so here's what I found:

I prepared a Rust test case:

// rustc -C opt-level=3 -o main_rustc main.rs && ./main_rustc
// rustc -C opt-level=3 --emit asm -o main_rustc.s main.rs

use std::arch::x86_64::*;

union XmmTransmute {
    epi8: [i8; 16],
    xmm: __m128i,
}

fn main() {
    let haystack = "the quick brown fox jumps over a lazy dog\u{0}";

    unsafe {
        let haystack = haystack.get_unchecked(..16);
        println!("haystack = '{}'", haystack);

        let needle_xmm = XmmTransmute { epi8: ['a' as i8, 'e' as i8, 'i' as i8, 'o' as i8, 'u' as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }.xmm;
        let haystack_xmm = _mm_loadu_si128(haystack.as_ptr() as *const __m128i);

        println!("\nTrying PCMPESTRM...");
        let found = _mm_cmpestrc(needle_xmm, 5, haystack_xmm, 16, 0);
        let mask_xmm = _mm_cmpestrm(needle_xmm, 5, haystack_xmm, 16, 0);
        if found != 0 {
            let mask = _mm_extract_epi32(mask_xmm, 0) as u16;
            println!("PCMPESTRM found {} vowel(s). mask = 0b{:016b}", mask.count_ones(), mask);
        } else {
            println!("No vowel was found by PCMPESTRM.");
        }

        println!("\nTrying PCMPISTRM...");
        let found = _mm_cmpistrc(needle_xmm, haystack_xmm, 0);
        let mask_xmm = _mm_cmpistrm(needle_xmm, haystack_xmm, 0);
        if found != 0 {
            let mask = _mm_extract_epi32(mask_xmm, 0) as u16;
            println!("PCMPISTRM found {} vowel(s). mask = 0b{:016b}", mask.count_ones(), mask);
        } else {
            println!("No vowel was found by PCMPISTRM.");
        }
    }
}

.. and a functionally equivalent C++ version:

// g++-8 -msse4 -O2 -o main_gcc8 main.cpp && ./main_gcc8
// g++-8 -msse4 -O2 -S -o main_gcc8.s main.cpp

// export PATH=$(brew --prefix)/opt/llvm/bin:$PATH
// clang++ -msse4 -O2 -o main_clang6 main.cpp && ./main_clang6
// clang++ -msse4 -O2 -S -o main_clang6.s main.cpp

#include <bitset>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <string>

#include <smmintrin.h>

static const char needle_bytes[16] __attribute__((aligned(16))) = { 'a', 'e', 'i', 'o', 'u', 0 };

int main() {
    std::string haystack = "the quick brown ";
    assert(haystack.length() == 16);
    std::cout << "haystack = '" << haystack << "'\n";

    __m128i needle_xmm = _mm_load_si128((const __m128i *) &needle_bytes[0]);
    __m128i haystack_xmm = _mm_loadu_si128((const __m128i *) haystack.data());

    int found;
    __m128i mask_xmm;

    std::cout << "\nTrying PCMPESTRM...\n";
    found = _mm_cmpestrc(needle_xmm, 5, haystack_xmm, 16, 0);
    mask_xmm = _mm_cmpestrm(needle_xmm, 5, haystack_xmm, 16, 0);
    if (found != 0) {
        int mask = _mm_extract_epi32(mask_xmm, 0);
        std::cout << "PCMPESTRM found " << _mm_popcnt_u32(mask) << " vowel(s). mask = 0b" << std::bitset<16>(mask) << '\n';
    } else {
        std::cout << "No vowel was found by PCMPESTRM.\n";
    }

    std::cout << "\nTrying PCMPISTRM...\n";
    found = _mm_cmpistrc(needle_xmm, haystack_xmm, 0);
    mask_xmm = _mm_cmpistrm(needle_xmm, haystack_xmm, 0);
    if (found != 0) {
        int mask = _mm_extract_epi32(mask_xmm, 0);
        std::cout << "PCMPISTRM found " << _mm_popcnt_u32(mask) << " vowel(s). mask = 0b" << std::bitset<16>(mask) << '\n';
    } else {
        std::cout << "No vowel was found by PCMPISTRM.\n";
    }

    return EXIT_SUCCESS;
}

GCC 8.2.0 was able to coalesce the PCMPxSTRx instructions; it turned the _mm_cmpestrc()/_mm_cmpestrm() intrinsic calls to:

    pcmpestrm   $0, %xmm1, %xmm2
    jnc L27

.. and the _mm_cmpistrc()/_mm_cmpistrm() intrinsic calls to:

    pcmpistrm   $0, 16(%rsp), %xmm4
    jnc L29

I was not able to get rustc to inline the intrinsics in the assembly. However, I found that LLVM 6.0.1 does not coalesce the intrinsics. E.g.:

    pcmpestri   $0, %xmm0, %xmm1
    jae LBB0_18
## %bb.10:
    movl    $5, %eax
    movl    $16, %edx
    pcmpestrm   $0, %xmm0, %xmm1

I also found LLVM Issue 37246 - Poor lowering of SSE4.2 string intrinsics, so this is apparently a known issue.

Because the instructions are not coalesced by LLVM, I am not sure what Rust stdsimd can do.

shepmaster commented 6 years ago

Because the instructions are not coalesced by LLVM, I am not sure what Rust stdsimd can do.

It still seems worth filing this information in rust-lang/rust or stdsimd. People are unlikely to look here for such information, and it does seem like a "bug" in that the nice code isn't as fast as we'd expect.

shepmaster commented 6 years ago

I was not able to get rustc to inline the instrinsics in the assembly

You may want to try with -C target-cpu=native.

ghost commented 6 years ago

You may want to try with -C target-cpu=native.

Yes! That worked!

Here is the assembly generated by rustc 1.28.0 (9634041f0 2018-07-30):

    vpcmpestri  $0, %xmm1, %xmm0
    jae LBB5_2
    movl    $5, %eax
    movl    $16, %edx
    vpcmpestrm  $0, %xmm1, %xmm0

shepmaster commented 6 years ago

Yes! That worked!

By default, Rust will target a very compatible CPU, inserting shims that perform a one-time runtime check for the current CPU capabilities. This is why I've structured Jetscii the way I have, with the magic attribute on every function in the simd module and a little compile-time check to see if we need the runtime check.

ghost commented 6 years ago

I just created rust-lang/rust#53232

shepmaster / jetscii

Fix coalescing of pcmpestri/pcmpestrm instructions #24