llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
27.56k stars 11.32k forks source link

x86 SSE4.2 suboptimal codegen: `_mm_cmpestrc` is not fused with `_mm_cmpestri` or `_mm_cmpestrm` #96463

Open AlexGuteniev opened 1 month ago

AlexGuteniev commented 1 month ago

The following C++ code:

#include <immintrin.h>

// Reproducer 1: the _mm_cmpestrc check guards a _mm_cmpestri call that is
// given exactly the same operands, lengths (2 and 8) and mode flags.
// Reported codegen: two pcmpestri instructions are emitted for this function;
// the expectation is a single pcmpestri that serves both the branch condition
// and the returned value.
int test1(__m128i needle, __m128i haystack) {
    if (_mm_cmpestrc(needle, 2, haystack, 8, _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK))
    {
        // Identical arguments to the condition above, so the same comparison
        // is requested a second time.
        // NOTE(review): despite the name `bitmask`, _mm_cmpestri presumably
        // yields an index rather than a mask -- confirm against the Intel
        // Intrinsics Guide.
        int bitmask = _mm_cmpestri(needle, 2, haystack, 8, _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK);
        return bitmask;
    }
    else
    {
        // Condition was false: return 0 without issuing the second intrinsic.
        return 0;
    }
} 

// Reproducer 2: same shape as test1, but the guarded call is _mm_cmpestrm,
// whose __m128i result is narrowed to an int with _mm_cvtsi128_si32.
// Reported codegen: a pcmpestri (for the condition) followed by a pcmpestrm;
// the expectation is a single pcmpestrm that serves both the branch condition
// and the returned value.
int test2(__m128i needle, __m128i haystack) {
    if (_mm_cmpestrc(needle, 2, haystack, 8, _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK))
    {
        // Identical operands, lengths and mode flags to the condition above;
        // only the intrinsic differs (cmpestrm instead of cmpestrc).
        int bitmask = _mm_cvtsi128_si32(_mm_cmpestrm(needle, 2, haystack, 8, _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK));
        return bitmask;
    }
    else
    {
        // Condition was false: return 0 without issuing the second intrinsic.
        return 0;
    }
} 

Produces two `pcmpestri` instructions for `test1`, and a `pcmpestri` followed by a `pcmpestrm` for `test2`. Demo: https://godbolt.org/z/GE783h8dz

I expect `_mm_cmpestrc` to be fused with the other intrinsic, so that there is one `pcmpestri` in `test1` and one `pcmpestrm` in `test2`. (MSVC does this fusion for `test1`, and gcc does this fusion for both `test1` and `test2`.)

llvmbot commented 1 month ago

@llvm/issue-subscribers-backend-x86

Author: Alex Guteniev (AlexGuteniev)

The following C++ code: ```c++ #include <immintrin.h> int test1(__m128i needle, __m128i haystack) { if (_mm_cmpestrc(needle, 2, haystack, 8, _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK)) { int bitmask = _mm_cmpestri(needle, 2, haystack, 8, _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK); return bitmask; } else { return 0; } } int test2(__m128i needle, __m128i haystack) { if (_mm_cmpestrc(needle, 2, haystack, 8, _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK)) { int bitmask = _mm_cvtsi128_si32(_mm_cmpestrm(needle, 2, haystack, 8, _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK)); return bitmask; } else { return 0; } } ``` Produces two `pcmpestri` instructions for `test1`, and a `pcmpestri` followed by a `pcmpestrm` for `test2`. Demo: https://godbolt.org/z/GE783h8dz I expect that `_mm_cmpestrc` is fused with the other intrinsic, so there's one `pcmpestri` in `test1` and one `pcmpestrm` in `test2`. (MSVC does this fusion for `test1`, and gcc does this fusion for both `test1` and `test2`.)