llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
26.85k stars 11.01k forks source link

[AVX512] Replace x86_avx512_mask_pmov_* register intrinsics with generic expansion #97694

Open RKSimon opened 6 days ago

RKSimon commented 6 days ago

We already expand basic truncation intrinsics to the trunc+shuffle sequence:

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtepi64_epi8 (__m128i __A)
{
  return (__m128i)__builtin_shufflevector(
      __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
}

But the predicate variants are not, as its tricky to correctly merge the predicate select of the lower elements and the zeroing of the upper elements:

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
              (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
{
  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
              (__v16qi) _mm_setzero_si128 (),
              __M);
}

So this is likely to require some improvements to the DAG backend as well as clang frontend

llvmbot commented 6 days ago

@llvm/issue-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

We already expand basic truncation intrinsics to the trunc+shuffle sequence: ```cpp static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtepi64_epi8 (__m128i __A) { return (__m128i)__builtin_shufflevector( __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); } ``` But the predicate variants are not, as its tricky to correctly merge the predicate select of the lower elements and the zeroing of the upper elements: ``` static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, (__v16qi) __O, __M); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, (__v16qi) _mm_setzero_si128 (), __M); } ``` So this is likely to require some improvements to the DAG backend as well as clang frontend