ReduceMax_4xU8 test program fails on s390x when compiled with Clang 18.1.6

johnplatts commented 6 months ago

Here is a C test program that produces an incorrect result when compiled for the s390x-linux-gnu target with Clang 18.1.6 (using the -O2 -mzvector options), but passes when compiled for the same target with the same options using GCC 12.3.0:

#include <stdio.h>
#include <stdlib.h>

#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#if defined(__s390x__)
#include <vecintrin.h>
#else
#include <altivec.h>
#endif

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

/*
 * ALTIVEC_TEST_ASSERT(cond): abort the program with a diagnostic when `cond`
 * evaluates to false.
 *
 * The message (condition text, line number and file name) is written to
 * stdout with printf() before abort() is called. `cond` is evaluated exactly
 * once.
 *
 * The condition text is passed through "%s" rather than being pasted into the
 * format string, so a condition containing '%' (e.g. `n % 4 == 0`) cannot be
 * misread as a conversion specification. The loop is terminated with
 * `while (0)` because this file does not include <stdbool.h>, so `false` is
 * not guaranteed to be declared.
 */
#define ALTIVEC_TEST_ASSERT(cond)                                     \
  do {                                                                \
    if (!(cond)) {                                                    \
      printf("Assertion %s failed at line %d of %s\n", #cond,         \
             __LINE__, __FILE__);                                     \
      abort();                                                        \
    }                                                                 \
  } while (0)

/*
 * ALTIVEC_TEST_UNLIKELY(cond): evaluates to 1 if `cond` is true and 0
 * otherwise, while hinting to the compiler that `cond` is expected to be
 * false.
 *
 * `cond` is normalised with !!(cond) *before* it is handed to
 * __builtin_expect, whose first parameter has type long. Passing the raw
 * condition would convert it to long first, which turns values such as 0.5
 * into 0 and rejects or warns on pointer conditions.
 */
#define ALTIVEC_TEST_UNLIKELY(cond) (__builtin_expect(!!(cond), 0) != 0)

/*
 * Widens bytes of `v` to unsigned 16-bit lanes by interleaving them with zero
 * bytes via vec_mergeh().
 *
 * The operand order of the merge depends on byte order so that the zero byte
 * always lands in the most significant half of each 16-bit lane: on
 * little-endian targets the data byte comes first, on big-endian targets the
 * zero byte comes first.
 */
static inline __attribute__((__always_inline__)) __vector unsigned short
PromoteU8ToU16(__vector unsigned char v) {
  const __vector unsigned char zero = vec_splats((unsigned char)0);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return (__vector unsigned short)vec_mergeh(v, zero);
#else
  return (__vector unsigned short)vec_mergeh(zero, v);
#endif
}

/*
 * Returns the maximum of the four unsigned 16-bit lanes in the first 64-bit
 * half of `v3210` (the value is read back from lane 0).
 *
 * The reduction takes two steps:
 *   1. max(v, v with its four lanes reversed)  -> lane i = max(lane i, lane 3-i)
 *   2. max(step 1, step 1 with adjacent lanes swapped)
 * after which every lane of a 64-bit half holds the maximum of that half.
 *
 * NOTE: this is the sequence that is miscompiled in the report; the vec_perm
 * below is the call that the s390x backend drops.
 */
static inline __attribute__((__always_inline__)) unsigned short ReduceMax_4xU16(
    __vector unsigned short v3210) {
  /* Byte indices that reverse the order of the four 16-bit lanes inside each
     64-bit half (bytes 6,7 first, then 4,5, then 2,3, then 0,1). */
  const __vector unsigned char kReverse4Shuffle = {
      6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9};
  const __vector unsigned short v0123 =
      vec_perm(v3210, v3210, kReverse4Shuffle);
  /* Lane-wise max of the input and its reversed copy. */
  const __vector unsigned short v03_12_12_03 = vec_max(v3210, v0123);
  /* Rotating each 32-bit element left by 16 swaps its two 16-bit lanes. */
  const __vector unsigned short v12_03_03_12 = (__vector unsigned short)vec_rl(
      (__vector unsigned int)v03_12_12_03, vec_splats(16u));
  const __vector unsigned short max_val = vec_max(v03_12_12_03, v12_03_03_12);
  /* Every lane of the first half now holds the same maximum; take lane 0. */
  return max_val[0];
}

/*
 * Returns the result of ReduceMax_4xU16() applied to the bytes of `v` after
 * they have been widened to 16-bit lanes by PromoteU8ToU16(), narrowed back
 * to unsigned char.
 */
static inline __attribute__((__always_inline__)) unsigned char ReduceMax_4xU8(
    __vector unsigned char v) {
  const __vector unsigned short widened = PromoteU8ToU16(v);
  const unsigned short widest = ReduceMax_4xU16(widened);
  return (unsigned char)widest;
}

/*
 * Runs ReduceMax_4xU8() on a vector whose bytes repeat {0, 2, 4, 3} and
 * aborts (after printing the actual value) unless the result is 4.
 *
 * The input is OR-ed with a splat of `non_elided_zero`. That variable is
 * explicitly initialised to 0 and then passed through an empty asm statement
 * as a read-write ("+r") operand, so the compiler can no longer treat it as a
 * known constant and cannot fold the whole reduction at compile time. The
 * initialisation matters: a "+r" operand is read by the asm, so leaving the
 * variable uninitialised would feed an indeterminate value into the test
 * input.
 */
static __attribute__((__noinline__)) void DoTestReduceMax4xU8(void) {
  const __vector unsigned char kInputVals = {0, 2, 4, 3, 0, 2, 4, 3,
                                             0, 2, 4, 3, 0, 2, 4, 3};

  unsigned long long non_elided_zero = 0;
  /* Optimisation barrier: hides the value of non_elided_zero. */
  __asm__("" : "+r"(non_elided_zero)::);

  unsigned char actual = ReduceMax_4xU8(
      kInputVals | ((__vector unsigned char)vec_splats(non_elided_zero)));
  if (actual != 4) {
    printf("Actual result of ReduceMax_4xU8: %u\n", (unsigned)actual);
    abort();
  }
}

/*
 * Entry point: announces the test on stdout, runs DoTestReduceMax4xU8()
 * (which aborts on failure), then reports success. Command-line arguments
 * are not used.
 */
int main(int argc, char** argv) {
  (void)argc;
  (void)argv;

  fputs("Running test\n", stdout);
  DoTestReduceMax4xU8();
  fputs("Test completed successfully\n", stdout);

  return EXIT_SUCCESS;
}

The above program compiles and runs correctly when compiled for the powerpc64-linux-gnu and powerpc64le-linux-gnu targets with the -O2 -mcpu=power8 options with Clang 18.1.6.

Here is the output that is generated by the above program when compiled for s390x-linux-gnu with the -O2 -mzvector options with Clang 18.1.6:

Running test
Actual result of ReduceMax_4xU8: 2
Aborted (core dumped)

Here is the expected output of the above test program:

Running test
Test completed successfully

johnplatts commented 6 months ago

Here is the generated LLVM IR of ReduceMax_4xU8 that causes DoTestReduceMax4xU8() to fail:

target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
target triple = "s390x-unknown-linux-gnu"

; Function Attrs: nofree nosync nounwind memory(none)
; ReduceMax_4xU8 after inlining of PromoteU8ToU16 and ReduceMax_4xU16.
define dso_local zeroext i8 @ReduceMax_4xU8(<16 x i8> noundef %0) local_unnamed_addr #0 {
  ; Interleave zero bytes with the first eight bytes of %0 (mask 0,16,1,17,...).
  ; With the big-endian data layout above, each resulting i16 lane is a
  ; zero-extended input byte.
  %2 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i8> %0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %3 = bitcast <16 x i8> %2 to <8 x i16>
  ; Byte permute that reverses the four i16 lanes within each 64-bit half.
  ; This is the call that goes missing from the generated assembly.
  %4 = tail call <16 x i8> @llvm.s390.vperm(<16 x i8> %2, <16 x i8> %2, <16 x i8> <i8 6, i8 7, i8 4, i8 5, i8 2, i8 3, i8 0, i8 1, i8 14, i8 15, i8 12, i8 13, i8 10, i8 11, i8 8, i8 9>)
  %5 = bitcast <16 x i8> %4 to <8 x i16>
  ; First reduction step: lane-wise unsigned max of the vector and its reversal.
  %6 = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %3, <8 x i16> %5)
  %7 = bitcast <8 x i16> %6 to <4 x i32>
  ; fshl with identical operands is a rotate left; rotating each i32 by 16
  ; swaps its two i16 halves.
  %8 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %7, <4 x i32> %7, <4 x i32> <i32 16, i32 16, i32 16, i32 16>)
  %9 = bitcast <4 x i32> %8 to <8 x i16>
  ; Second reduction step.
  %10 = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %6, <8 x i16> %9)
  %11 = bitcast <8 x i16> %10 to <16 x i8>
  ; Byte 1 is the low-order byte of i16 lane 0 in big-endian layout.
  %12 = extractelement <16 x i8> %11, i64 1
  ret i8 %12
}

; Function Attrs: nofree nosync nounwind memory(none)
declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>) #1

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #2

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) #3

attributes #0 = { nofree nosync nounwind memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="z13" "target-features"="+transactional-execution,+vector" }
attributes #1 = { nofree nosync nounwind memory(none) }
attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

The test failure goes away if the following call in ReduceMax_4xU8: %4 = tail call <16 x i8> @llvm.s390.vperm(<16 x i8> %2, <16 x i8> %2, <16 x i8> <i8 6, i8 7, i8 4, i8 5, i8 2, i8 3, i8 0, i8 1, i8 14, i8 15, i8 12, i8 13, i8 10, i8 11, i8 8, i8 9>) is replaced with the equivalent shufflevector: %4 = shufflevector <16 x i8> %2, <16 x i8> %2, <16 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9>

johnplatts commented 6 months ago

Here is the s390x assembly code that is generated when the above LLVM IR is compiled (with @llvm.s390.vperm):

    .text
    .file   "s390x_zvector_reducemax_4xu8_test_051724_4_routines.ll"
    .globl  ReduceMax_4xU8                  # -- Begin function ReduceMax_4xU8
    .p2align    4
    .type   ReduceMax_4xU8,@function
# ReduceMax_4xU8 as generated from the IR that calls @llvm.s390.vperm.
# Note that no vperm instruction is emitted and vmxlh appears only once:
# the permute and the first max step are absent from this code.
ReduceMax_4xU8:                         # @ReduceMax_4xU8
.LReduceMax_4xU8$local:
    .type   .LReduceMax_4xU8$local,@function
# %bb.0:
    # Zero-extend the leftmost eight bytes of %v24 to halfwords.
    vuplhb  %v0, %v24
    # Rotate each word left by 16 bits (swaps its two halfwords).
    verllf  %v1, %v0, 16
    # Unsigned halfword maximum; this is the only max step present.
    vmxlh   %v0, %v0, %v1
    # Extract byte element 1 into %r0, then zero-extend it into %r2.
    vlgvb   %r0, %v0, 1
    llgcr   %r2, %r0
    br  %r14
.Lfunc_end0:
    .size   ReduceMax_4xU8, .Lfunc_end0-ReduceMax_4xU8
    .size   .LReduceMax_4xU8$local, .Lfunc_end0-ReduceMax_4xU8
                                        # -- End function
    .section    ".note.GNU-stack","",@progbits
    .addrsig

Here is the s390x assembly code that is generated if @llvm.s390.vperm is replaced by shufflevector in ReduceMax_4xU8:

    .text
    .file   "s390x_zvector_reducemax_4xu8_test_051724_4b_routines.ll"
    .section    .rodata.cst16,"aM",@progbits,16
    .p2align    3, 0x0                          # -- Begin function ReduceMax_4xU8
.LCPI0_0:
    .byte   6                               # 0x6
    .byte   7                               # 0x7
    .byte   4                               # 0x4
    .byte   5                               # 0x5
    .byte   2                               # 0x2
    .byte   3                               # 0x3
    .byte   0                               # 0x0
    .byte   1                               # 0x1
    .byte   14                              # 0xe
    .byte   15                              # 0xf
    .byte   12                              # 0xc
    .byte   13                              # 0xd
    .byte   10                              # 0xa
    .byte   11                              # 0xb
    .byte   8                               # 0x8
    .byte   9                               # 0x9
    .text
    .globl  ReduceMax_4xU8
    .p2align    4
    .type   ReduceMax_4xU8,@function
# ReduceMax_4xU8 as generated when shufflevector is used instead of
# @llvm.s390.vperm. Here the permute and both max steps are present.
ReduceMax_4xU8:                         # @ReduceMax_4xU8
.LReduceMax_4xU8$local:
    .type   .LReduceMax_4xU8$local,@function
# %bb.0:
    # Load the 16-byte permute mask from the constant pool entry .LCPI0_0.
    larl    %r1, .LCPI0_0
    vl  %v1, 0(%r1), 3
    # Zero-extend the leftmost eight bytes of %v24 to halfwords.
    vuplhb  %v0, %v24
    # Reverse the four halfwords within each doubleword.
    vperm   %v1, %v0, %v0, %v1
    # First max step: vector versus its reversed copy.
    vmxlh   %v0, %v0, %v1
    # Rotate each word left by 16 bits (swaps its two halfwords).
    verllf  %v1, %v0, 16
    # Second max step.
    vmxlh   %v0, %v0, %v1
    # Extract byte element 1 into %r0, then zero-extend it into %r2.
    vlgvb   %r0, %v0, 1
    llgcr   %r2, %r0
    br  %r14
.Lfunc_end0:
    .size   ReduceMax_4xU8, .Lfunc_end0-ReduceMax_4xU8
    .size   .LReduceMax_4xU8$local, .Lfunc_end0-ReduceMax_4xU8
                                        # -- End function
    .section    ".note.GNU-stack","",@progbits
    .addrsig

The issue is in the LLVM s390x backend: it incorrectly optimizes out the @llvm.s390.vperm call, whereas the equivalent shufflevector is not optimized out.

johnplatts commented 6 months ago

The bug with vec_perm with Clang 18.1.6 on Z14 turns out to be an issue in the LLVM backend, which is described in issue #92615.

llvm / llvm-project

ReduceMax_4xU8 test program fails on s390x when compiled with Clang 18.1.6 #92602