Open johnplatts opened 6 months ago
Here is the generated LLVM IR of ReduceMax_4xU8 that causes DoTestReduceMax4xU8() to fail:
target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
target triple = "s390x-unknown-linux-gnu"
; Function Attrs: nofree nosync nounwind memory(none)
; Reproducer: computes the unsigned maximum of bytes 0..3 of %0 by widening
; them to 16-bit lanes and doing two swap-and-max steps. The target is
; big-endian (datalayout "E"), so byte 2*i is the high byte of i16 lane i.
define dso_local zeroext i8 @ReduceMax_4xU8(<16 x i8> noundef %0) local_unnamed_addr #0 {
; Interleave zero bytes with bytes 0..7 of %0: result is 0,%0[0],0,%0[1],...
; Viewed as <8 x i16> on a big-endian target this is a zero-extension of
; bytes 0..7 of %0 to 16 bits.
%2 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i8> %0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
%3 = bitcast <16 x i8> %2 to <8 x i16>
; Byte permute that reverses the order of the four 16-bit lanes inside each
; 8-byte half of %2 (lanes 0,1,2,3 -> 3,2,1,0 and 4,5,6,7 -> 7,6,5,4).
; This is the call the report says is wrongly optimized away.
%4 = tail call <16 x i8> @llvm.s390.vperm(<16 x i8> %2, <16 x i8> %2, <16 x i8> <i8 6, i8 7, i8 4, i8 5, i8 2, i8 3, i8 0, i8 1, i8 14, i8 15, i8 12, i8 13, i8 10, i8 11, i8 8, i8 9>)
%5 = bitcast <16 x i8> %4 to <8 x i16>
; First reduction step: lane 0 = max(lane 0, lane 3), lane 1 = max(lane 1, lane 2).
%6 = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %3, <8 x i16> %5)
%7 = bitcast <8 x i16> %6 to <4 x i32>
; fshl(x, x, 16) rotates each 32-bit lane by 16 bits, i.e. swaps the two
; 16-bit lanes inside every 32-bit lane.
%8 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %7, <4 x i32> %7, <4 x i32> <i32 16, i32 16, i32 16, i32 16>)
%9 = bitcast <4 x i32> %8 to <8 x i16>
; Second reduction step: i16 lane 0 now holds the maximum of lanes 0..3.
%10 = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %6, <8 x i16> %9)
%11 = bitcast <8 x i16> %10 to <16 x i8>
; Byte 1 is the low byte of i16 lane 0 on a big-endian target: the result.
%12 = extractelement <16 x i8> %11, i64 1
ret i8 %12
}
; Function Attrs: nofree nosync nounwind memory(none)
declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>) #1
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #2
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) #3
attributes #0 = { nofree nosync nounwind memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="z13" "target-features"="+transactional-execution,+vector" }
attributes #1 = { nofree nosync nounwind memory(none) }
attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
Replacing the following call in ReduceMax_4xU8:
%4 = tail call <16 x i8> @llvm.s390.vperm(<16 x i8> %2, <16 x i8> %2, <16 x i8> <i8 6, i8 7, i8 4, i8 5, i8 2, i8 3, i8 0, i8 1, i8 14, i8 15, i8 12, i8 13, i8 10, i8 11, i8 8, i8 9>)
with the equivalent shufflevector below fixes the test failure:
%4 = shufflevector <16 x i8> %2, <16 x i8> %2, <16 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9>
Here is the s390x assembly code that is generated when the above LLVM IR is compiled (with `@llvm.s390.vperm`):
# Output for the @llvm.s390.vperm version of the IR (the failing build).
# Note that this listing contains no vperm instruction and only one vmxlh,
# whereas the IR has a permute followed by two umax steps.
.text
.file "s390x_zvector_reducemax_4xu8_test_051724_4_routines.ll"
.globl ReduceMax_4xU8 # -- Begin function ReduceMax_4xU8
.p2align 4
.type ReduceMax_4xU8,@function
ReduceMax_4xU8: # @ReduceMax_4xU8
.LReduceMax_4xU8$local:
.type .LReduceMax_4xU8$local,@function
# %bb.0:
# Zero-extend the high 8 bytes of the vector argument (%v24) to halfwords.
vuplhb %v0, %v24
# Rotate each fullword left by 16, swapping the two halfwords in each fullword.
verllf %v1, %v0, 16
# Unsigned halfword maximum: halfword 0 = max(halfword 0, halfword 1) only;
# halfwords 2 and 3 never contribute because the permute + first max are gone.
vmxlh %v0, %v0, %v1
# Extract byte element 1 (low byte of halfword 0) into %r0.
vlgvb %r0, %v0, 1
# Zero-extend that byte into %r2 and return via %r14.
llgcr %r2, %r0
br %r14
.Lfunc_end0:
.size ReduceMax_4xU8, .Lfunc_end0-ReduceMax_4xU8
.size .LReduceMax_4xU8$local, .Lfunc_end0-ReduceMax_4xU8
# -- End function
.section ".note.GNU-stack","",@progbits
.addrsig
Here is the s390x assembly code that is generated if `@llvm.s390.vperm` is replaced by `shufflevector` in ReduceMax_4xU8:
# Output for the shufflevector version of the IR (the passing build).
# Here the permute is kept: a 16-byte selector constant is loaded and fed to
# vperm, and both halfword-maximum steps are present.
.text
.file "s390x_zvector_reducemax_4xu8_test_051724_4b_routines.ll"
.section .rodata.cst16,"aM",@progbits,16
.p2align 3, 0x0 # -- Begin function ReduceMax_4xU8
# Permute selector: the same 16 byte indices as the shufflevector mask
# (reverses the four halfwords inside each 8-byte half).
.LCPI0_0:
.byte 6 # 0x6
.byte 7 # 0x7
.byte 4 # 0x4
.byte 5 # 0x5
.byte 2 # 0x2
.byte 3 # 0x3
.byte 0 # 0x0
.byte 1 # 0x1
.byte 14 # 0xe
.byte 15 # 0xf
.byte 12 # 0xc
.byte 13 # 0xd
.byte 10 # 0xa
.byte 11 # 0xb
.byte 8 # 0x8
.byte 9 # 0x9
.text
.globl ReduceMax_4xU8
.p2align 4
.type ReduceMax_4xU8,@function
ReduceMax_4xU8: # @ReduceMax_4xU8
.LReduceMax_4xU8$local:
.type .LReduceMax_4xU8$local,@function
# %bb.0:
# Load the address of the selector constant, then the selector itself
# (the trailing 3 is the alignment hint, matching the .p2align 3 above).
larl %r1, .LCPI0_0
vl %v1, 0(%r1), 3
# Zero-extend the high 8 bytes of the vector argument (%v24) to halfwords.
vuplhb %v0, %v24
# Reverse the halfwords within each 8-byte half of %v0 using the selector.
vperm %v1, %v0, %v0, %v1
# First reduction step: unsigned halfword max of %v0 and its permuted copy.
vmxlh %v0, %v0, %v1
# Swap the two halfwords in each fullword (rotate left by 16).
verllf %v1, %v0, 16
# Second reduction step: halfword 0 now holds the max of halfwords 0..3.
vmxlh %v0, %v0, %v1
# Extract byte element 1 (low byte of halfword 0) into %r0.
vlgvb %r0, %v0, 1
# Zero-extend that byte into %r2 and return via %r14.
llgcr %r2, %r0
br %r14
.Lfunc_end0:
.size ReduceMax_4xU8, .Lfunc_end0-ReduceMax_4xU8
.size .LReduceMax_4xU8$local, .Lfunc_end0-ReduceMax_4xU8
# -- End function
.section ".note.GNU-stack","",@progbits
.addrsig
The issue is in the LLVM s390x backend: it incorrectly optimizes out the `@llvm.s390.vperm` call, whereas the equivalent `shufflevector` is not optimized out.
The bug with vec_perm with Clang 18.1.6 on Z14 turns out to be an issue in the LLVM backend, which is described in issue #92615.
Here is a C test program that fails to execute correctly when compiled for the s390x-linux-gnu target (with the -O2 -mzvector options) with Clang 18.1.6, but passes when compiled for the s390x-linux-gnu target (with the -O2 -mzvector options) with GCC 12.3.0:
The above program compiles and runs correctly when compiled for the powerpc64-linux-gnu and powerpc64le-linux-gnu targets with the -O2 -mcpu=power8 options with Clang 18.1.6.
Here is the output that is generated by the above program when compiled for s390x-linux-gnu with the -O2 -mzvector options with Clang 18.1.6:
Here is the expected output of the above test program: