llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
27.84k stars 11.47k forks source link

-msse4.2 isel loses select #53938

Open vitalybuka opened 2 years ago

vitalybuka commented 2 years ago

Reproducer

// cat llvm-project/compiler-rt/test/msan/debug_sse.cpp
// ninja check-msan
// RUN: %clangxx_msan -fsanitize-memory-track-origins -O2 -fsanitize-memory-param-retval %s -o %t && %run %t
// RUN: %clangxx_msan -fsanitize-memory-track-origins -O2 -fsanitize-memory-param-retval -msse4.2  %s -o %t && %run %t

#include <assert.h>
#include <malloc.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

#include <sanitizer/msan_interface.h>

// Number of elements in each of the two global arrays below.
const int N = 2;
// Array that myloop() reads and overwrites in place; starts out zero-initialized.
double a[N] = {};
// Read-only input to myloop(); starts out zero-initialized.
double b[N] = {};

// Returns .0 when x compares equal to zero, -1.0 when x is less than zero,
// and 1.0 in every other case (including NaN, for which both comparisons are false).
static inline double sign(double x) { return (x == .0 ? .0 : (x < .0 ? -1.0 : 1.0)); }

// Rewrites every element of the global array a in place: when sign(b[i]) and
// sign(a[i]) differ the element is increased by .2, otherwise it is scaled by .8.
// Declared noinline; this is the function (_ZL6myloopv) whose IR and
// instruction selection output are quoted in the report.
__attribute__((noinline)) static void myloop() {
  for (int i = 0; i < N; i++) {
    // The nested ternaries inside sign() combined with this ternary are the
    // source of the chained selects in the IR; keep the expression as written.
    a[i] = (sign(b[i]) != sign(a[i])) ? (a[i] + .2) : (a[i] * .8);
  }
}

// Driver for the reproducer: dumps the MSan shadow of a, runs myloop(), dumps
// the shadow again, then asks MSan to verify that a is fully initialized.
int main(void) {
  // Shadow of a before the loop (all zero in both quoted outputs).
  __msan_print_shadow(a, sizeof(a));
  myloop();
  // Shadow of a after the loop; this is the dump that differs with -msse4.2.
  __msan_print_shadow(a, sizeof(a));
  // NOTE(review): presumably the call that emits the use-of-uninitialized-value
  // warning in the sse4.2 output when the shadow is non-zero -- confirm against
  // the msan_interface.h documentation.
  __msan_check_mem_is_initialized(a, sizeof(a));
  // a[0] is a double; it is implicitly converted to int for the exit status.
  return a[0];
}

Output no-sse4.2:

Shadow map [0x500001eafee0, 0x500001eafef0) of [0x000001eafee0, 0x000001eafef0), 16 bytes:
0x500001eafee0[0x000001eafee0]: 00000000 00000000 00000000 00000000  |. . . .|

Shadow map [0x500001eafee0, 0x500001eafef0) of [0x000001eafee0, 0x000001eafef0), 16 bytes:
0x500001eafee0[0x000001eafee0]: 00000000 00000000 00000000 00000000  |. . . .|

sse4.2:

Shadow map [0x500001eafee0, 0x500001eafef0) of [0x000001eafee0, 0x000001eafef0), 16 bytes:
0x500001eafee0[0x000001eafee0]: 00000000 00000000 00000000 00000000  |. . . .|

Shadow map [0x500001eafee0, 0x500001eafef0) of [0x000001eafee0, 0x000001eafef0), 16 bytes:
0x500001eafee0[0x000001eafee0]: 9a999999 9999c93f 9a999999 9999c93f  |A A A A|

Origin A (origin_id 90000001):
  Uninitialized value was stored to memory at
...
  ==1406286==WARNING: MemorySanitizer: use-of-uninitialized-value

The IR is identical in both cases (the only difference is target-features):

; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind sanitize_memory uwtable
define internal fastcc void @_ZL6myloopv() unnamed_addr #2 {
entry:
  %0 = load <2 x double>, <2 x double>* bitcast ([2 x double]* @b to <2 x double>*), align 16, !tbaa !4
  %_msld = load <2 x i64>, <2 x i64>* inttoptr (i64 xor (i64 ptrtoint ([2 x double]* @b to i64), i64 87960930222080) to <2 x i64>*), align 16
  %1 = load i32, i32* inttoptr (i64 add (i64 xor (i64 ptrtoint ([2 x double]* @b to i64), i64 87960930222080), i64 17592186044416) to i32*), align 16
  %2 = trunc <2 x i64> %_msld to <2 x i1>
  %3 = fcmp oeq <2 x double> %0, zeroinitializer
  %4 = fcmp olt <2 x double> %0, zeroinitializer
  %_msprop_select = select <2 x i1> %2, <2 x i64> <i64 -9223372036854775808, i64 -9223372036854775808>, <2 x i64> zeroinitializer
  %5 = bitcast <2 x i1> %4 to i2
  %6 = icmp ne i2 %5, 0
  %7 = bitcast <2 x i1> %2 to i2
  %8 = icmp ne i2 %7, 0
  %9 = select i1 %8, i32 %1, i32 0
  %10 = select <2 x i1> %4, <2 x double> <double -1.000000e+00, double -1.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
  %11 = select <2 x i1> %3, <2 x i64> zeroinitializer, <2 x i64> %_msprop_select
  %12 = bitcast <2 x double> %10 to <2 x i64>
  %13 = or <2 x i64> %12, %_msprop_select
  %_msprop_select25 = select <2 x i1> %2, <2 x i64> %13, <2 x i64> %11
  %14 = bitcast <2 x i1> %3 to i2
  %15 = icmp ne i2 %14, 0
  %16 = select i1 %15, i32 0, i32 %9
  %17 = select i1 %8, i32 %1, i32 %16
  %18 = select <2 x i1> %3, <2 x double> zeroinitializer, <2 x double> %10
  %19 = load <2 x double>, <2 x double>* bitcast ([2 x double]* @a to <2 x double>*), align 16, !tbaa !4
  %_msld26 = load <2 x i64>, <2 x i64>* inttoptr (i64 xor (i64 ptrtoint ([2 x double]* @a to i64), i64 87960930222080) to <2 x i64>*), align 16
  %20 = load i32, i32* inttoptr (i64 add (i64 xor (i64 ptrtoint ([2 x double]* @a to i64), i64 87960930222080), i64 17592186044416) to i32*), align 16
  %21 = trunc <2 x i64> %_msld26 to <2 x i1>
  %22 = fcmp oeq <2 x double> %19, zeroinitializer
  %23 = fcmp olt <2 x double> %19, zeroinitializer
  %_msprop_select29 = select <2 x i1> %21, <2 x i64> <i64 -9223372036854775808, i64 -9223372036854775808>, <2 x i64> zeroinitializer
  %24 = bitcast <2 x i1> %23 to i2
  %25 = icmp ne i2 %24, 0
  %26 = bitcast <2 x i1> %21 to i2
  %27 = icmp ne i2 %26, 0
  %28 = select i1 %27, i32 %20, i32 0
  %29 = select <2 x i1> %23, <2 x double> <double -1.000000e+00, double -1.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
  %30 = select <2 x i1> %22, <2 x i64> zeroinitializer, <2 x i64> %_msprop_select29
  %31 = bitcast <2 x double> %29 to <2 x i64>
  %32 = or <2 x i64> %31, %_msprop_select29
  %_msprop_select30 = select <2 x i1> %21, <2 x i64> %32, <2 x i64> %30
  %33 = bitcast <2 x i1> %22 to i2
  %34 = icmp ne i2 %33, 0
  %35 = select i1 %34, i32 0, i32 %28
  %36 = select i1 %27, i32 %20, i32 %35
  %37 = select <2 x i1> %22, <2 x double> zeroinitializer, <2 x double> %29
  %_msprop31 = or <2 x i64> %_msprop_select25, %_msprop_select30
  %38 = bitcast <2 x i64> %_msprop_select30 to i128
  %39 = icmp ne i128 %38, 0
  %40 = select i1 %39, i32 %36, i32 %17
  %41 = trunc <2 x i64> %_msprop31 to <2 x i1>
  %42 = fcmp une <2 x double> %18, %37
  %43 = fadd <2 x double> %19, <double 2.000000e-01, double 2.000000e-01>
  %44 = fmul <2 x double> %19, <double 8.000000e-01, double 8.000000e-01>
  %45 = bitcast <2 x double> %43 to <2 x i64>
  %46 = bitcast <2 x double> %44 to <2 x i64>
  %47 = xor <2 x i64> %45, %46
  %48 = or <2 x i64> %47, %_msld26
  %_msprop_select34 = select <2 x i1> %41, <2 x i64> %48, <2 x i64> %_msld26
  %49 = bitcast <2 x i1> %42 to i2
  %50 = icmp ne i2 %49, 0
  %51 = bitcast <2 x i1> %41 to i2
  %52 = icmp ne i2 %51, 0
  %53 = select i1 %52, i32 %40, i32 %20
  %54 = select <2 x i1> %42, <2 x double> %43, <2 x double> %44

  ; THIS STORES DIFFERENT VALUE sse/nosse
  store <2 x i64> %_msprop_select34, <2 x i64>* inttoptr (i64 xor (i64 ptrtoint ([2 x double]* @a to i64), i64 87960930222080) to <2 x i64>*), align 16
  %55 = bitcast <2 x i64> %_msprop_select34 to i128
  %_mscmp = icmp ne i128 %55, 0
  br i1 %_mscmp, label %56, label %61, !prof !8

56:                                               ; preds = %entry
  %57 = call i32 @__msan_chain_origin(i32 %53)
  %58 = zext i32 %57 to i64
  %59 = shl i64 %58, 32
  %60 = or i64 %58, %59
  store i64 %60, i64* inttoptr (i64 add (i64 xor (i64 ptrtoint ([2 x double]* @a to i64), i64 87960930222080), i64 17592186044416) to i64*), align 16
  store i64 %60, i64* getelementptr (i64, i64* inttoptr (i64 add (i64 xor (i64 ptrtoint ([2 x double]* @a to i64), i64 87960930222080), i64 17592186044416) to i64*), i32 1), align 8
  br label %61

61:                                               ; preds = %entry, %56
  store <2 x double> %54, <2 x double>* bitcast ([2 x double]* @a to <2 x double>*), align 16, !tbaa !4
  ret void
}

From what I can see, this program must store zeroes if %_msld and %_msld26 loaded zeroes (which is the case according to the printout).

I see nothing unusual in isel without SSE4.2. However, with sse4.2:

# *** IR Dump After X86 DAG->DAG Instruction Selection (amdgpu-isel) ***:
# Machine code for function _ZL6myloopv: IsSSA, TracksLiveness

  %50:vr128 = nofpexcept CMPPDrri %39:vr128(tied-def 0), killed %47:vr128, 4, implicit $mxcsr
  %51:vr128 = nofpexcept ADDPDrm %3:vr128(tied-def 0), $rip, 1, $noreg, %const.3, $noreg, implicit $mxcsr :: (load (s128) from constant-pool)
  %52:vr128 = nofpexcept MULPDrm %3:vr128(tied-def 0), $rip, 1, $noreg, %const.4, $noreg, implicit $mxcsr :: (load (s128) from constant-pool)
  %53:vr128 = PXORrr %51:vr128(tied-def 0), %52:vr128
  %54:vr128 = PORrr %53:vr128(tied-def 0), %7:vr128
  %55:vr128 = PSLLQri %48:vr128(tied-def 0), 63
  %56:gr32 = MOVMSKPDrr killed %55:vr128
  TEST32rr %56:gr32, %56:gr32, implicit-def $eflags
  %0:gr32 = CMOV32rr %9:gr32(tied-def 0), killed %49:gr32, 5, implicit $eflags
  $xmm0 = COPY %50:vr128
  %1:vr128 = BLENDVPDrr0 %52:vr128(tied-def 0), %51:vr128, implicit $xmm0
  MOVAPSmr %6:gr64, 1, $noreg, 0, $noreg, %54:vr128 :: (store (s128) into `<2 x i64>* inttoptr (i64 xor (i64 ptrtoint ([2 x double]* @a to i64), i64 87960930222080) to <2 x i64>*)`)
  PTESTrr %54:vr128, %54:vr128, implicit-def $eflags
  JCC_1 %bb.2, 4, implicit $eflags

Problem is that with sse4.2 we store: %54:vr128 = (ADDPDrm() ^ MULPDrm()) | %7:vr128

However, the expected value should look like this (this is what I see without -msse4.2): %54:vr128 = trunc(_msprop31) ? ((ADDPDrm() ^ MULPDrm()) | %7:vr128) : _msld26

llvmbot commented 2 years ago

@llvm/issue-subscribers-backend-x86