builtin-arith-overflow-12.c: Incorrect return value from `__builtin_mul_overflow`

In the latest branch arc-2024.06, an anomaly has been detected in a test case builtin-arith-overflow-12.c across multiple configurations — archs, em, em4_fpuda, arcem, and more.

Below is a simplified test case along with the execution steps:

$ cat test.c
#include <stdio.h>

__attribute__((noinline, noclone)) signed int t108_1mul (unsigned int x, unsigned int y) {
  signed int r;
  if (__builtin_mul_overflow (x, y, &r)) {
    printf("Overflow! - t108_1mul\n");
  }
  return r; 
}

int
main (void)
{
  unsigned int x = (0x7fffffff / 31);
  unsigned int y = (32);
  t108_1mul (x, y);

  return 0;
}
$ arc-elf32-gcc                                         \
        -fdiagnostics-plain-output                      \
        -O2 --specs=nsim.specs                          \
        -Wl,--defsym=__DEFAULT_HEAP_SIZE=256m           \
        -Wl,--defsym=__DEFAULT_STACK_SIZE=1024m         \
        -lm test.c -o ./test.x

$ qemu-system-arc                                       \
        -semihosting -cpu archs -M arc-sim              \
        -m 2G -nographic -no-reboot -monitor none       \
        -kernel ./test.x

The issue is that the __builtin_mul_overflow function is expected to return true for overflow but it does not, unlike in the arc-2023.09 branch.

By removing the noinline attribute, the internal function does return true for the overflow.

2023.09

         .file   "test.c"
        .cpu HS
        .arc_attribute Tag_ARC_PCS_config, 2
        .arc_attribute Tag_ARC_ABI_rf16, 0
        .arc_attribute Tag_ARC_ABI_pic, 0
        .arc_attribute Tag_ARC_ABI_tls, 0
        .arc_attribute Tag_ARC_ABI_sda, 2
        .arc_attribute Tag_ARC_ABI_exceptions, 0
        .arc_attribute Tag_ARC_CPU_variation, 2
        .section        .text
        .section        .rodata.str1.4,"aMS",@progbits,1
        .align 4
.LC0:
        .string "deu overflow! - t108_1mul"
        .section        .text
        .align 4
        .global t108_1mul
        .type   t108_1mul, @function
t108_1mul:
        push_s  blink
        push_s  r13
        mpymu   r5,r0,r1
        mpy     r4,r0,r1
        mov_s   r3,r5   ;4
        tst r4,r4
        setne r3, r3, 0
        mov_s   r12,1   ;3
        mov.p r12,r3
        mov_s   r3,r12  ;4
        tst_s r12,r12
        mov_s.ne        r0,@.LC0
        blne.d @puts;1
        mov_s   r13,r4  ;4
        mov_s   r0,r13  ;4
        ld      blink,[sp,4]    ;23
        j_s.d   [blink]
        ld.ab   r13,[sp,8]      ;23
        .size   t108_1mul, .-t108_1mul
        .section        .text.startup,"ax",@progbits
        .align 4
        .global main
        .type   main, @function
main:
        push_s  blink
        mov_s   r0,69273666     ;13
        bl.d @t108_1mul;1
        mov_s   r1,32   ;3
        mov_s   r0,0    ;3
        pop_s   blink
        j_s     [blink]
        .size   main, .-main
        .ident  "GCC: (GNU) 13.1.1 20230516"
        .section        .note.GNU-stack,"",@progbits

2024.06

        .file   "test.c"
        .cpu HS
        .arc_attribute Tag_ARC_PCS_config, 2
        .arc_attribute Tag_ARC_ABI_rf16, 0
        .arc_attribute Tag_ARC_ABI_pic, 0
        .arc_attribute Tag_ARC_ABI_tls, 0
        .arc_attribute Tag_ARC_ABI_sda, 2
        .arc_attribute Tag_ARC_ABI_exceptions, 0
        .arc_attribute Tag_ARC_CPU_variation, 2
        .section        .text
        .section        .rodata.str1.4,"aMS",@progbits,1
        .align 4
.LC0:
        .string "deu overflow! - t108_1mul"
        .section        .text
        .align 4
        .global t108_1mul
        .type   t108_1mul, @function
t108_1mul:
        push_s  blink
        push_s  r13
        mpy.f   0,r0,r1
        mpy     r2,r0,r1
        mpymu   r3,r0,r1
        mov.p   r2,r3
        mov.n   r2,1
        setne.p r2,r2,0
        tst_s   r2,r2
        mpy     r13,r0,r1
        mov_s.ne        r0,@.LC0
        blne    @puts;1
        mov_s   r0,r13  ;4
        ld      blink,[sp,4]    ;23
        j_s.d   [blink]
        ld.ab   r13,[sp,8]      ;23
        .size   t108_1mul, .-t108_1mul
        .section        .text.startup,"ax",@progbits
        .align 4
        .global main
        .type   main, @function
main:
        push_s  blink
        mov_s   r0,69273666     ;13
        bl.d    @t108_1mul;1
        mov_s   r1,32   ;3
        mov_s   r0,0    ;3
        pop_s   blink
        j_s     [blink]
        .size   main, .-main
        .ident  "GCC: (GNU) 14.1.0"
        .section        .note.GNU-stack,"",@progbits

Original issue: https://github.com/foss-for-synopsys-dwc-arc-processors/toolchain/issues/627

This issue has been introduced by the following GCC non-related ARCv2 patch:

https://github.com/foss-for-synopsys-dwc-arc-processors/gcc/commit/86de9b66480b710202a2898cf513db105d8c432f

commit 86de9b66480b710202a2898cf513db105d8c432f
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Sun Jan 21 21:22:28 2024 +0000

    PR rtl-optimization/111267: Improved forward propagation.

    This patch resolves PR rtl-optimization/111267 by improving RTL-level
    forward propagation.  This x86_64 code quality regression was caused
    (exposed) by my changes to improve how x86's (TImode) argument passing
    is represented at the RTL-level (reducing the use of SUBREGs to catch
    more optimization opportunities in combine).  The pitfall is that the
    more complex RTL representations expose a limitation in RTL's fwprop
    pass.

    At the heart of fwprop, in try_fwprop_subst_pattern, the logic can
    be summarized as three steps.  Step 1 is a heuristic that rejects the
    propagation attempt if the expression is too complex, step 2 calls
    the backend's recog to see if the propagated/simplified instruction
    is recognizable/valid, and step 3 then calls set_src_cost to compare
    the rtx costs of the replacement vs. the original, and accepts the
    transformation if the final cost is the same of better.

    The logic error (or missed optimization opportunity) is that the
    step 1 heuristic that attempts to predict (second guess) the
    process is flawed.  Ultimately the decision on whether to fwprop
    or not should depend solely on actual improvement, as measured
    by RTX costs.  Hence the prototype fix in the bugzilla PR removes
    the heuristic of calling prop.profitable_p entirely, relying
    entirely on the cost comparison in step 3.

    Unfortunately, things are a tiny bit more complicated.  The cost
    comparison in fwprop uses the older set_src_cost API and not the
    newer (preffered) insn_cost API as currently used in combine.
    This means that the cost improvement comparisons are only done
    for single_set instructions (more complex PARALLELs etc. aren't
    supported).  Hence we can only rely on skipping step 1 for that
    subset of instructions actually evaluated by step 3.

    The other subtlety is that to avoid potential infinite loops
    in fwprop we should only reply purely on rtx costs when the
    transformation is obviously an improvement.  If the replacement
    has the same cost as the original, we can use the prop.profitable_p
    test to preserve the current behavior.

    Finally, to answer Richard Biener's remaining question about this
    approach: yes, there is an asymmetry between how patterns are
    handled and how REG_EQUAL notes are handled.  For example, at
    the moment propagation into notes doesn't use rtx costs at all,
    and ultimately when fwprop is updated to use insn_cost, this
    (and recog) obviously isn't applicable to notes.  There's no reason
    the logic need be identical between patterns and notes, and during
    stage4 we only need update propagation into patterns to fix this
    P1 regression (notes and use of cost_insn can be done for GCC 15).

    For Jakub's reduced testcase:

    struct S { float a, b, c, d; };
    int bar (struct S x, struct S y) {
      return x.b <= y.d && x.c >= y.a;
    }

    On x86_64-pc-linux-gnu with -O2 gcc currently generates:

    bar:    movq    %xmm2, %rdx
            movq    %xmm3, %rax
            movq    %xmm0, %rsi
            xchgq   %rdx, %rax
            movq    %rsi, %rcx
            movq    %rax, %rsi
            movq    %rdx, %rax
            shrq    $32, %rcx
            shrq    $32, %rax
            movd    %ecx, %xmm4
            movd    %eax, %xmm0
            comiss  %xmm4, %xmm0
            jb      .L6
            movd    %esi, %xmm0
            xorl    %eax, %eax
            comiss  %xmm0, %xmm1
            setnb   %al
            ret
    .L6:    xorl    %eax, %eax
            ret

    with this simple patch to fwprop, we now generate:

    bar:    shufps  $85, %xmm0, %xmm0
            shufps  $85, %xmm3, %xmm3
            comiss  %xmm0, %xmm3
            jb      .L6
            xorl    %eax, %eax
            comiss  %xmm2, %xmm1
            setnb   %al
            ret
    .L6:    xorl    %eax, %eax
            ret

    2024-01-21  Roger Sayle  <roger@nextmovesoftware.com>
                Richard Biener  <rguenther@suse.de>

    gcc/ChangeLog
            PR rtl-optimization/111267
            * fwprop.cc (fwprop_propagation::profitabe_p): Rename
            profitable_p method to likely_profitable_p.
            (try_fwprop_subst_node): Update call to likely_profitable_p.
            Only bail-out early when !prop.likely_profitable_p for instructions
            that are not single sets.  When comparing costs, bail-out if the
            cost is unchanged and !prop.likely_profitable_p.

    gcc/testsuite/ChangeLog
            PR rtl-optimization/111267
            * gcc.target/i386/pr111267.c: New test case.

foss-for-synopsys-dwc-arc-processors / toolchain

builtin-arith-overflow-12.c: Incorrect return value from `__builtin_mul_overflow` #630