Quuxplusone / LLVMBugzillaTest


#pragma clang loop vectorize(disable) should actually disable vectorization #42509

Open Quuxplusone opened 4 years ago

Quuxplusone commented 4 years ago
Bugzilla Link PR43539
Status NEW
Importance P enhancement
Reported by Devin Hussey (husseydevin@gmail.com)
Reported on 2019-10-02 16:45:14 -0700
Last modified on 2020-08-07 21:30:16 -0700
Version trunk
Hardware Other Linux
CC efriedma@quicinc.com, fj8765ah@aa.jp.fujitsu.com, florian_hahn@apple.com, hfinkel@anl.gov, llvm-bugs@lists.llvm.org, llvm@meinersbur.de, neeilans@live.com, richard-llvm@metafoo.co.uk
**#pragma clang loop vectorize(disable) is useless.**

It only works on tiny loops. Anything relatively complicated will be vectorized
regardless.

(I'm guessing that the cost model overrides it):

#include <stdint.h>
#include <stddef.h>

uint64_t some_func(uint64_t *p, size_t len)
{
    uint64_t sum1 = 0, sum2 = 0;
    len &= ~(size_t)3; // always multiple of 4

#pragma clang loop vectorize(disable)
    for (size_t i = 0; i < len; i++) {
        sum1 += *p++ ^ 12345;
        sum2 += *p++ ^ 12345;
    }
    return sum1 ^ sum2;
}
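For reference, commands along these lines reproduce the two listings that follow (the target triple and file name are assumptions for illustration):

clang -O3 --target=aarch64-linux-gnu -march=armv8-a+nosimd -S some_func.c -o -   # expected: scalar
clang -O3 --target=aarch64-linux-gnu -S some_func.c -o -                          # actual: vectorized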

I'd expect this for aarch64 with -O3 (actual output from -march=armv8-a+nosimd):

some_func:
// %bb.0:
        mov     x8, xzr
        ands    x10, x1, #0xfffffffffffffffc
        b.eq    .LBB0_4
// %bb.1:
        mov     x9, xzr
        mov     x11, xzr
        mov     x12, xzr
        add     x13, x0, #16
        mov     w14, #12345
.LBB0_2:
        ldp     x15, x16, [x13, #-16]
        ldp     x17, x0, [x13], #32
        subs    x10, x10, #2
        eor     x15, x15, x14
        eor     x17, x17, x14
        eor     x16, x16, x14
        eor     x0, x0, x14
        add     x11, x15, x11
        add     x12, x17, x12
        add     x8, x16, x8
        add     x9, x0, x9
        b.ne    .LBB0_2
// %bb.3:
        add     x10, x12, x11
        add     x8, x9, x8
.LBB0_4:
        eor     x0, x8, x10
        ret

However, I get this, a clearly vectorized loop:

some_func:
// %bb.0:
        ands    x8, x1, #0xfffffffffffffffc
        b.eq    .LBB0_4
// %bb.1:
        mov     w10, #12345
        add     x9, x0, #16
        movi    v0.2d, #0000000000000000
        dup     v1.2d, x10
        movi    v2.2d, #0000000000000000
.LBB0_2:
        ldp     q3, q4, [x9, #-16]
        subs    x8, x8, #2
        add     x9, x9, #32
        eor     v3.16b, v3.16b, v1.16b
        eor     v4.16b, v4.16b, v1.16b
        add     v2.2d, v3.2d, v2.2d
        add     v0.2d, v4.2d, v0.2d
        b.ne    .LBB0_2
// %bb.3:
        add     v0.2d, v0.2d, v2.2d
        fmov    x8, d0
        mov     x9, v0.d[1]
        eor     x0, x9, x8
        ret
.LBB0_4:
        movi    v0.2d, #0000000000000000
        fmov    x8, d0
        mov     x9, v0.d[1]
        eor     x0, x9, x8
        ret

Similar things are output on SSE2, NEON32, etc.

#pragma clang loop vectorize(disable) needs to completely shut off
vectorization for the loop.

Need I say more?
Quuxplusone commented 4 years ago

For more info, it seems that Clang tries to disable vectorization with llvm.loop.vectorize.width=1, but vectorized llvm intrinsics are used anyways.
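For reference, the relevant loop metadata in the emitted IR looks like this (shown in full in the -O1 dump in a later comment):

!8 = distinct !{!8, !9}
!9 = !{!"llvm.loop.vectorize.width", i32 1}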

Quuxplusone commented 4 years ago

Hrmm. Could be that the metadata is being dropped, or that you're actually seeing unrolling plus SLP or backend vectorization?

Quuxplusone commented 4 years ago
Alright, it turns out this is really opt's problem.

If you emit LLVM assembly with -O1 (clang -O1 -S -emit-llvm), you get this:

define dso_local i64 @randomfunc(i64* nocapture readonly, i64)
local_unnamed_addr #0 {
  %3 = and i64 %1, -4
  %4 = icmp eq i64 %3, 0
  br i1 %4, label %5, label %9

; <label>:5:                                      ; preds = %9, %2
  %6 = phi i64 [ 0, %2 ], [ %17, %9 ]
  %7 = phi i64 [ 0, %2 ], [ %21, %9 ]
  %8 = xor i64 %7, %6
  ret i64 %8

; <label>:9:                                      ; preds = %2, %9
  %10 = phi i64 [ %22, %9 ], [ 0, %2 ]
  %11 = phi i64 [ %21, %9 ], [ 0, %2 ]
  %12 = phi i64 [ %17, %9 ], [ 0, %2 ]
  %13 = phi i64* [ %18, %9 ], [ %0, %2 ]
  %14 = getelementptr inbounds i64, i64* %13, i64 1
  %15 = load i64, i64* %13, align 8, !tbaa !4
  %16 = xor i64 %15, 12345
  %17 = add i64 %16, %12
  %18 = getelementptr inbounds i64, i64* %13, i64 2
  %19 = load i64, i64* %14, align 8, !tbaa !4
  %20 = xor i64 %19, 12345
  %21 = add i64 %20, %11
  %22 = add nuw i64 %10, 1
  %23 = icmp eq i64 %22, %3
  br i1 %23, label %5, label %9, !llvm.loop !8
}
...

!8 = distinct !{!8, !9}
!9 = !{!"llvm.loop.vectorize.width", i32 1}

If you run opt with the flags from `clang -O3 -mllvm -debug-pass=Arguments`,
you can see that the loop is vectorized:

define dso_local i64 @randomfunc(i64* nocapture readonly, i64)
local_unnamed_addr #0 {
  %3 = and i64 %1, -4
  %4 = icmp eq i64 %3, 0
  br i1 %4, label %.loopexit, label %vector.body

vector.body:                                      ; preds = %2, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %2 ]
  %5 = phi <2 x i64> [ %17, %vector.body ], [ zeroinitializer, %2 ]
  %6 = phi <2 x i64> [ %16, %vector.body ], [ zeroinitializer, %2 ]
  %7 = shl i64 %index, 1
  %next.gep = getelementptr i64, i64* %0, i64 %7
  %8 = shl i64 %index, 1
  %9 = or i64 %8, 2
  %next.gep10 = getelementptr i64, i64* %0, i64 %9
  %10 = bitcast i64* %next.gep to <2 x i64>*
  %11 = load <2 x i64>, <2 x i64>* %10, align 8, !tbaa !4
  %12 = bitcast i64* %next.gep10 to <2 x i64>*
  %13 = load <2 x i64>, <2 x i64>* %12, align 8, !tbaa !4
  %14 = xor <2 x i64> %11, <i64 12345, i64 12345>
  %15 = xor <2 x i64> %13, <i64 12345, i64 12345>
  %16 = add <2 x i64> %14, %6
  %17 = add <2 x i64> %15, %5
  %index.next = add i64 %index, 2
  %18 = icmp eq i64 %index.next, %3
  br i1 %18, label %middle.block, label %vector.body, !llvm.loop !8

middle.block:                                     ; preds = %vector.body
  %19 = add <2 x i64> %17, %16
  br label %.loopexit

.loopexit:                                        ; preds = %middle.block, %2
  %20 = phi <2 x i64> [ zeroinitializer, %2 ], [ %19, %middle.block ]
  %21 = extractelement <2 x i64> %20, i32 0
  %22 = extractelement <2 x i64> %20, i32 1
  %23 = xor i64 %22, %21
  ret i64 %23
}

!8 = distinct !{!8, !9, !10}
!9 = !{!"llvm.loop.vectorize.width", i32 1}
!10 = !{!"llvm.loop.isvectorized", i32 1}
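A rough reproduction without copying the whole -O3 pass list (file name assumed; that these two passes suffice is an assumption, though it matches the findings below):

clang -O1 -S -emit-llvm some_func.c -o - | opt -S -loop-vectorize -slp-vectorizer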

It turns out that in order to disable vectorization, we should emit
llvm.loop.vectorize.enable=false. This actually turns off vectorization!

(We should also tell opt to obey the width parameter)
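For comparison, metadata of this form is what actually gets honored (numbering reused from the dump above for illustration):

!8 = distinct !{!8, !9}
!9 = !{!"llvm.loop.vectorize.enable", i1 false}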
Quuxplusone commented 4 years ago
(In reply to Hal Finkel from comment #2)
> Hrmm. Could be that the metadata is being dropped

Ignored, but yes.

So in short:

Clang emits llvm.loop.vectorize.width=1 with #pragma clang loop
vectorize(disable)

opt interprets llvm.loop.vectorize.width=1 as a hint to increase the cost of
vectorization, not disable it.

On expensive loops, the cost model favors vectorization, and opt emits
vectorized code.

opt actually uses llvm.loop.vectorize.enable=false to disable vectorization.
Quuxplusone commented 4 years ago

Actually, it seems a little more complicated than that, as it looks like opt might still be vectorizing some loops.

Quuxplusone commented 4 years ago
Optimization remarks for the original example (-Rpass=vectorize):

<stdin>:10:5: remark: interleaved loop (interleaved count: 2) [-Rpass=loop-vectorize]
    for (size_t i = 0; i < len; i++) {
    ^
<stdin>:4:10: note: could not determine the original source location for <stdin>:0:0
uint64_t some_func(uint64_t *p, size_t len)
         ^
<stdin>:4:10: remark: SLP vectorized with cost -4 and with tree size 15 [-Rpass=slp-vectorizer]
uint64_t some_func(uint64_t *p, size_t len)
         ^

So the loop isn't actually being vectorized by the vectorizer; it's getting
unrolled, then vectorized by the SLP vectorizer.
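For reference, the remarks above can be reproduced with the flag named at the top of this comment (file name assumed):

clang -O3 -Rpass=vectorize -c some_func.c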
Quuxplusone commented 4 years ago
To avoid this form of vectorization,

    #pragma clang loop vectorize(disable) interleave(disable) unroll(disable)

should do it.
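Applied to the original example, only the pragma line changes:

#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
    for (size_t i = 0; i < len; i++) {
        sum1 += *p++ ^ 12345;
        sum2 += *p++ ^ 12345;
    }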
Quuxplusone commented 4 years ago

Not good enough.

My request is simple:

#pragma clang loop vectorize(disable) should disable vectorization completely. I shouldn't need to disable unrolling.

Is that too much?

Quuxplusone commented 4 years ago

(In reply to Devin Hussey from comment #8)
> Not good enough.
>
> My request is simple:
>
> #pragma clang loop vectorize(disable) should disable vectorization
> completely. I shouldn't need to disable unrolling.
>
> Is that too much?

The problem here is that vectorize(disable) disables only the loop vectorizer. There are other passes in LLVM that can also create vector code in a loop but, strictly speaking, are not doing loop vectorization.

It sounds like you would like a way to disable all vector code generation through the frontend, at loop or function granularity?

For your example, you can explicitly disable SLP vectorization (-fno-slp-vectorize) and get no vectorization. But of course something in the backend might still select vector instructions.

Of course you can also disable SIMD support on AArch64 (-march=armv8-a+nosimd).
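Concretely, something along these lines (file name assumed; the second command assumes an AArch64 target):

clang -O3 -fno-slp-vectorize -c some_func.c      # disables SLP vectorization TU-wide
clang -O3 -march=armv8-a+nosimd -c some_func.c   # disables SIMD instructions entirely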

Quuxplusone commented 4 years ago
(In reply to Eli Friedman from comment #6)
> Optimization remarks for the original example (-Rpass=vectorize):
>
> <stdin>:10:5: remark: interleaved loop (interleaved count: 2)
> [-Rpass=loop-vectorize]
>     for (size_t i = 0; i < len; i++) {
>     ^

Loops with "#pragma clang loop vectorize(disable)" specified are only
interleaved.
With -fno-vectorize, neither vector nor interleave is done.
The behavior of options and pragma is different. Is it correct?
Quuxplusone commented 4 years ago
> Loops marked with "#pragma clang loop vectorize(disable)" are still
> interleaved.
> With -fno-vectorize, neither vectorization nor interleaving is done.
> The option and the pragma behave differently. Is that intended?

Not exactly. -fno-vectorize controls the VectorizeOnlyWhenForced parameter of
the LoopVectorize pass. -fno-unroll-loops, in addition to disabling unrolling
itself, also controls the InterleaveOnlyWhenForced parameter.
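So, as a sketch of the practical consequence (file name assumed), suppressing both vectorization and interleaving from the command line takes both flags:

clang -O3 -fno-vectorize -fno-unroll-loops -c some_func.c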

See also https://reviews.llvm.org/D61030