Validark opened this issue 3 weeks ago
@llvm/issue-subscribers-backend-x86
Author: Niles Salter (Validark)
Same as https://github.com/llvm/llvm-project/issues/66150 I think.
7c3bbfdcf62e0a8806e3ae3130e8dc537fe5c775 ensures that the VBROADCAST+PSHUFB pattern is created, and we now get this result for AVX512 targets without VBMI (like skylake-avx512):
```asm
vpbroadcastq %xmm0, %zmm0
vpshufb .LCPI0_0(%rip), %zmm0, %zmm0 # zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
retq
```
But on targets with VBMI (e.g. tigerlake and znver5), shuffle combining then comes along, treats the variable-mask PSHUFB as being as expensive as VPERMB, and so merges it with the VBROADCAST:
```asm
vmovdqa64 .LCPI0_0(%rip), %zmm1 # zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
# kill: def $xmm0 killed $xmm0 def $zmm0
vpermb %zmm0, %zmm1, %zmm0
retq
```
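For reference, here's a hypothetical C intrinsics sketch of the operation (the original source isn't shown here, so the function name and exact form are my own): replicate each byte of the low qword across eight output bytes, filling a zmm. The same 0..7-per-eight constant works both as the `vpermb` index and as the in-lane `vpshufb` control, and with VBMI enabled the shuffle combiner currently merges even the explicit two-instruction form into a single `vpermb`, which is the behaviour discussed here.

```c
#include <immintrin.h>

// Hypothetical reproducer (not the original source): replicate each byte of
// the low qword of `v` eight times, filling all 64 bytes of a zmm.
// idx[i] = i/8, i.e. 0 x8, 1 x8, ..., 7 x8.
static inline __m512i spread_qword_bytes(__m128i v) {
    const __m512i idx = _mm512_set_epi8(
        7,7,7,7,7,7,7,7, 6,6,6,6,6,6,6,6,
        5,5,5,5,5,5,5,5, 4,4,4,4,4,4,4,4,
        3,3,3,3,3,3,3,3, 2,2,2,2,2,2,2,2,
        1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0);
    // Two-instruction form: lane-crossing broadcast + cheap in-lane shuffle.
    // The same constant works as the vpshufb control because pshufb only
    // looks at the low 4 bits of each index within its 128-bit lane.
    __m512i b = _mm512_broadcastq_epi64(v);        // vpbroadcastq %xmm, %zmm
    return _mm512_shuffle_epi8(b, idx);            // vpshufb (in-lane)
    // Single-instruction form the combiner prefers on VBMI targets:
    // return _mm512_permutexvar_epi8(idx, _mm512_castsi128_si512(v));  // vpermb
}
```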
We need `combineX86ShuffleChain` to be told not just that we've combined with a variable shuffle mask, but whether it was a cheap in-lane mask or an expensive cross-lane mask.
Decomposing the `vpermb` back into a broadcast + `vpshufb` is worse on other uarches (those with full-width 512-bit vector ALUs, like Zen 5 and Intel), and only sometimes better on Zen 4. It spends an extra 512-bit uop on ports FP12 (the shuffle units) to save 3 cycles of latency from shuffle input to output. That only helps if the shuffle is part of a long enough dependency chain to create a bottleneck. (The extra ALU uops only hurt if that was the bottleneck, or if getting them executed delays something else, or if sharing a physical core with another thread that could have used those cycles.)
But often SIMD code gets inlined into loops. When the shuffle constant is hoisted into a register, it's either `vpermb %zmm, %zmm, %zmm` vs. `vpbroadcastq %xmm, %zmm` + `vpshufb %zmm, %zmm, %zmm`, so `vpermb` saves uops for the front end and ROB as well, letting out-of-order exec see farther.
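To make that scenario concrete, here's a hypothetical loop wrapping the `spread_qword_bytes()` sketch from above (again, my own illustration, not from the issue): once the control vector is hoisted out of the loop, the per-iteration choice is exactly one `vpermb` vs. `vpbroadcastq` + `vpshufb`.

```c
#include <immintrin.h>
#include <stddef.h>

// Hypothetical loop using the spread_qword_bytes() sketch defined above.
// With the 64-byte control vector hoisted out of the loop, each iteration
// costs either one vpermb, or a vpbroadcastq plus a vpshufb: one extra
// front-end/ROB uop and extra shuffle-port pressure per iteration for the
// two-instruction form.
void spread_many(__m512i *dst, const __m128i *src, size_t n) {
    for (size_t i = 0; i < n; i++)
        dst[i] = spread_qword_bytes(src[i]);
}
```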
So it's very situational and shouldn't be the default; that would pessimize code that was expecting a single `vpermb` in a loop for throughput. It's also common for SIMD loops to bottleneck on throughput, not latency, especially with 512-bit vectors. The two-instruction form costs more shuffle-ALU throughput even when the vector constant has to be loaded for every shuffle, and code that uses some shuffles often uses a lot of shuffles, to the point of easily being a bottleneck.
Instruction costs from https://uops.info/:

- `vpbroadcastq %xmm, %zmm` — Zen 4: 1c latency, 1 uop, 2 cycles on ports FP12. Intel: 1 uop for p5, 3c latency.
- `vpshufb zmm` — Zen 4: 2c latency, 1 uop, 2 cycles on ports FP12. Intel: 1 uop for p5, 1c latency. (2 uops with a memory source; Intel can't micro-fuse the load.)
- `vpermb zmm` — Zen 4: 6c latency, 1 uop, 2 cycles on ports FP12. Intel: 1 uop for p5, 3c latency.

Broadcast vec-to-vec costs a uop for a shuffle execution port. Zen 4 runs `vpbroadcastq %xmm, %zmm` with 1 cycle latency, although only 1/clock throughput despite being a single uop for ports FP12, so it does need to execute twice to produce both 256-bit halves of the output. On Intel it's a lane-crossing shuffle costing 3 cycles of latency, the same cost as `vpermb`.
If we were broadcasting from a memory source, then broadcast + `vpshufb` would be a nice win everywhere, as broadcast loads are fully handled in the load execution unit, no ALU needed. So `vmovq mem, %xmm` costs the same as `vpbroadcastq mem, %zmm`. Even on Zen 4, https://uops.info/ reports 0.5c throughput for `vpbroadcastq m64, %zmm`, so it's not taking 2 cycles in a load port to write both 256-bit halves of the result.
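As a concrete (hypothetical) sketch of that memory-source case, reusing the same 0..7-per-eight control constant from the earlier sketch: with the qword coming from memory, the broadcast should fold into the load, leaving the in-lane `vpshufb` as the only shuffle-ALU uop.

```c
#include <immintrin.h>
#include <stdint.h>

// Hypothetical variant of the earlier sketch: the source qword comes from
// memory, so the broadcast should become a broadcast *load*
// (vpbroadcastq (%rdi), %zmm), handled entirely in the load port,
// leaving vpshufb as the only shuffle-ALU uop. `idx` is the same
// 0..7-per-eight constant as above.
static inline __m512i spread_qword_bytes_mem(const int64_t *p, __m512i idx) {
    __m512i b = _mm512_set1_epi64(*p);     // broadcast load, no ALU shuffle uop
    return _mm512_shuffle_epi8(b, idx);    // single in-lane shuffle
}
```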
A GPR source is different on Zen 4: `vpbroadcastq %r64, %x/y/zmm` is 2 uops for the front-end/ROB, 1c throughput needing FP1 (xmm/ymm) or FP12 (zmm), vs. `vmovq %r64, %xmm` being 1 uop (no SIMD execution port specified, so maybe just an integer port; uops.info doesn't break down integer ports for AMD).
Future Intel E-cores that implement AVX10 256-bit with multiple 128-bit uops might benefit more from avoiding lane-crossing shuffles like `vpermb`, although their current AVX2 `vpermd` and `vperm2i128` are 2 uops / 6c latency, not bad like Zen 1 and Bulldozer were (`vperm2i128` is 8 uops on Zen 1, `vpermd ymm` is 3).
> So it's very situational and shouldn't be the default; that would pessimize code that was expecting a single `vpermb` in a loop for throughput.
I guess I wish I could have my cake and eat it too. Can't we figure out whether we're in a tight serial dependency chain and optimize for latency in such cases and optimize for throughput otherwise? I think I have heard that other optimizations make a decision like this, but I guess I don't know.
> I guess I wish I could have my cake and eat it too. Can't we figure out whether we're in a tight serial dependency chain and optimize for latency in such cases and optimize for throughput otherwise?
I'd love that, too, but LLVM doesn't seem to be that smart. It will avoid / break false dependencies if they're loop-carried (e.g. `xorps`-zero before `cvtsi2ss %r32, %xmm`), so at least one optimization pass does detect that. But LLVM in general seems to always favour latency over front-end throughput, at least in scalar code.
Even in loop prologues, it'll use LEA + a separate ADD (for a total of 2 cycle latency on Skylake) instead of using a 3-component LEA addressing mode like `[rdi + rcx*4 + 12]` (3 cycle latency on Skylake, and only runs on one port; on some other uarches it's only 2c latency, so break-even for latency and better in every other way). Being a single uop for the front-end / ROB has advantages when latency isn't the bottleneck, both for front-end throughput and for how far the out-of-order window can see.
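As a tiny hypothetical illustration of that trade-off (my own example, register choices assuming the SysV x86-64 calling convention): the pointer `dst + i*4 + 12` can be formed either with one 3-component LEA or with a simple LEA plus a separate ADD, which is exactly the split being described.

```c
#include <stdint.h>

// Hypothetical illustration of the trade-off described above: the address
// dst + i*4 + 12 can be formed either as
//   lea 12(%rdi,%rsi,4), %rax   ; 1 uop; 3c latency on Skylake, port 1 only
// or as
//   lea (%rdi,%rsi,4), %rax     ; 1c latency, p15
//   add $12, %rax               ; +1c latency, but an extra front-end/ROB uop
// Per the discussion above, Clang often picks the split form in loops,
// favouring latency over front-end throughput.
int32_t *elem_ptr(int32_t *dst, int64_t i) {
    return &dst[i + 3];   // byte offset = i*4 + 12
}
```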
For a slightly different example, consider (Godbolt):

```c
void foo(int *dst, int *src){
    int *enddst = dst + 10240;
    do {
        *dst++ = *(src++) * 19;
    } while(dst != enddst);
}
```
I was hoping to get an LEA in the prologue and pointer increments, but it still transforms to use a loop counter and indexed addressing modes (even if unrolling, when that's clearly worse on Intel SnB-family since it defeats micro-fusion, even on Haswell and later for insns that aren't of the form `op mem, rmw_dest_reg`; e.g. SSE2 `paddd mem, %xmm` can stay micro-fused, but AVX2 `vpaddd mem, %xmm, %xmm` can't: https://stackoverflow.com/questions/26046634/micro-fusion-and-addressing-modes).
Anyway, that loop can run faster on Skylake if compiled with `-Oz` (using `imul`) than with `-O3 -fno-vectorize -fno-unroll-loops` (using load + 2x LEA + store). And unrolling could let the `-Oz` version get to 1/clock on Skylake (4-wide issue/rename), while the `-O3` version could only approach that with large unroll factors.
```asm
# clang -Oz -march=skylake
# 5 uop loop (or 4 uops for Zen where 1/clock imul is the bottleneck)
foo:
        xorl    %eax, %eax
.LBB0_1:
        imull   $19, (%rsi,%rax), %ecx   # will un-laminate to 2 uops for issue/rename on Intel P-cores, still 1 on AMD
        movl    %ecx, (%rdi,%rax)        # stays micro-fused on Haswell and later
        addq    $4, %rax
        cmpq    $40960, %rax             # macro-fused with JCC
        jne     .LBB0_1
        retq
```
```asm
# clang -march=skylake -O3 -Wall -fno-tree-vectorize -fno-unroll-loops
# 6 uop loop, only Alder Lake / Zen 3 can run this at 1/clock
foo:
        xorl    %eax, %eax
.LBB0_1:
        movl    (%rsi,%rax), %ecx
        leal    (%rcx,%rcx,8), %edx
        leal    (%rcx,%rdx,2), %ecx
        movl    %ecx, (%rdi,%rax)
        addq    $4, %rax
        cmpq    $40960, %rax
        jne     .LBB0_1
        retq
```
So it optimized for latency even though there's only this one operation between load and store, ignoring the front-end bottleneck. Adding a `+8` still makes it choose a separate `add $8` instead of making it part of an LEA addressing mode. That would make one of the LEAs "complex": 3 cycle latency and thus only running on port 1 (Skylake's only port that has a multi-cycle integer ALU). But Skylake only has 2/clock LEA throughput even for simple LEAs, so we were already up against that back-end execution-unit bottleneck with 2 LEAs (each p15), same as with one IMUL (p1).
To be slightly fair, later uarches are common now, and Ice Lake onward and Zen have wider front-ends and 4/clock throughput for simple LEAs, 2/clock for complex LEAs, although the rules for what counts as simple vs. complex differ by uarch. On Ice Lake, all LEAs have 1 cycle latency (so a separate ADD instruction is always worse), but what makes an LEA "complex" (p15) is having a scale factor other than 1. `lea 123(%rdi, %rsi), %eax` is still p0156 on Ice Lake. Alder Lake is basically the same but with more execution ports: 3 that can handle scaled-index LEAs, 5 that can handle simple ones.
On Zen 4, 3-component (base, index, and displacement) LEAs are 2 uops, with one of the register inputs not being needed until the second uop (https://uops.info/ shows the latency as [1:2], meaning it differs by input). So manually splitting to LEA + ADD just costs more code-size and requires one of the register inputs sooner.
So we should stop splitting LEAs, since Skylake-derived uarches are not the most important anymore, and it was a tradeoff even there, not a pure win. (Except with `-mtune=skylake` if that's possible, ideally only when latency or port 1 throughput are bottlenecks.) I guess I should copy this to a new issue about that.
Anyway, back to the point of this: in a loop where throughput is the only meaningful bottleneck, Clang still chose to favour latency, hurting front-end throughput. (And break-even for back-end execution port throughput.) This example could have vectorized with `vpmulld` or shifts/adds, but I could have put something like a `tzcnt` in the loop that doesn't auto-vectorize easily without AVX-512. (A DeBruijn sequence could work well for 16-bit elements, IIRC.) Anyway, the point is, code like this can happen as part of throughput-bound loops that won't get vectorized, so I don't think it's too artificial to look at `-fno-vectorize` output.
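As an aside before getting back to the loop analysis, here's a minimal sketch of the DeBruijn trick mentioned above for 16-bit elements (my own illustration, not from the thread): isolate the lowest set bit, multiply by a 16-bit de Bruijn constant, and use the top 4 bits as a table index. The 4-bit index is what would make a vectorized version amenable to a `vpshufb`-style table lookup; the constant `0x09AF` and the table below are one valid choice, and the result is undefined for `x == 0`.

```c
#include <stdint.h>

// Sketch of a de Bruijn-based trailing-zero count for 16-bit values.
// (x & -x) isolates the lowest set bit; multiplying by a 16-bit de Bruijn
// constant puts a unique 4-bit pattern in the top nibble, which indexes a
// 16-entry table. Undefined for x == 0.
static inline int tzcnt16_debruijn(uint16_t x) {
    static const uint8_t table[16] = {
        0, 1, 2, 5, 3, 9, 6, 11, 15, 4, 8, 10, 14, 7, 13, 12
    };
    uint16_t isolated = (uint16_t)(x & -x);               // lowest set bit
    return table[(uint16_t)(isolated * 0x09AFu) >> 12];   // top nibble as index
}
```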
According to https://uica.uops.info/ (copy/paste just the loop, not the whole function), Skylake is bottlenecked only by the front-end on both these loops, so the `imul` one runs at 1.33c / iter, the `lea` one at 1.5c / iter, saturating issue/rename with 4 uops per clock fed from the uop cache.
With unrolling, we'd get to the 1/clock `imul` bottleneck for that version, but with 4 uops just for the loop body we'd never quite get to 1/clock for the LEA version. It's also less "friendly" to another logical core sharing the physical core, running more uops for the same work.
Surprisingly, 5-wide Ice Lake does worse here than 4-wide Skylake; it looks from the trace like it keeps losing cycles on the loop-carried dep chain (`add $4, %rax`) when other uops (usually the cmp/jcc of the previous iteration) execute on the port it was scheduled to: with p15 occupied by LEAs, the ADD and CMP/JCC uops have to compete for p06. Unrolling would fix that.
LLVM godbolt

Compiled for Zen 4, we get the single `vpermb` shown above. I think `vpermb` should be decomposed into `vpbroadcastq` + `vpshufb`.