[AVX-512] `vmovdqu8` constant memory operand not hoisted from loop

Validark commented 2 months ago

I have real code using a @select statement in Zig like so:

export fn testLoop(source: [*]@Vector(64, u8)) @Vector(64, u8) {
    var cur_src = source;
    var things: @Vector(64, u8) = @splat(0);

    while (true) {
        const str: @Vector(64, u8) = cur_src[0..64][0];

        const slot: u64 = @bitCast(str == @as(@Vector(64, u8), @splat(16))); // this vector of 16's gets hoisted from the loop
        things = @select(u8,
            @as(@Vector(64, u1), @bitCast(slot)) == @as(@Vector(64, u1), @splat(1)),
            @as(@Vector(64, u8), @splat(22)), // this vector of 22's does not get hoisted from the loop
            things);

        cur_src = cur_src[1..];
        if (cur_src[0][0] == 0) break;
    }

    return things;
}

Zen 4 emit:

.LCPI0_1:
        .zero   64,22
.LCPI0_2:
        .byte   16
testLoop:
        vpbroadcastb    zmm1, byte ptr [rip + .LCPI0_2]
        add     rdi, 64
        vpxor   xmm0, xmm0, xmm0
.LBB0_1:
        vpcmpeqb        k1, zmm1, zmmword ptr [rdi - 64]
        cmp     byte ptr [rdi], 0
        lea     rdi, [rdi + 64]
        vmovdqu8        zmm0 {k1}, zmmword ptr [rip + .LCPI0_1]; why aren't we doing a `vpbroadcastb` before the loop?
        jne     .LBB0_1
        ret

Zig godbolt link

In my opinion, it should be:

.LCPI0_1:
        .byte   22
.LCPI0_2:
        .byte   16
testLoop:
        vpbroadcastb    zmm1, byte ptr [rip + .LCPI0_2]
+       vpbroadcastb    zmm2, byte ptr [rip + .LCPI0_1]
        add     rdi, 64
        vpxor   xmm0, xmm0, xmm0
.LBB0_1:
        vpcmpeqb        k1, zmm1, zmmword ptr [rdi - 64]
        cmp     byte ptr [rdi], 0
        lea     rdi, [rdi + 64]
-       vmovdqu8        zmm0 {k1}, zmmword ptr [rip + .LCPI0_1]
+       vmovdqu8        zmm0 {k1}, zmm2
        jne     .LBB0_1
        ret

Here is the LLVM IR emitted by Zig:

(I ran zig build-obj ./src/llvmbadloop.zig -O ReleaseFast -target x86_64-linux -mcpu znver4 -femit-llvm-ir -fstrip, then erased the unnecessary target triple and other config stuff)

define dso_local <64 x i8> @testLoop(ptr nocapture nonnull readonly align 64 %0) local_unnamed_addr #0 {
Entry:
  %scevgep = getelementptr i8, ptr %0, i64 64
  br label %Loop

Loop:                                             ; preds = %Loop, %Entry
  %lsr.iv = phi ptr [ %scevgep5, %Loop ], [ %scevgep, %Entry ]
  %.sroa.03.0 = phi <64 x i8> [ zeroinitializer, %Entry ], [ %3, %Loop ]
  %scevgep6 = getelementptr i8, ptr %lsr.iv, i64 -64
  %1 = load <64 x i8>, ptr %scevgep6, align 64
  %2 = icmp eq <64 x i8> %1, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
  %3 = select <64 x i1> %2, <64 x i8> <i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22>, <64 x i8> %.sroa.03.0
  %4 = load i8, ptr %lsr.iv, align 64
  %5 = icmp eq i8 %4, 0
  %scevgep5 = getelementptr i8, ptr %lsr.iv, i64 64
  br i1 %5, label %Then, label %Loop

Then:                                             ; preds = %Loop
  ret <64 x i8> %3
}

LLVM IR dump Godbolt link

llvmbot commented 2 months ago

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)

I have real code using a `@select` statement in Zig like so: ```zig export fn testLoop(source: [*]@Vector(64, u8)) @Vector(64, u8) { var cur_src = source; var things: @Vector(64, u8) = @splat(0); while (true) { const str: @Vector(64, u8) = cur_src[0..64][0]; const slot: u64 = @bitCast(str == @as(@Vector(64, u8), @splat(16))); // this vector of 16's gets hoisted from the loop things = @select(u8, @as(@Vector(64, u1), @bitCast(slot)) == @as(@Vector(64, u1), @splat(1)), @as(@Vector(64, u8), @splat(22)), // this vector of 22's does not get hoisted from the loop things); cur_src = cur_src[1..]; if (cur_src[0][0] == 0) break; } return things; } ``` Zen 4 emit: ```asm .LCPI0_1: .zero 64,22 .LCPI0_2: .byte 16 testLoop: vpbroadcastb zmm1, byte ptr [rip + .LCPI0_2] add rdi, 64 vpxor xmm0, xmm0, xmm0 .LBB0_1: vpcmpeqb k1, zmm1, zmmword ptr [rdi - 64] cmp byte ptr [rdi], 0 lea rdi, [rdi + 64] vmovdqu8 zmm0 {k1}, zmmword ptr [rip + .LCPI0_1]; why aren't we doing a `vpbroadcastb` before the loop? jne .LBB0_1 ret ``` [Zig godbolt link](https://zig.godbolt.org/z/4aT4o17b5) In my opinion, it should be: ```diff .LCPI0_1: .byte 22 .LCPI0_2: .byte 16 testLoop: vpbroadcastb zmm1, byte ptr [rip + .LCPI0_2] + vpbroadcastb zmm2, byte ptr [rip + .LCPI0_1] add rdi, 64 vpxor xmm0, xmm0, xmm0 .LBB0_1: vpcmpeqb k1, zmm1, zmmword ptr [rdi - 64] cmp byte ptr [rdi], 0 lea rdi, [rdi + 64] - vmovdqu8 zmm0 {k1}, zmmword ptr [rip + .LCPI0_1] + vmovdqu8 zmm0 {k1}, zmm2 jne .LBB0_1 ret ``` Here is the LLVM IR emitted by Zig: (I ran `zig build-obj ./src/llvmbadloop.zig -O ReleaseFast -target x86_64-linux -mcpu znver4 -femit-llvm-ir -fstrip`, then erased the unnecessary `target triple` and other config stuff) ```llvm define dso_local <64 x i8> @testLoop(ptr nocapture nonnull readonly align 64 %0) local_unnamed_addr #0 { Entry: %scevgep = getelementptr i8, ptr %0, i64 64 br label %Loop Loop: ; preds = %Loop, %Entry %lsr.iv = phi ptr [ %scevgep5, %Loop ], [ %scevgep, %Entry ] %.sroa.03.0 = phi <64 x i8> [ zeroinitializer, %Entry ], [ %3, %Loop ] %scevgep6 = getelementptr i8, ptr %lsr.iv, i64 -64 %1 = load <64 x i8>, ptr %scevgep6, align 64 %2 = icmp eq <64 x i8> %1, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16> %3 = select <64 x i1> %2, <64 x i8> <i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22, i8 22>, <64 x i8> %.sroa.03.0 %4 = load i8, ptr %lsr.iv, align 64 %5 = icmp eq i8 %4, 0 %scevgep5 = getelementptr i8, ptr %lsr.iv, i64 64 br i1 %5, label %Then, label %Loop Then: ; preds = %Loop ret <64 x i8> %3 } ``` [LLVM IR dump Godbolt link](https://godbolt.org/z/TerKEGK5f)

tgymnich commented 2 months ago

MachineLICM is not applied currently because the instruction depends on a PHI node (%v2) defined inside the loop.

; predecessors: %bb.0, %bb.1
  successors: %bb.2(0x04000000), %bb.1(0x7c000000); %bb.2(3.12%), %bb.1(96.88%)

  %1:gr64 = PHI %0:gr64, %bb.0, %4:gr64, %bb.1
  %2:vr512 = PHI %6:vr512, %bb.0, %3:vr512, %bb.1
  %7:vr512 = VMOVDQA64Zrm %1:gr64, 1, $noreg, -64, $noreg :: (load (s512) from %ir.scevgep6)
  %8:vk64wm = VPCMPBZrri killed %7:vr512, %9:vr512, 0
  %3:vr512 = VMOVDQU8Zrmk %2:vr512(tied-def 0), killed %8:vk64wm, $rip, 1, $noreg, %const.1, $noreg :: (load (s512) from constant-pool)
  %4:gr64 = ADD64ri32 %1:gr64(tied-def 0), 64, implicit-def dead $eflags
  CMP8mi %1:gr64, 1, $noreg, 0, $noreg, 0, implicit-def $eflags :: (load (s8) from %ir.lsr.iv, align 64)
  JCC_1 %bb.1, 5, implicit $eflags
  JMP_1 %bb.2

RKSimon commented 2 months ago

Its simpler than that - we don't allow unfolding VMOVDQU8Zrmk instructions back to VMOVDQU8Zrrk to prevent illegal memory access (bounded by the predicate mask).

{X86::VMOVDQU8Zrrk, X86::VMOVDQU8Zrmk, TB_NO_REVERSE},

@KanRobert how easy would it be to safely allow constant pool loads to be unfolded?

llvm / llvm-project

[AVX-512] `vmovdqu8` constant memory operand not hoisted from loop #104560