llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
29.3k stars 12.11k forks source link

[x86][znver3] Dead 32-byte constant accessed via `vbroadcasti128` not eliminated from assembly #110305

Open Validark opened 2 months ago

Validark commented 2 months ago

I had this code: (Godbolt link)

const std = @import("std");
const builtin = @import("builtin");

/// Reproducer entry point: performs two `vpshufb` lookups into the same
/// 32-byte table and adds the results lane-wise.
///
/// The first lookup is indexed by `x >> 4` (the high nibble of every byte).
/// The second lookup is indexed by `x` itself and is kept only in lanes where
/// the input byte equals zero; all other lanes contribute 0 to the sum.
export fn foo(x: @Vector(32, u8)) @TypeOf(x) {
    // 16-entry table repeated to the full vector length (32 bytes), built at
    // compile time. Entry n equals the number of leading zero bits of the
    // 4-bit value n (4,3,2,2,1,1,1,1,0,...).
    const vec: @TypeOf(x) = comptime std.simd.repeat(@sizeOf(@TypeOf(x)), [16]u8{4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0});
    // NOTE: the operand order of this `+` is part of the reproduction. The
    // report states that swapping the two operands makes the extra 32-byte
    // constant disappear from the generated assembly, so do not reorder.
    return vpshufb(vec, x >> @splat(4)) + @select(u8, x == @as(@TypeOf(x), @splat(0)), vpshufb(vec, x), @as(@TypeOf(x), @splat(0)));
}

/// Byte-shuffle helper: uses each byte of `indices` to select a byte from
/// `table` and returns the resulting vector (same type as `table`).
///
/// At runtime this forwards to the LLVM x86 `pshuf.b` intrinsic matching the
/// vector width (16, 32 or 64 bytes). When evaluated at comptime it runs a
/// software loop instead.
///
/// Compile errors are raised when `table` is not a 16/32/64-element `u8`
/// vector, or when the target CPU lacks the feature required for that width
/// (ssse3 / avx2 / avx512bw respectively).
fn vpshufb(table: anytype, indices: @TypeOf(table)) @TypeOf(table) {
    // Comptime path: software emulation, one output byte per input index.
    if (@inComptime()) {
        var result: @TypeOf(indices) = undefined;
        // Iterate over every byte position of the vector (bit size / 8).
        for (0..@bitSizeOf(@TypeOf(indices)) / 8) |i| {
            const index = indices[i];
            // An index with its top bit set (>= 0x80) yields 0; otherwise the
            // index is reduced modulo the vector's byte count and used to
            // read from `table`.
            // NOTE(review): the x86 instruction is documented to select
            // within each 128-bit lane using only the low 4 index bits; this
            // fallback wraps over the whole vector width instead, so it may
            // differ from hardware for 32/64-byte tables that are not a
            // repeated 16-byte pattern -- confirm intended.
            result[i] = if (index >= 0x80) 0 else table[index % (@bitSizeOf(@TypeOf(table)) / 8)];
        }

        return result;
    }

    // Declarations of the LLVM intrinsics, one per supported vector width.
    // The quoted identifiers are the intrinsic names LLVM resolves.
    const methods = struct {
        extern fn @"llvm.x86.avx512.pshuf.b.512"(@Vector(64, u8), @Vector(64, u8)) @Vector(64, u8);
        extern fn @"llvm.x86.avx2.pshuf.b"(@Vector(32, u8), @Vector(32, u8)) @Vector(32, u8);
        extern fn @"llvm.x86.ssse3.pshuf.b.128"(@Vector(16, u8), @Vector(16, u8)) @Vector(16, u8);
    };

    // Dispatch on the vector type; each arm checks at comptime that the
    // target CPU has the needed feature and otherwise fails compilation.
    return switch (@TypeOf(table)) {
        @Vector(64, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx512bw)) methods.@"llvm.x86.avx512.pshuf.b.512"(table, indices) else @compileError("CPU target lacks support for vpshufb512"),
        @Vector(32, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) methods.@"llvm.x86.avx2.pshuf.b"(table, indices) else @compileError("CPU target lacks support for vpshufb256"),
        @Vector(16, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) methods.@"llvm.x86.ssse3.pshuf.b.128"(table, indices) else @compileError("CPU target lacks support for vpshufb128"),
        else => @compileError(std.fmt.comptimePrint("Invalid argument type passed to vpshufb: {}\n", .{@TypeOf(table)})),
    };
}

It produces this ASM:

.LCPI0_0:
        .zero   32,15
        .byte   4
        .byte   3
        .byte   2
        .byte   2
        .byte   1
        .byte   1
        .byte   1
        .byte   1
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   4
        .byte   3
        .byte   2
        .byte   2
        .byte   1
        .byte   1
        .byte   1
        .byte   1
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
.LCPI0_2:
        .byte   4
        .byte   3
        .byte   2
        .byte   2
        .byte   1
        .byte   1
        .byte   1
        .byte   1
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
foo:
        vbroadcasti128  ymm2, xmmword ptr [rip + .LCPI0_2]
        vpsrlw  ymm1, ymm0, 4
        vpand   ymm1, ymm1, ymmword ptr [rip + .LCPI0_0]
        vpxor   xmm3, xmm3, xmm3
        vpcmpeqb        ymm3, ymm0, ymm3
        vpshufb ymm0, ymm2, ymm0
        vpshufb ymm1, ymm2, ymm1
        vpand   ymm0, ymm3, ymm0
        vpaddb  ymm0, ymm0, ymm1
        ret

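Looks like the 32-byte constant `LCPI0_1` (the 32 unlabeled `.byte` lines that follow `.LCPI0_0` in the listing above; its label does not appear there, apparently because nothing references it) was still emitted, even though the code ultimately decided to use `vbroadcasti128` instead, so only the 16-byte version in `LCPI0_2` was necessary.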

Interestingly, when I swap the two operands of the addition, i.e. change `vpshufb(vec, x >> @splat(4)) + @select(u8, x == @as(@TypeOf(x), @splat(0)), vpshufb(vec, x), @as(@TypeOf(x), @splat(0)));` to `@select(u8, x == @as(@TypeOf(x), @splat(0)), vpshufb(vec, x), @as(@TypeOf(x), @splat(0))) + vpshufb(vec, x >> @splat(4));`, the problem disappears:

.LCPI0_1:
        .zero   32,15
.LCPI0_2:
        .byte   4
        .byte   3
        .byte   2
        .byte   2
        .byte   1
        .byte   1
        .byte   1
        .byte   1
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
foo:
        vbroadcasti128  ymm2, xmmword ptr [rip + .LCPI0_2]
        vpxor   xmm1, xmm1, xmm1
        vpcmpeqb        ymm1, ymm0, ymm1
        vpshufb ymm3, ymm2, ymm0
        vpsrlw  ymm0, ymm0, 4
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_1]
        vpand   ymm1, ymm1, ymm3
        vpshufb ymm0, ymm2, ymm0
        vpaddb  ymm0, ymm1, ymm0
        ret

Here is a dump of the offending code (the function is named `foo2` in this dump), produced via `zig build-obj ./src/llvm_code.zig -O ReleaseFast -target x86_64-linux -mcpu znver3 --verbose-llvm-ir -fstrip >llvm_code.ll 2>&1`

; ModuleID = 'llvm_code'
source_filename = "llvm_code"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-musl"

%Target.Cpu.Feature.Set = type { [5 x i64] }
%Target.Cpu.Model = type { { ptr, i64 }, { ptr, i64 }, %Target.Cpu.Feature.Set }
%Target.Cpu = type { ptr, %Target.Cpu.Feature.Set, i6, [7 x i8] }
%Target.DynamicLinker = type { [255 x i8], i8 }

@builtin.zig_backend = internal unnamed_addr constant i64 2, align 8
@Target.Cpu.Feature.Set.empty = internal unnamed_addr constant %Target.Cpu.Feature.Set zeroinitializer, align 8
@Target.x86.cpu.znver3 = internal unnamed_addr constant %Target.Cpu.Model { { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_225, i64 0), i64 6 }, { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_225, i64 0), i64 6 }, %Target.Cpu.Feature.Set { [5 x i64] [i64 148023677034692784, i64 110359908924890900, i64 8266358215819756484, i64 0, i64 0] } }, align 8
@__anon_225 = internal unnamed_addr constant [7 x i8] c"znver3\00", align 1
@builtin.cpu = internal unnamed_addr constant %Target.Cpu { ptr getelementptr inbounds (i8, ptr @Target.x86.cpu.znver3, i64 0), %Target.Cpu.Feature.Set { [5 x i64] [i64 152527276662137072, i64 119367108179631892, i64 8842824431321580484, i64 0, i64 0] }, i6 -22, [7 x i8] undef }, align 8
@start.simplified_logic = internal unnamed_addr constant i1 false, align 1
@builtin.output_mode = internal unnamed_addr constant i2 -2, align 1
@Target.DynamicLinker.none = internal unnamed_addr constant %Target.DynamicLinker { [255 x i8] undef, i8 0 }, align 1

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local <32 x i8> @foo2(<32 x i8> %0) #0 {
1:
  %2 = zext <32 x i3> <i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4> to <32 x i8>
  %3 = lshr <32 x i8> %0, %2
  %4 = call fastcc <32 x i8> @llvm_code.vpshufb__anon_1530(<32 x i8> <i8 4, i8 3, i8 2, i8 2, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 4, i8 3, i8 2, i8 2, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i8> %3)
  %5 = icmp eq <32 x i8> %0, zeroinitializer
  %6 = call fastcc <32 x i8> @llvm_code.vpshufb__anon_1530(<32 x i8> <i8 4, i8 3, i8 2, i8 2, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 4, i8 3, i8 2, i8 2, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i8> %0)
  %7 = select <32 x i1> %5, <32 x i8> %6, <32 x i8> zeroinitializer
  %8 = add nuw <32 x i8> %4, %7
  ret <32 x i8> %8
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <32 x i8> @llvm_code.vpshufb__anon_1530(<32 x i8> %0, <32 x i8> %1) unnamed_addr #0 {
2:
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %0, <32 x i8> %1)
  ret <32 x i8> %3
}

; Function Attrs: nounwind uwtable
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %0, <32 x i8> %1) #1

attributes #0 = { nounwind uwtable nosanitize_coverage skipprofile "frame-pointer"="none" "target-cpu"="znver3" "target-features"="-16bit-mode,-32bit-mode,+64bit,+adx,+aes,+allow-light-256-bit,-amx-bf16,-amx-complex,-amx-fp16,-amx-int8,-amx-tile,+avx,-avx10.1-256,-avx10.1-512,+avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,+bmi,+bmi2,-branch-hint,+branchfusion,-ccmp,-cf,-cldemote,+clflushopt,+clwb,+clzero,+cmov,-cmpccxadd,+crc32,+cx16,+cx8,-egpr,-enqcmd,-ermsb,-evex512,+f16c,-false-deps-getmant,-false-deps-lzcnt-tzcnt,-false-deps-mulc,-false-deps-mullq,-false-deps-perm,-false-deps-popcnt,-false-deps-range,-fast-11bytenop,+fast-15bytenop,-fast-7bytenop,+fast-bextr,-fast-dpwssd,-fast-gather,-fast-hops,+fast-imm16,+fast-lzcnt,+fast-movbe,+fast-scalar-fsqrt,+fast-scalar-shift-masks,-fast-shld-rotate,-fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle,+fast-vector-fsqrt,-fast-vector-shift-masks,-faster-shift-than-shuffle,+fma,-fma4,+fsgsbase,+fsrm,+fxsr,-gfni,-harden-sls-ijmp,-harden-sls-ret,-hreset,-idivl-to-divb,+idivq-to-divl,-inline-asm-use-gpr32,+invpcid,-kl,-lea-sp,-lea-uses-ag,-lvi-cfi,-lvi-load-hardening,-lwp,+lzcnt,+macrofusion,+mmx,+movbe,-movdir64b,-movdiri,+mwaitx,-ndd,-nf,-no-bypass-delay,-no-bypass-delay-blend,-no-bypass-delay-mov,-no-bypass-delay-shuffle,+nopl,-pad-short-functions,+pclmul,-pconfig,+pku,+popcnt,-ppx,-prefer-128-bit,-prefer-256-bit,-prefer-mask-registers,-prefer-movmsk-over-vtest,-prefer-no-gather,-prefer-no-scatter,-prefetchi,+prfchw,-ptwrite,-push2pop2,-raoint,+rdpid,+rdpru,+rdrnd,+rdseed,-retpoline,-retpoline-external-thunk,-retpoline-indirect-branches,-retpoline-indirect-calls,-rtm,+sahf,+sbb-dep-breaking,-serialize,-seses,-sgx,+sha,-sha512,-shstk,-slow-3ops-lea,-slow-incdec,-slow-lea,-slow-pmaddwd,-slow-pmulld,+slow-shld,-slow-two-mem-ops
,-slow-unaligned-mem-16,-slow-unaligned-mem-32,-sm3,-sm4,-soft-float,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,-sse-unaligned-mem,+ssse3,-tagged-globals,-tbm,-tsxldtrk,-tuning-fast-imm-vector-shift,-uintr,-use-glm-div-sqrt-costs,-use-slm-arith-costs,-usermsr,+vaes,+vpclmulqdq,+vzeroupper,-waitpkg,+wbnoinvd,-widekl,+x87,-xop,+xsave,+xsavec,+xsaveopt,+xsaves,-zu" }
attributes #1 = { nounwind uwtable "frame-pointer"="none" "target-cpu"="znver3" "target-features"="-16bit-mode,-32bit-mode,+64bit,+adx,+aes,+allow-light-256-bit,-amx-bf16,-amx-complex,-amx-fp16,-amx-int8,-amx-tile,+avx,-avx10.1-256,-avx10.1-512,+avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,+bmi,+bmi2,-branch-hint,+branchfusion,-ccmp,-cf,-cldemote,+clflushopt,+clwb,+clzero,+cmov,-cmpccxadd,+crc32,+cx16,+cx8,-egpr,-enqcmd,-ermsb,-evex512,+f16c,-false-deps-getmant,-false-deps-lzcnt-tzcnt,-false-deps-mulc,-false-deps-mullq,-false-deps-perm,-false-deps-popcnt,-false-deps-range,-fast-11bytenop,+fast-15bytenop,-fast-7bytenop,+fast-bextr,-fast-dpwssd,-fast-gather,-fast-hops,+fast-imm16,+fast-lzcnt,+fast-movbe,+fast-scalar-fsqrt,+fast-scalar-shift-masks,-fast-shld-rotate,-fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle,+fast-vector-fsqrt,-fast-vector-shift-masks,-faster-shift-than-shuffle,+fma,-fma4,+fsgsbase,+fsrm,+fxsr,-gfni,-harden-sls-ijmp,-harden-sls-ret,-hreset,-idivl-to-divb,+idivq-to-divl,-inline-asm-use-gpr32,+invpcid,-kl,-lea-sp,-lea-uses-ag,-lvi-cfi,-lvi-load-hardening,-lwp,+lzcnt,+macrofusion,+mmx,+movbe,-movdir64b,-movdiri,+mwaitx,-ndd,-nf,-no-bypass-delay,-no-bypass-delay-blend,-no-bypass-delay-mov,-no-bypass-delay-shuffle,+nopl,-pad-short-functions,+pclmul,-pconfig,+pku,+popcnt,-ppx,-prefer-128-bit,-prefer-256-bit,-prefer-mask-registers,-prefer-movmsk-over-vtest,-prefer-no-gather,-prefer-no-scatter,-prefetchi,+prfchw,-ptwrite,-push2pop2,-raoint,+rdpid,+rdpru,+rdrnd,+rdseed,-retpoline,-retpoline-external-thunk,-retpoline-indirect-branches,-retpoline-indirect-calls,-rtm,+sahf,+sbb-dep-breaking,-serialize,-seses,-sgx,+sha,-sha512,-shstk,-slow-3ops-lea,-slow-incdec,-slow-lea,-slow-pmaddwd,-slow-pmulld,+slow-shld,-slow-two-mem-ops,-slow-unaligned-mem-16,-slow-un
aligned-mem-32,-sm3,-sm4,-soft-float,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,-sse-unaligned-mem,+ssse3,-tagged-globals,-tbm,-tsxldtrk,-tuning-fast-imm-vector-shift,-uintr,-use-glm-div-sqrt-costs,-use-slm-arith-costs,-usermsr,+vaes,+vpclmulqdq,+vzeroupper,-waitpkg,+wbnoinvd,-widekl,+x87,-xop,+xsave,+xsavec,+xsaveopt,+xsaves,-zu" }

!llvm.module.flags = !{}
llvmbot commented 2 months ago

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)

I had this code: ([Godbolt link](https://zig.godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYgAzKVpMGoAF55gpJfWQE8Ayo3QBhVLQCuLBiADsBpwAyeAyYAHKeAEaYxCAAbKQADqgKhPYMrh5evgZJKXYCQSHhLFEx8daYtmlCBEzEBBme3n5WmDb5DDV1BIVhkdFxVrX1jVktCsM9wX0lA7EAlFao7sTI7BxoDBMA1BPo2wCkegAi2wACeCxJ9RAHAEx3e/d380cAQgcaAIKbOxHudDsDEOJ3Ol2uBFuD3%2BgOCz1eeg%2B30%2BX0wqgh2yowP4qAg6nOADVKkRiBA9HdSNt3AAOebzc4AFQAnglMAB5Kh4%2BkHHxIr7bAXbX4EbYAN0qIEZLPZnNU3NBaCudjYuwI6AAdCkWBriJhWUxIWcUqYZRAzszWRyuYtDgBWN5cWIHW3HGk8t7SAwUilcUi%2B/2kDSB4NB0MhnnHBF8wXbXUEFbA0UJBQIdxUCIQcXISmqEHYI7Yc4KBJGSGSOmHO5vIttYkQGk5kHHI6nM5MBRmi2muWUo0lg0QDR0ylJlNpjNZnM2tsd83Sq09ov9yFDunvFERjffLFi5Op9MQWoRcTbYxMgjSynBfBrBSSueWzlH%2BgVh%2Bm5%2BYbm8lExvBUbZmsErhKpcmAQBW7o/jGgqinUsaYAo7i0AQ95dla154Le8qnO4DBYDQIToOu3zQYK/DEABGjquqZwRIQQh4CaVpvuheGYQhFYAPTbLShw%2BM4eA8s4fHRqR0HCts15ok2klsbezpvIJLrEfyYkxrqiHIQpSmnC2kn/hAUm5gWekaKo1JDtsGjbG0SjbB%2B2l4dJ9y2gBtH0YxposU%2BTDHp%2B9LcbSzrNoiUGkZuyIkWJcYJvBmkECpMYRV8YVCgIOxsAQCAYAoMkTMQ7i2CJqUxmiBDRNiwJnM8tC0KKLDquZsTqkwoqqLaXB3Oqe5puqETqh1dzPGaRK2CQECxNIVK0r2o0khNU00q%2Bc3jZNlJLYlYllRVmJVTVdUNU1LVtV1PVUH1w1nCtpLkutM2EsS423dNy2PTdFIvZtpHbcQlXnPt9WNdSzUKKDmB6N1Y7nf1nXUpd10QI6d3TgjSMvfSV1vYj8SfaFUUCpueMpfj8Hxr9uwAO6EMgCBuWhPl%2BRB34kzGmNjaSa0vU2Bb6QBioJMqmCqhqDJ1MAmAEEDzVUJgBorJgQgSwAEu2EAwshwTqsgCTuOqMtyxplLHe1nURBTFaZdl6AKDRAOHcDxuDZD%2B59QNnXDR%2BV5yRxNm0HZZz83QODEMQ43PM4yhyPZYsS9sRjIAA1rliEJBi5G7lDESDfCpAlYKbPzc9S3c3ohZ/nzqAgSqezqqLxDi5LR362TCvK6r6tAlrOt67LLcKEbrWqC89KWzltsPLVgNHYPp1QxdDyHr59BezePu2ULAeVwkQfYCHYcPBHUfDA3cdMInyfuKnJAiuno77hEdy2k6DyLHnAoF%2BNaPFy2PPlxA/OC2FrXGOjcHbN3lorAgKsOwd01trXW4DDbbE1GDPQFsJZWxttVCeB0pYoKUBDM6rtYYeyXpgFe7EFD0nXucQO9Bd6h1JOHSO0d66x3jknXYl804kAzvfEhL9c4s0FDQn%2BpdaFbx3nvUkNcqAsElgA0CyhiDBEhM8AAkgwWCtA8D7DFp4RgIoLysm2AkdsSh9hED4eOSUkEfDNltM4Bgzwjbum8ovRmEZhypUJnyCMHBFi0E4LaXg3huC8FQJwAAWhYXYyxVhC3uHoPQvAUIcC0MOBAsssAxHAqQBOIBbQQy4FwDQ1IACcegn6xFKbEW0hhOCSF4CwCQGggxhK0KQSJHBeB3iDGkjJpA4CwC
QHQ6I5BKBjJiKYAgBUGAJz4ICaId41aaF4HRZgxAmScB4KQDZdQmRsgiNoYkOzeCKjYIINkDBaDbPSbwLA/xgDODEH7M5pAsAsGMMAcQ9yPl4F1FUcUd4/lokqO4cq7zVFtDWYYPAERiAHNcFgWFszLjvPFMQCIyRMDHEwF8kwOiTBrMWFQIwwAFAEjwJgCmbJWRhN2fwQQIgxDsCkDIQQigVDqD%2BboX0RhiXmEsDoiId5ICLFQALNIIKAC0bJtgACVaztkwAAMXbCKGVx9Y5NQAPqTRlUS9wuYZUsHgS2UwWjogpK6ZilRWAxV5IqFUBwEAnCjG8FwakpBAjTGKKUEAkhfS5FSAID1EhvUho6L0f1Awg2tHaNUSY4avUJuJEm7oMb%2BgxHjRMboKbvV5vqFm2YOauCLAUPEtY%2BhAnBNCbC7p2whX2TmQnACuBCC8KSWg1JJLFhZKYDkygiwCmSGatSCptTJDJIqVUikQSOBNNIC00p7SG2cF6SAfpfaGkcDuPWv53Te33OHJilIDhJBAA%3D%3D%3D)) ```zig const std = @import("std"); const builtin = @import("builtin"); export fn foo(x: @Vector(32, u8)) @TypeOf(x) { const vec: @TypeOf(x) = comptime std.simd.repeat(@sizeOf(@TypeOf(x)), [16]u8{4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0}); return vpshufb(vec, x >> @splat(4)) + @select(u8, x == @as(@TypeOf(x), @splat(0)), vpshufb(vec, x), @as(@TypeOf(x), @splat(0))); } fn vpshufb(table: anytype, indices: @TypeOf(table)) @TypeOf(table) { if (@inComptime()) { var result: @TypeOf(indices) = undefined; for (0..@bitSizeOf(@TypeOf(indices)) / 8) |i| { const index = indices[i]; result[i] = if (index >= 0x80) 0 else table[index % (@bitSizeOf(@TypeOf(table)) / 8)]; } return result; } const methods = struct { extern fn @"llvm.x86.avx512.pshuf.b.512"(@Vector(64, u8), @Vector(64, u8)) @Vector(64, u8); extern fn @"llvm.x86.avx2.pshuf.b"(@Vector(32, u8), @Vector(32, u8)) @Vector(32, u8); extern fn @"llvm.x86.ssse3.pshuf.b.128"(@Vector(16, u8), @Vector(16, u8)) @Vector(16, u8); }; return switch (@TypeOf(table)) { @Vector(64, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx512bw)) methods.@"llvm.x86.avx512.pshuf.b.512"(table, indices) else @compileError("CPU target lacks support for vpshufb512"), @Vector(32, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) methods.@"llvm.x86.avx2.pshuf.b"(table, indices) else @compileError("CPU target lacks support for vpshufb256"), @Vector(16, u8) => if 
(comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) methods.@"llvm.x86.ssse3.pshuf.b.128"(table, indices) else @compileError("CPU target lacks support for vpshufb128"), else => @compileError(std.fmt.comptimePrint("Invalid argument type passed to vpshufb: {}\n", .{@TypeOf(table)})), }; } ``` Produce this ASM: ```asm .LCPI0_0: .zero 32,15 .byte 4 .byte 3 .byte 2 .byte 2 .byte 1 .byte 1 .byte 1 .byte 1 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 4 .byte 3 .byte 2 .byte 2 .byte 1 .byte 1 .byte 1 .byte 1 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .LCPI0_2: .byte 4 .byte 3 .byte 2 .byte 2 .byte 1 .byte 1 .byte 1 .byte 1 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 foo: vbroadcasti128 ymm2, xmmword ptr [rip + .LCPI0_2] vpsrlw ymm1, ymm0, 4 vpand ymm1, ymm1, ymmword ptr [rip + .LCPI0_0] vpxor xmm3, xmm3, xmm3 vpcmpeqb ymm3, ymm0, ymm3 vpshufb ymm0, ymm2, ymm0 vpshufb ymm1, ymm2, ymm1 vpand ymm0, ymm3, ymm0 vpaddb ymm0, ymm0, ymm1 ret ``` Looks like `LCPI0_1` was sort of put there but our code ultimately decided to use `vbroadcasti128` instead so only the 16-byte version in `LCPI0_2` was necessary. 
Interestingly, when I flip the `vpshufb(vec, x >> @splat(4)) + @select(u8, x == @as(@TypeOf(x), @splat(0)), vpshufb(vec, x), @as(@TypeOf(x), @splat(0)));` to be `@select(u8, x == @as(@TypeOf(x), @splat(0)), vpshufb(vec, x), @as(@TypeOf(x), @splat(0))) + vpshufb(vec, x >> @splat(4));`, the problem disappears: ```asm .LCPI0_1: .zero 32,15 .LCPI0_2: .byte 4 .byte 3 .byte 2 .byte 2 .byte 1 .byte 1 .byte 1 .byte 1 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 .byte 0 foo: vbroadcasti128 ymm2, xmmword ptr [rip + .LCPI0_2] vpxor xmm1, xmm1, xmm1 vpcmpeqb ymm1, ymm0, ymm1 vpshufb ymm3, ymm2, ymm0 vpsrlw ymm0, ymm0, 4 vpand ymm0, ymm0, ymmword ptr [rip + .LCPI0_1] vpand ymm1, ymm1, ymm3 vpshufb ymm0, ymm2, ymm0 vpaddb ymm0, ymm1, ymm0 ret ``` Here is a dump of the offending code via `zig build-obj ./src/llvm_code.zig -O ReleaseFast -target x86_64-linux -mcpu znver3 --verbose-llvm-ir -fstrip >llvm_code.ll 2>&1` ```llvm ; ModuleID = 'llvm_code' source_filename = "llvm_code" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-musl" %Target.Cpu.Feature.Set = type { [5 x i64] } %Target.Cpu.Model = type { { ptr, i64 }, { ptr, i64 }, %Target.Cpu.Feature.Set } %Target.Cpu = type { ptr, %Target.Cpu.Feature.Set, i6, [7 x i8] } %Target.DynamicLinker = type { [255 x i8], i8 } @builtin.zig_backend = internal unnamed_addr constant i64 2, align 8 @Target.Cpu.Feature.Set.empty = internal unnamed_addr constant %Target.Cpu.Feature.Set zeroinitializer, align 8 @Target.x86.cpu.znver3 = internal unnamed_addr constant %Target.Cpu.Model { { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_225, i64 0), i64 6 }, { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_225, i64 0), i64 6 }, %Target.Cpu.Feature.Set { [5 x i64] [i64 148023677034692784, i64 110359908924890900, i64 8266358215819756484, i64 0, i64 0] } }, align 8 @__anon_225 = internal unnamed_addr constant [7 x i8] c"znver3\00", 
align 1 @builtin.cpu = internal unnamed_addr constant %Target.Cpu { ptr getelementptr inbounds (i8, ptr @Target.x86.cpu.znver3, i64 0), %Target.Cpu.Feature.Set { [5 x i64] [i64 152527276662137072, i64 119367108179631892, i64 8842824431321580484, i64 0, i64 0] }, i6 -22, [7 x i8] undef }, align 8 @start.simplified_logic = internal unnamed_addr constant i1 false, align 1 @builtin.output_mode = internal unnamed_addr constant i2 -2, align 1 @Target.DynamicLinker.none = internal unnamed_addr constant %Target.DynamicLinker { [255 x i8] undef, i8 0 }, align 1 ; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile define dso_local <32 x i8> @foo2(<32 x i8> %0) #0 { 1: %2 = zext <32 x i3> <i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4, i3 -4> to <32 x i8> %3 = lshr <32 x i8> %0, %2 %4 = call fastcc <32 x i8> @llvm_code.vpshufb__anon_1530(<32 x i8> <i8 4, i8 3, i8 2, i8 2, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 4, i8 3, i8 2, i8 2, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i8> %3) %5 = icmp eq <32 x i8> %0, zeroinitializer %6 = call fastcc <32 x i8> @llvm_code.vpshufb__anon_1530(<32 x i8> <i8 4, i8 3, i8 2, i8 2, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 4, i8 3, i8 2, i8 2, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i8> %0) %7 = select <32 x i1> %5, <32 x i8> %6, <32 x i8> zeroinitializer %8 = add nuw <32 x i8> %4, %7 ret <32 x i8> %8 } ; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile define internal fastcc <32 x i8> @llvm_code.vpshufb__anon_1530(<32 x i8> %0, <32 x i8> %1) unnamed_addr #0 { 2: %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %0, <32 x i8> %1) ret <32 x i8> %3 } ; Function Attrs: nounwind uwtable declare <32 x i8> 
@llvm.x86.avx2.pshuf.b(<32 x i8> %0, <32 x i8> %1) #1 attributes #0 = { nounwind uwtable nosanitize_coverage skipprofile "frame-pointer"="none" "target-cpu"="znver3" "target-features"="-16bit-mode,-32bit-mode,+64bit,+adx,+aes,+allow-light-256-bit,-amx-bf16,-amx-complex,-amx-fp16,-amx-int8,-amx-tile,+avx,-avx10.1-256,-avx10.1-512,+avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,+bmi,+bmi2,-branch-hint,+branchfusion,-ccmp,-cf,-cldemote,+clflushopt,+clwb,+clzero,+cmov,-cmpccxadd,+crc32,+cx16,+cx8,-egpr,-enqcmd,-ermsb,-evex512,+f16c,-false-deps-getmant,-false-deps-lzcnt-tzcnt,-false-deps-mulc,-false-deps-mullq,-false-deps-perm,-false-deps-popcnt,-false-deps-range,-fast-11bytenop,+fast-15bytenop,-fast-7bytenop,+fast-bextr,-fast-dpwssd,-fast-gather,-fast-hops,+fast-imm16,+fast-lzcnt,+fast-movbe,+fast-scalar-fsqrt,+fast-scalar-shift-masks,-fast-shld-rotate,-fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle,+fast-vector-fsqrt,-fast-vector-shift-masks,-faster-shift-than-shuffle,+fma,-fma4,+fsgsbase,+fsrm,+fxsr,-gfni,-harden-sls-ijmp,-harden-sls-ret,-hreset,-idivl-to-divb,+idivq-to-divl,-inline-asm-use-gpr32,+invpcid,-kl,-lea-sp,-lea-uses-ag,-lvi-cfi,-lvi-load-hardening,-lwp,+lzcnt,+macrofusion,+mmx,+movbe,-movdir64b,-movdiri,+mwaitx,-ndd,-nf,-no-bypass-delay,-no-bypass-delay-blend,-no-bypass-delay-mov,-no-bypass-delay-shuffle,+nopl,-pad-short-functions,+pclmul,-pconfig,+pku,+popcnt,-ppx,-prefer-128-bit,-prefer-256-bit,-prefer-mask-registers,-prefer-movmsk-over-vtest,-prefer-no-gather,-prefer-no-scatter,-prefetchi,+prfchw,-ptwrite,-push2pop2,-raoint,+rdpid,+rdpru,+rdrnd,+rdseed,-retpoline,-retpoline-external-thunk,-retpoline-indirect-branches,-retpoline-indirect-calls,-rtm,+sahf,+sbb-dep-breaking,-serialize,-seses,-sgx,+sha,-sha512,-shstk,-slow-3ops-lea,-slow-incdec,-slow-lea,-
slow-pmaddwd,-slow-pmulld,+slow-shld,-slow-two-mem-ops,-slow-unaligned-mem-16,-slow-unaligned-mem-32,-sm3,-sm4,-soft-float,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,-sse-unaligned-mem,+ssse3,-tagged-globals,-tbm,-tsxldtrk,-tuning-fast-imm-vector-shift,-uintr,-use-glm-div-sqrt-costs,-use-slm-arith-costs,-usermsr,+vaes,+vpclmulqdq,+vzeroupper,-waitpkg,+wbnoinvd,-widekl,+x87,-xop,+xsave,+xsavec,+xsaveopt,+xsaves,-zu" } attributes #1 = { nounwind uwtable "frame-pointer"="none" "target-cpu"="znver3" "target-features"="-16bit-mode,-32bit-mode,+64bit,+adx,+aes,+allow-light-256-bit,-amx-bf16,-amx-complex,-amx-fp16,-amx-int8,-amx-tile,+avx,-avx10.1-256,-avx10.1-512,+avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,+bmi,+bmi2,-branch-hint,+branchfusion,-ccmp,-cf,-cldemote,+clflushopt,+clwb,+clzero,+cmov,-cmpccxadd,+crc32,+cx16,+cx8,-egpr,-enqcmd,-ermsb,-evex512,+f16c,-false-deps-getmant,-false-deps-lzcnt-tzcnt,-false-deps-mulc,-false-deps-mullq,-false-deps-perm,-false-deps-popcnt,-false-deps-range,-fast-11bytenop,+fast-15bytenop,-fast-7bytenop,+fast-bextr,-fast-dpwssd,-fast-gather,-fast-hops,+fast-imm16,+fast-lzcnt,+fast-movbe,+fast-scalar-fsqrt,+fast-scalar-shift-masks,-fast-shld-rotate,-fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle,+fast-vector-fsqrt,-fast-vector-shift-masks,-faster-shift-than-shuffle,+fma,-fma4,+fsgsbase,+fsrm,+fxsr,-gfni,-harden-sls-ijmp,-harden-sls-ret,-hreset,-idivl-to-divb,+idivq-to-divl,-inline-asm-use-gpr32,+invpcid,-kl,-lea-sp,-lea-uses-ag,-lvi-cfi,-lvi-load-hardening,-lwp,+lzcnt,+macrofusion,+mmx,+movbe,-movdir64b,-movdiri,+mwaitx,-ndd,-nf,-no-bypass-delay,-no-bypass-delay-blend,-no-bypass-delay-mov,-no-bypass-delay-shuffle,+nopl,-pad-short-functions,+pclmul,-pconfig,+pku,+popcnt,-ppx,-prefer-128-bit,-prefer-256-bit,-prefer-mask-registers,-prefer-mo
vmsk-over-vtest,-prefer-no-gather,-prefer-no-scatter,-prefetchi,+prfchw,-ptwrite,-push2pop2,-raoint,+rdpid,+rdpru,+rdrnd,+rdseed,-retpoline,-retpoline-external-thunk,-retpoline-indirect-branches,-retpoline-indirect-calls,-rtm,+sahf,+sbb-dep-breaking,-serialize,-seses,-sgx,+sha,-sha512,-shstk,-slow-3ops-lea,-slow-incdec,-slow-lea,-slow-pmaddwd,-slow-pmulld,+slow-shld,-slow-two-mem-ops,-slow-unaligned-mem-16,-slow-unaligned-mem-32,-sm3,-sm4,-soft-float,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,-sse-unaligned-mem,+ssse3,-tagged-globals,-tbm,-tsxldtrk,-tuning-fast-imm-vector-shift,-uintr,-use-glm-div-sqrt-costs,-use-slm-arith-costs,-usermsr,+vaes,+vpclmulqdq,+vzeroupper,-waitpkg,+wbnoinvd,-widekl,+x87,-xop,+xsave,+xsavec,+xsaveopt,+xsaves,-zu" } !llvm.module.flags = !{} ```