Spurious optimization of shl+or routine

Validark commented 4 months ago

I define the following 2 Zig functions: https://zig.godbolt.org/z/3xfc5bjEc

export fn foo(x: u64) u64 {
    var y: u64 = @as(u16, @truncate(x));
    y = (y | (y << 24));
    y = (y | (y << 12));
    return y;
}

export fn bar(x: u16) u64 {
    var y: u64 = x;
    y = (y | (y << 24));
    y = (y | (y << 12));
    return y;
}

Emitting for Neoverse N2, I get:

foo:
        and     x8, x0, #0xffff
        orr     x8, x8, x8, lsl #24
        orr     x0, x8, x8, lsl #12
        ret

bar:
        mov     w8, w0
        ubfiz   x10, x0, #12, #32
        orr     x9, x8, x8, lsl #24
        orr     x8, x10, x8, lsl #36
        orr     x0, x8, x9
        ret

Here is the LLVM IR:

define dso_local i64 @foo(i64 %0) local_unnamed_addr {
Entry:
  %1 = and i64 %0, 65535
  %2 = mul nuw nsw i64 %1, 16777217
  %3 = mul nuw nsw i64 %1, 68719480832
  %4 = or i64 %3, %2
  ret i64 %4
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

define dso_local i64 @bar(i16 zeroext %0) local_unnamed_addr {
Entry:
  %1 = zext i16 %0 to i64
  %2 = mul nuw nsw i64 %1, 16777217
  %3 = mul nuw nsw i64 %1, 68719480832
  %4 = or i64 %3, %2
  ret i64 %4
}

Here is the LLVM IR produced by Clang for "equivalent" C code:

define dso_local range(i64 0, 4503599627370496) i64 @foo(i64 noundef %x) local_unnamed_addr {
entry:
  %and = and i64 %x, 65535
  %or = mul nuw nsw i64 %and, 16777217
  %shl1 = mul nuw nsw i64 %and, 68719480832
  %or2 = or i64 %shl1, %or
  ret i64 %or2
}

define dso_local range(i64 0, 4503599627370496) i64 @bar(i16 noundef %x) local_unnamed_addr {
entry:
  %conv = zext i16 %x to i64
  %or = mul nuw nsw i64 %conv, 16777217
  %shl1 = mul nuw nsw i64 %conv, 68719480832
  %or2 = or i64 %shl1, %or
  ret i64 %or2
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

And the assembly:

foo:                                    // @foo
        and     x8, x0, #0xffff
        orr     x8, x8, x8, lsl #24
        orr     x0, x8, x8, lsl #12
        ret
bar:                                    // @bar
        and     x8, x0, #0xffff
        orr     x9, x8, x8, lsl #24
        lsl     x8, x8, #12
        bfi     x8, x0, #36, #16
        orr     x0, x8, x9
        ret

On x86, compiling for Zen 4, I get: https://zig.godbolt.org/z/fYzqEWWvx

foo:
        movzx   ecx, di
        movabs  rax, 68719480832
        imul    rax, rcx
        mov     rdx, rcx
        shl     rdx, 24
        or      rdx, rcx
        or      rax, rdx
        ret

bar:
        movabs  rax, 68719480832
        mov     ecx, edi
        mov     rdx, rcx
        shl     rdx, 24
        imul    rax, rcx
        or      rdx, rcx
        or      rax, rdx
        ret

Looks like LLVM is making the decision that a multiply is less expensive than a shl? ~~Also, why can't we use shlx here?~~ I would think we could do:

    mov rax, rdi
    shl rax, 24
    or  rdi, rax
    mov rax, rdi
    shl rax, 12
    or  rax, rdi

topperc commented 4 months ago

Also, why can't we use shlx here?

Isn't shlx restricted to shift amount in a register?

Validark commented 4 months ago

Also, why can't we use shlx here?

Isn't shlx restricted to shift amount in a register?

Yes! You are right! That's why it wasn't giving me shlx! I did not realize that. I updated my suggested assembly section.

llvmbot commented 4 months ago

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)

I define the following 2 Zig functions: https://zig.godbolt.org/z/3xfc5bjEc ```zig export fn foo(x: u64) u64 { var y: u64 = @as(u16, @truncate(x)); y = (y | (y << 24)); y = (y | (y << 12)); return y; } export fn bar(x: u16) u64 { var y: u64 = x; y = (y | (y << 24)); y = (y | (y << 12)); return y; } ``` Emitting for Neoverse N2, I get: ```asm foo: and x8, x0, #0xffff orr x8, x8, x8, lsl #24 orr x0, x8, x8, lsl #12 ret bar: mov w8, w0 ubfiz x10, x0, #12, #32 orr x9, x8, x8, lsl #24 orr x8, x10, x8, lsl #36 orr x0, x8, x9 ret ``` Here is the LLVM IR: ```llvm define dso_local i64 @foo(i64 %0) local_unnamed_addr { Entry: %1 = and i64 %0, 65535 %2 = mul nuw nsw i64 %1, 16777217 %3 = mul nuw nsw i64 %1, 68719480832 %4 = or i64 %3, %2 ret i64 %4 } declare void @llvm.dbg.value(metadata, metadata, metadata) #1 define dso_local i64 @bar(i16 zeroext %0) local_unnamed_addr { Entry: %1 = zext i16 %0 to i64 %2 = mul nuw nsw i64 %1, 16777217 %3 = mul nuw nsw i64 %1, 68719480832 %4 = or i64 %3, %2 ret i64 %4 } ``` Here is the LLVM IR produced by Clang for "equivalent" C code: ```llvm define dso_local range(i64 0, 4503599627370496) i64 @foo(i64 noundef %x) local_unnamed_addr { entry: %and = and i64 %x, 65535 %or = mul nuw nsw i64 %and, 16777217 %shl1 = mul nuw nsw i64 %and, 68719480832 %or2 = or i64 %shl1, %or ret i64 %or2 } define dso_local range(i64 0, 4503599627370496) i64 @bar(i16 noundef %x) local_unnamed_addr { entry: %conv = zext i16 %x to i64 %or = mul nuw nsw i64 %conv, 16777217 %shl1 = mul nuw nsw i64 %conv, 68719480832 %or2 = or i64 %shl1, %or ret i64 %or2 } declare void @llvm.dbg.value(metadata, metadata, metadata) #1 ``` And the assembly: ```asm foo: // @foo and x8, x0, #0xffff orr x8, x8, x8, lsl #24 orr x0, x8, x8, lsl #12 ret bar: // @bar and x8, x0, #0xffff orr x9, x8, x8, lsl #24 lsl x8, x8, #12 bfi x8, x0, #36, #16 orr x0, x8, x9 ret ``` On x86, compiling for Zen 4, I get: https://zig.godbolt.org/z/fYzqEWWvx ```asm foo: movzx ecx, di movabs rax, 68719480832 imul rax, rcx mov rdx, rcx shl rdx, 24 or rdx, rcx or rax, rdx ret bar: movabs rax, 68719480832 mov ecx, edi mov rdx, rcx shl rdx, 24 imul rax, rcx or rdx, rcx or rax, rdx ret ``` Looks like LLVM is making the decision that a multiply is less expensive than a `shl`? Also, why can't we use `shlx` here? I would think we could do: ```asm mov rax, rdi shl rax, 24 or rdi, rax mov rax, rdi shl rax, 12 or rax, rdi ```

llvm / llvm-project

Spurious optimization of shl+or routine #92211