llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
26.77k stars 10.97k forks source link

[RISC-V] Add some orc.b optimizations #96595

Open Validark opened 1 week ago

Validark commented 1 week ago

Godbolt link

The following functions can all use, or should be directly translated to, the orc.b instruction:

// If the lowest bit of each byte is 1, set that byte to 255, else 0.
// Should emit constant-load+and+orc.b
export fn orc_b_opportunity1(x: u64) u64 {
    return (x & 0x0101010101010101) *% 0xFF;
}

// If the lowest bit of each byte is 1, set that byte to 255, else 0.
// Should emit constant-load+and+orc.b
export fn orc_b_opportunity1_fixed(x: u64) u64 {
    return orc_b(x & 0x0101010101010101);
}

// If the highest bit of each byte is 1, set that byte to 255, else 0.
// Should emit constant-load+and+orc.b
export fn orc_b_opportunity2(x: u64) u64 {
    return ((x & 0x8080808080808080) >> 7) *% 0xFF;
}

// If the highest bit of each byte is 1, set that byte to 255, else 0.
// Should emit constant-load+and+orc.b
export fn orc_b_opportunity3(x: u64) u64 {
    const y = x & 0x8080808080808080;
    return (y << 1) -% (y >> 7);
}

// If the highest bit of each byte is 1, set that byte to 255, else 0.
// Should emit constant-load+and+orc.b
export fn orc_b_opportunity4(x: u64) u64 {
    return ((x >> 7) & 0x0101010101010101) *% 0xFF;
}

// If the highest bit of each byte is 1, set that byte to 255, else 0.
export fn orc_b_opportunity2_and_3_and_4_fixed(x: u64) u64 {
    return orc_b(x & 0x8080808080808080);
}

// If any byte is non-zero, set that byte to 255 (all ones), else 0.
// This should just emit the `orc.b` instruction.
export fn orc_b_opportunity5(x: u64) u64 {
    const c1: u64 = @bitCast([_]u8{0x7F} ** 8);
    return orc_b_opportunity2(((x & c1) +% c1) | x);
}

// If any byte is non-zero, set that byte to 255 (all ones), else 0.
// This should just emit the `orc.b` instruction.
export fn orc_b_opportunity6(x: u64) u64 {
    const c1: u64 = @bitCast([_]u8{0x7F} ** 8);
    return orc_b_opportunity3(((x & c1) +% c1) | x);
}

// If any byte is non-zero, set that byte to 255 (all ones), else 0.
// This should just emit the `orc.b` instruction.
export fn orc_b_opportunity7(x: u64) u64 {
    const c1: u64 = @bitCast([_]u8{0x7F} ** 8);
    return orc_b_opportunity4(((x & c1) +% c1) | x);
}

export fn orc_b_opportunity5_and_6_and_7_fixed(x: u64) u64 {
    return orc_b(x);
}

fn orc_b(x: u64) u64 {
    return struct {
        extern fn @"llvm.riscv.orc.b.i64"(u64) u64;
    }.@"llvm.riscv.orc.b.i64"(x);
}
llvmbot commented 1 week ago

@llvm/issue-subscribers-backend-risc-v

Author: Niles Salter (Validark)

[Godbolt link](https://godbolt.org/z/oh87najaG) The following functions can all use, or should be directly translated to, the orc.b instruction: ```zig // If the lowest bit of each byte is 1, set that byte to 255, else 0. // Should emit constant-load+and+orc.b export fn orc_b_opportunity1(x: u64) u64 { return (x & 0x0101010101010101) *% 0xFF; } // If the lowest bit of each byte is 1, set that byte to 255, else 0. // Should emit constant-load+and+orc.b export fn orc_b_opportunity1_fixed(x: u64) u64 { return orc_b(x & 0x0101010101010101); } // If the highest bit of each byte is 1, set that byte to 255, else 0. // Should emit constant-load+and+orc.b export fn orc_b_opportunity2(x: u64) u64 { return ((x & 0x8080808080808080) >> 7) *% 0xFF; } // If the highest bit of each byte is 1, set that byte to 255, else 0. // Should emit constant-load+and+orc.b export fn orc_b_opportunity3(x: u64) u64 { const y = x & 0x8080808080808080; return (y << 1) -% (y >> 7); } // If the highest bit of each byte is 1, set that byte to 255, else 0. // Should emit constant-load+and+orc.b export fn orc_b_opportunity4(x: u64) u64 { return ((x >> 7) & 0x0101010101010101) *% 0xFF; } // If the highest bit of each byte is 1, set that byte to 255, else 0. export fn orc_b_opportunity2_and_3_and_4_fixed(x: u64) u64 { return orc_b(x & 0x8080808080808080); } // If any byte is non-zero, set that byte to 255 (all ones), else 0. // This should just emit the `orc.b` instruction. export fn orc_b_opportunity5(x: u64) u64 { const c1: u64 = @bitCast([_]u8{0x7F} ** 8); return orc_b_opportunity2(((x & c1) +% c1) | x); } // If any byte is non-zero, set that byte to 255 (all ones), else 0. // This should just emit the `orc.b` instruction. export fn orc_b_opportunity6(x: u64) u64 { const c1: u64 = @bitCast([_]u8{0x7F} ** 8); return orc_b_opportunity3(((x & c1) +% c1) | x); } // If any byte is non-zero, set that byte to 255 (all ones), else 0. // This should just emit the `orc.b` instruction. export fn orc_b_opportunity7(x: u64) u64 { const c1: u64 = @bitCast([_]u8{0x7F} ** 8); return orc_b_opportunity4(((x & c1) +% c1) | x); } export fn orc_b_opportunity5_and_6_and_7_fixed(x: u64) u64 { return orc_b(x); } fn orc_b(x: u64) u64 { return struct { extern fn @"llvm.riscv.orc.b.i64"(u64) u64; }.@"llvm.riscv.orc.b.i64"(x); } ```
dtcxzyw commented 1 week ago

Does this pattern exist in some real-world applications (e.g., some trivial string-processing implementations)?

Validark commented 1 week ago

Does this pattern exist in some real-world applications (e.g., some trivial string-processing implementations)?

These are standard SWAR operations. SWAR is used for:

I use these exact routines in my Accelerated Zig Parser for use on the Sifive-u74 and other machines which lack vectors (although the sifive-u74 also lacks the fancy bit-manipulation instructions too). Note that I hope that one day I can remove the SWAR routines from my code and let the compiler translate my vector code to the SWAR routines I have hand-coded today. However, the very fact that orc.b exists at all is evidence that the RISC-V designers thought there could/should be machines with the B bit-manipulation extension without the V vector extension (which seems to contradict the fact that they designed the vector extension to work without adding mask registers for "ease of implementation" for low-end hardware). Also the fact that orc.b exists indicates that the RISC-V designers thought real-world applications could use it.