sarah-ek / gemm

MIT License
72 stars 11 forks source link

gemm_f16: Build fails in debug mode for AArch64 #31

Open brunocaballero opened 1 month ago

brunocaballero commented 1 month ago

Hi,

I created a small Rust example:

use gemm_f16::f16;

// Minimal reproduction program: multiplies two half-precision (`f16`) values
// and prints the product. The `f16` type comes from the `gemm_f16` crate.
fn main() {
    println!("Hello, fp16!");

    // Build the two operands by converting from `f32` literals.
    let a = f16::from_f32(3.1f32);
    let b = f16::from_f32(2.2f32);

    // The half-precision multiplication under test.
    let c = a * b;
    if c.is_normal() {
        println!("Is normal!");
    }
    // No trailing semicolon: the `println!` is the tail expression of `main`.
    println!("Result {c}")

}

Building in release mode for target AArch64/Linux works, but it fails when building in debug mode.

error: instruction requires: fullfp16

But I am not sure in which context fullfp16 is not supported, maybe the LLVM toolchain?

microdoc@microdoc-tools-builder:~/proj/f16tests$ cargo build --release --target aarch64-unknown-linux-gnu
    Finished `release` profile [optimized] target(s) in 0.02s
microdoc@microdoc-tools-builder:~/proj/f16tests$ cargo build --target aarch64-unknown-linux-gnu
   Compiling reborrow v0.5.5
   Compiling cfg-if v1.0.0
   Compiling libm v0.2.8
   Compiling crossbeam-utils v0.8.20
   Compiling rayon-core v1.12.1
   Compiling either v1.12.0
   Compiling bitflags v1.3.2
   Compiling once_cell v1.19.0
   Compiling num-traits v0.2.19
   Compiling bytemuck v1.16.1
   Compiling raw-cpuid v10.7.0
   Compiling dyn-stack v0.10.0
   Compiling crossbeam-epoch v0.9.18
   Compiling crossbeam-deque v0.8.5
   Compiling rayon v1.10.0
   Compiling num-complex v0.4.6
   Compiling half v2.4.1
   Compiling pulp v0.18.21
   Compiling gemm-common v0.18.0
   Compiling gemm-f32 v0.18.0
   Compiling gemm-f16 v0.18.0
error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:2000:18
     |
2000 |                 "fmla {0:v}.8h, {1:v}.8h, {2:v}.h[3]",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fmla v0.8h, v1.8h, v2.h[3]
     |     ^

error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:2018:18
     |
2018 |                 "fmla {0:v}.8h, {1:v}.8h, {2:v}.h[6]",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fmla v0.8h, v1.8h, v2.h[6]
     |     ^

error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:2006:18
     |
2006 |                 "fmla {0:v}.8h, {1:v}.8h, {2:v}.h[4]",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fmla v0.8h, v1.8h, v2.h[4]
     |     ^

error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:1988:18
     |
1988 |                 "fmla {0:v}.8h, {1:v}.8h, {2:v}.h[1]",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fmla v0.8h, v1.8h, v2.h[1]
     |     ^

error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:2024:18
     |
2024 |                 "fmla {0:v}.8h, {1:v}.8h, {2:v}.h[7]",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fmla v0.8h, v1.8h, v2.h[7]
     |     ^

error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:2012:18
     |
2012 |                 "fmla {0:v}.8h, {1:v}.8h, {2:v}.h[5]",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fmla v0.8h, v1.8h, v2.h[5]
     |     ^

error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:1982:18
     |
1982 |                 "fmla {0:v}.8h, {1:v}.8h, {2:v}.h[0]",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fmla v0.8h, v1.8h, v2.h[0]
     |     ^

error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:1994:18
     |
1994 |                 "fmla {0:v}.8h, {1:v}.8h, {2:v}.h[2]",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fmla v0.8h, v1.8h, v2.h[2]
     |     ^

error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:1954:18
     |
1954 |                 "fadd {0:v}.8h, {1:v}.8h, {2:v}.8h",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fadd v0.8h, v1.8h, v2.8h
     |     ^

error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:1966:18
     |
1966 |                 "fmla {0:v}.8h, {1:v}.8h, {2:v}.8h",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fmla v0.8h, v1.8h, v2.8h
     |     ^

error: instruction requires: fullfp16
    --> /home/microdoc/.cargo/registry/src/index.crates.io-6f17d22bba15001f/gemm-common-0.18.0/src/simd.rs:1940:18
     |
1940 |                 "fmul {0:v}.8h, {1:v}.8h, {2:v}.8h",
     |                  ^
     |
note: instantiated into assembly here
    --> <inline asm>:1:2
     |
1    |     fmul v0.8h, v1.8h, v2.8h
     |     ^

error: could not compile `gemm-f16` (lib) due to 11 previous errors
sarah-ek commented 1 month ago

This is a known issue, but I don't know how to fix it.

koutheir commented 1 month ago

I found two ways to fix this. Before describing them, note that I modified the sample above slightly so that compiler optimizations do not ruin what we're trying to test here. That said, even if the sample above is used as is, the fix below still works.

use gemm_f16::f16;

// Variant of the reproduction program that routes the operands and the
// multiplication through `core::hint::black_box`, so that optimizations do not
// remove the half-precision multiply being tested.
fn main() {
    println!("Hello, fp16!");

    // Operands are passed through `black_box` after conversion from `f32`.
    let a = core::hint::black_box(f16::from_f32(3.1f32));
    let b = core::hint::black_box(f16::from_f32(2.2f32));

    // The closure performing the multiply is itself passed through
    // `black_box` and then invoked immediately.
    let c = core::hint::black_box(|| a * b)();

    if c.is_normal() {
        println!("Is normal!");
    }

    // No trailing semicolon: the `println!` is the tail expression of `main`.
    println!("Result {c}")
}

I started by creating the file .cargo/config.toml in order to tell cargo that I'm targeting AArch64:

[build]
# $ rustup target add aarch64-unknown-linux-musl
target = "aarch64-unknown-linux-musl"

And then I added the following section to that file:

[target.aarch64-unknown-linux-musl]
linker = "clang"
rustflags = [
    "-Clink-arg=--target=aarch64-unknown-linux-musl",
    "-Clink-arg=-fuse-ld=lld",
    "-Ctarget-feature=+fp16,+fhm"
]

If the version of clang/lld installed on the system is too old, then download and extract a recent clang/lld toolchain somewhere, and use the following instead:

[target.aarch64-unknown-linux-musl]
linker = "clang-18"
rustflags = [
    "-Clink-arg=--target=aarch64-unknown-linux-musl",
    "-Clink-arg=-fuse-ld=lld-18",
    "-Ctarget-feature=+fp16,+fhm"
]

If gcc is preferred, then download and extract a recent cross-compilation gcc toolchain for AArch64 somewhere, and use the following instead:

linker = "<somewhere>/arm-gnu-toolchain-13.3.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
rustflags = [ "-Ctarget-feature=+fp16,+fhm" ]

Once that is done, running cargo build and cargo build --release both succeed, and disassembling the binaries shows (among others) the instructions:

0000000000224f08 <half::binary16::arch::aarch64::multiply_f16_fp16>:
  ...
  224f14:   1e270000    fmov    s0, w0
  224f18:   1e204001    fmov    s1, s0
  224f1c:   1e270020    fmov    s0, w1
  224f20:   1e204002    fmov    s2, s0
  224f24:   1ee20820    fmul    h0, h1, h2
  ...

and also:

0000000000224adc <fp16::main>:
  ...
  224b44:   7d400100    ldr h0, [x8]
  224b48:   7d400121    ldr h1, [x9]
  224b4c:   1ee10802    fmul    h2, h0, h1
  224b50:   1e260048    fmov    w8, s2
  ...

Running the binaries through QEMU shows:

$ target/aarch64-unknown-linux-musl/debug/fp16
Hello, fp16!
Is normal!
Result 6.8164063

$ target/aarch64-unknown-linux-musl/release/fp16
Hello, fp16!
Is normal!
Result 6.8164063

For reference, I found the feature names fp16 and fhm (specified above in -Ctarget-feature=+fp16,+fhm) through the following command:

$ rustc --target=aarch64-unknown-linux-musl --print target-features
Features supported by rustc for this target:
    ...
    fhm                                - Enable FP16 FML instructions (FEAT_FHM).
    flagm                              - Enable v8.4-A Flag Manipulation Instructions (FEAT_FlagM).
    fp16                               - Full FP16 (FEAT_FP16).
    ...
cirocavani commented 2 weeks ago

The solution also works on a Raspberry Pi 5 running 64-bit Raspberry Pi OS (Debian 12 bookworm) with Rust 1.80.

.cargo/config.toml

[build]
rustflags = [
    "-Ctarget-feature=+fp16,+fhm"
]

Thank you for sharing.