u24 generates inefficient code

SpexGuy commented 3 years ago

At the heart of this is the question "how big is a u24?". @sizeOf would say it's 4 bytes. This is how much space is reserved for it. But the compiler seems to only ever read three bytes.

export fn foo(bytes: *const [2]u24) u32 {
    return bytes[0] + bytes[1];
}

I would expect the compiler to generate this code:

mov     eax, dword ptr [rdi]
add     eax, dword ptr [rdi + 4]
and     eax, 16777215
ret

But instead it generates this in release-fast:

movzx   eax, word ptr [rdi]
movzx   ecx, byte ptr [rdi + 2]
shl     ecx, 16
or      ecx, eax
movzx   edx, word ptr [rdi + 4]
movzx   eax, byte ptr [rdi + 6]
shl     eax, 16
or      eax, edx
add     eax, ecx
and     eax, 16777215
ret

This means we're getting the worst of both worlds. We're paying the performance cost of using only three bytes for a u24 and the memory cost of using four bytes for a u24. This is just silly. We need to resolve this one way or the other. Either keep the three-byte reads and say @sizeOf(u24) == 3 and @alignOf(u24) == 1, or switch to four byte reads and keep the current size and alignment.

kliberty commented 3 years ago

I looked at a few other cases, with slight modifications. The inefficient code generation seems to have something to do with the export. If you can convince the compiler to reinterpret the pointers then the generated code looks better.

export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert seems the most telling. It has the same signature as your example and generates the code you expected.

export fn bad_foo(bytes: *const [2]u24) u32 {
    const a = bytes[0] + bytes[1];
    return a;
}

fn foo_style_internal(bytes: *const [2]u24) u32 {
    const a = bytes[0] + bytes[1];
    return a;
}

export fn foo_style_u32_ptr_truncated(bytes: *const [2]u32) u32 {
    const a = @truncate(u24, bytes[0]);
    const b = @truncate(u24, bytes[1]);
    return a + b;
}

export fn foo_style_u25_ptr_truncated(bytes: *const [2]u25) u32 {
    const a = @truncate(u24, bytes[0]);
    const b = @truncate(u24, bytes[1]);
    return a + b;
}

export fn export_foo_style_internal_with_foo_style_u32_ptr(bytes: *const [2]u32) u32 {
    const c: *[2]u24 = &[2]u24{
        @truncate(u24, bytes[0]),
        @truncate(u24, bytes[1])
    };
    return foo_style_internal(c);
}

export fn export_foo_style_internal_with_foo_style_u25_ptr(bytes: *const [2]u25) u32 {
    const c: *[2]u24 = &[2]u24{
        @truncate(u24,bytes[0]),
        @truncate(u24,bytes[1])
    };
    return foo_style_internal(c);
}

export fn export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert(bytes2: *const [2]u24) u32 {
    const bytes = @ptrCast(*const [2]u32, bytes2);
    const c: *[2]u24 = &[2]u24{
        @truncate(u24, bytes[0]),
        @truncate(u24, bytes[1])
    };
    return foo_style_internal(c);
}

bad_foo:
        movzx   eax, word ptr [rdi]
        movzx   ecx, byte ptr [rdi + 2]
        shl     ecx, 16
        or      ecx, eax
        movzx   edx, word ptr [rdi + 4]
        movzx   eax, byte ptr [rdi + 6]
        shl     eax, 16
        or      eax, edx
        add     eax, ecx
        and     eax, 16777215
        ret

foo_style_u32_ptr_truncated:
        mov     eax, dword ptr [rdi + 4]
        add     eax, dword ptr [rdi]
        and     eax, 16777215
        ret

foo_style_u25_ptr_truncated:
        mov     eax, dword ptr [rdi]
        add     eax, dword ptr [rdi + 4]
        and     eax, 16777215
        ret

export_foo_style_internal_with_foo_style_u32_ptr:
        mov     eax, dword ptr [rdi + 4]
        add     eax, dword ptr [rdi]
        and     eax, 16777215
        ret

export_foo_style_internal_with_foo_style_u25_ptr:
        mov     eax, dword ptr [rdi]
        add     eax, dword ptr [rdi + 4]
        and     eax, 16777215
        ret

export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert:
        mov     eax, dword ptr [rdi + 4]
        add     eax, dword ptr [rdi]
        and     eax, 16777215
        ret

LemonBoy commented 3 years ago

define i32 @bad_foo([2 x i24]* nocapture nonnull readonly %0) local_unnamed_addr #0 {
Entry:
  %1 = bitcast [2 x i24]* %0 to i24*
  %2 = load i24, i24* %1, align 4
  %3 = getelementptr inbounds [2 x i24], [2 x i24]* %0, i64 0, i64 1
  %4 = load i24, i24* %3, align 4
  %5 = add nuw i24 %4, %2
  %6 = zext i24 %5 to i32
  ret i32 %6
}

define i32 @export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert([2 x i24]* nocapture nonnull readonly %0) local_unnamed_addr #0 {
Entry:
  %1 = bitcast [2 x i24]* %0 to i32*
  %2 = load i32, i32* %1, align 4
  %3 = getelementptr inbounds [2 x i24], [2 x i24]* %0, i64 0, i64 1
  %4 = bitcast i24* %3 to i32*
  %5 = load i32, i32* %4, align 4
  %6 = add i32 %5, %2
  %7 = and i32 %6, 16777215
  ret i32 %7
}

No wonder the codegen is better, it's operating on i32.

andrewrk commented 3 years ago

I think the OP example should be a compilation error - u24 is not a type recognized by the C ABI.

andrewrk commented 3 years ago

My apologies, I didn't see the example clearly:

export fn foo(bytes: *const [2]u24) u32 {

This is a pointer, and the current stance on pointers in C calling convention functions is that they are allowed because they basically constitute an opaque pointer to C but in zig they have whatever true type they have.

We have plenty of examples of this already, here's a couple:

https://github.com/ziglang/zig/blob/ce65533985caa9e2da567948e36d7d4ba0185005/src/stage1.zig#L275-L340

This all works great.

So I would expect u24 to follow the same rules. As for the inefficient code, I think because we tell LLVM the function is external, LLVM thinks it has to follow some specific kind of ABI. But we should be able to massage the parameters into telling LLVM how the real ABI of the u24 type works for zig.

ziglang / zig

u24 generates inefficient code #7336