ziglang / zig

General-purpose programming language and toolchain for maintaining robust, optimal, and reusable software.
https://ziglang.org
MIT License
33.68k stars 2.47k forks source link

Missed optimization for functions returning a normal struct. #14358

Open IntegratedQuantum opened 1 year ago

IntegratedQuantum commented 1 year ago

Zig Version

0.11.0-dev.1201+301a89849

Benchmark program

const std = @import("std");

/// Pair of `u32` values in a plain `struct`, i.e. with Zig's default
/// (unspecified) field layout. This is the return type of `regular`.
const Regular = struct {
    x: u32,
    y: u32,
};

/// Pair of `u32` values in a `packed struct` whose backing integer is
/// declared as `u64`. This is the return type of `_packed`.
const Packed = packed struct(u64) {
    x: u32,
    y: u32,
};

/// Pair of `u32` values in an `extern struct` (C-ABI-compatible layout).
/// This is the return type of the exported function `_extern`.
const Extern = extern struct {
    x: u32,
    y: u32,
};

/// Returns a `Regular` whose `x` is `x ^ y` and whose `y` is the bitwise
/// complement of that same value.
fn regular(x: u32, y: u32) Regular {
    const mixed = x ^ y;
    return Regular{
        .x = mixed,
        .y = ~mixed,
    };
}

/// Returns a `Packed` whose `x` is `x ^ y` and whose `y` is the bitwise
/// complement of that same value.
fn _packed(x: u32, y: u32) Packed {
    const mixed = x ^ y;
    return Packed{
        .x = mixed,
        .y = ~mixed,
    };
}

/// Returns an `Extern` whose `x` is `x ^ y` and whose `y` is the bitwise
/// complement of that same value. Declared `export`, so the symbol is
/// emitted under the name `_extern`.
export fn _extern(x: u32, y: u32) Extern {
    const mixed = x ^ y;
    return Extern{
        .x = mixed,
        .y = ~mixed,
    };
}

/// Times repeated calls to `func` and logs the result.
///
/// `func` is invoked once for every pair `(x, y)` with `0 <= x < range` and
/// `1 <= y < range`. The `.x` and `.y` fields of each returned value are
/// accumulated with wrapping addition into two `u32` sums, which are logged
/// together with the elapsed time in nanoseconds.
///
/// `func` must be callable with two `u32` arguments and return a value that
/// has `x` and `y` fields addable to a `u32`.
///
/// The summary line is emitted through `std.log.err`.
pub fn time(comptime func: anytype, range: u32) void {
    // The start timestamp is never reassigned, so it is a `const`.
    const start = std.time.nanoTimestamp();
    var sum_x: u32 = 0;
    var sum_y: u32 = 0;
    var x: u32 = 0;
    while (x < range) : (x += 1) {
        // Note: the inner counter starts at 1, not 0.
        var y: u32 = 1;
        while (y < range) : (y += 1) {
            // `.never_inline` asks the compiler not to inline `func`, so each
            // iteration performs a real call and a real struct return.
            const val = @call(.never_inline, func, .{ x, y });
            sum_x +%= val.x;
            sum_y +%= val.y;
        }
    }
    // Take the end timestamp before logging so formatting is not measured.
    const elapsed_ns = std.time.nanoTimestamp() - start;
    std.log.err("SumX: {} SumY: {} Time: {}", .{ sum_x, sum_y, elapsed_ns });
}

/// Runs the benchmark once for each of the three struct-returning functions,
/// all with the same range.
pub fn main() void {
    const range: u32 = 10000;
    time(regular, range);
    time(_packed, range);
    time(_extern, range);
}

Benchmark result

$ zig run test.zig -OReleaseFast
error: SumX: 2121939592 SumY: 2073037704 Time: 1013559919
error: SumX: 2121939592 SumY: 2073037704 Time: 139970202
error: SumX: 2121939592 SumY: 2073037704 Time: 140011389

As you can see, the regular struct is about 7 times slower than the extern and packed structs.

Godbolt

On Godbolt you can observe that the packed and extern structs seem to push their values into a single register before returning, while the regular struct puts the values onto the stack, which is more expensive.

Bonus

When running the program in Debug mode, the regular struct appears to be faster than in ReleaseFast:

$ zig run test.zig
error: SumX: 2121939592 SumY: 2073037704 Time: 577106014
error: SumX: 2121939592 SumY: 2073037704 Time: 1009756258
error: SumX: 2121939592 SumY: 2073037704 Time: 1185702833
judofyr commented 2 months ago

Are there any known workarounds for this that work with arbitrary Zig types (e.g. ones that are not valid in packed/extern structs)? I'd really like to be able to return multiple values where some of them could be put into registers.