ziglang / zig

General-purpose programming language and toolchain for maintaining robust, optimal, and reusable software.
https://ziglang.org
MIT License
35.07k stars 2.56k forks source link

runtime @unionInit of underaligned union type corrupts stack #21343

Open richRemer opened 2 months ago

richRemer commented 2 months ago

Zig Version

0.14.0-dev.1420+e5ee9c1e4

Also appears in 0.13.0 with slightly modified code to accommodate changes to how types are generated at comptime.

Steps to Reproduce and Observed Behavior

Note: I originally opened this issue as an "error message" problem, but after further investigation, I think this is a bug. See #21307.

I'm getting a segfault due to stack corruption that appears to be caused by the @unionInit function. I'm attempting to generate an iterator over a tagged union at comptime. The following is the simplest test case I could come up with that compiles and triggers the issue:

const std = @import("std");

pub fn main() !void { // Reproduction driver. Comments here are trailing-only so the line numbers in the reported trace still match.
    var it = TaggedUnionIterator(&.{ "a", "b" }){ .tokens = &.{ "a", "A", "b", "B" } }; // tags "a"/"b"; tokens alternate tag name, value

    std.debug.assert(it.index == 0); // nothing consumed yet
    _ = it.next(); // consumes "a", "A"
    std.debug.assert(it.index == 2);
    _ = it.next(); // consumes "b", "B"
    std.debug.assert(it.index == 4); // the reported trace points at this assertion
    _ = it.next(); // index == tokens.len here, so next() returns null without advancing
    std.debug.assert(it.index == 4);

    std.debug.print("succeeded\n", .{}); // the expected output of a complete run
}

/// Builds, at comptime, an iterator type over a token list whose `next()` yields a
/// generated tagged union with one `[]const u8` payload field per entry in `tags`.
///
/// `tags` supplies the field names for both the generated enum (used as the union's
/// tag type) and the generated union. The returned struct reads `tokens` two at a
/// time: a tag name followed by the value for that tag.
fn TaggedUnionIterator(tags: []const [:0]const u8) type {
    // One enum field and one union field per tag, filled in by the loop below.
    comptime var enum_fields: [tags.len]std.builtin.Type.EnumField = undefined;
    comptime var union_fields: [tags.len]std.builtin.Type.UnionField = undefined;

    for (tags, 0..) |tag, i| {
        // Enum value is the tag's position in `tags`.
        enum_fields[i] = std.builtin.Type.EnumField{
            .name = tag,
            .value = i,
        };

        union_fields[i] = std.builtin.Type.UnionField{
            .name = tag,
            .type = []const u8,
            // Alignment 1 for a slice payload; the issue title attributes the stack
            // corruption to this under-aligned union.
            .alignment = 1,
        };
    }

    // Exhaustive enum with a u8 tag, one member per entry in `tags`.
    const EnumType = @Type(std.builtin.Type{ .@"enum" = std.builtin.Type.Enum{
        .tag_type = u8,
        .fields = &enum_fields,
        .decls = &[0]std.builtin.Type.Declaration{},
        .is_exhaustive = true,
    } });

    // Auto-layout union tagged by `EnumType`, using the under-aligned fields above.
    const UnionType = @Type(std.builtin.Type{ .@"union" = std.builtin.Type.Union{
        .layout = .auto,
        .tag_type = EnumType,
        .fields = &union_fields,
        .decls = &[0]std.builtin.Type.Declaration{},
    } });

    return struct {
        /// Position of the next tag-name token in `tokens`; advanced by 2 per consumed pair.
        index: u8 = 0,
        /// Flat list of alternating tag names and values.
        tokens: []const [:0]const u8,

        /// Returns an iterator over `tokens` with `index` left at its default of 0.
        pub fn init(tokens: []const [:0]const u8) @This() {
            return .{ .tokens = tokens };
        }

        /// Consumes the next (tag name, value) pair and returns it as a `UnionType`.
        ///
        /// Returns null once `index` equals `tokens.len`, and also when the tag name
        /// matches none of the union's fields (the pair is still consumed in that case).
        pub fn next(it: *@This()) ?UnionType {
            if (it.index == it.tokens.len) {
                return null;
            } else {
                const token = it.tokens[it.index];
                // NOTE(review): assumes `tokens` holds complete pairs; `index + 1` is not
                // checked against `tokens.len` here.
                const value = it.tokens[it.index + 1];

                it.index += 2;

                // `inline for` makes `field.name` comptime-known for `@unionInit`; the
                // comparison against `token` is a runtime check.
                inline for (@typeInfo(UnionType).@"union".fields) |field| {
                    if (std.mem.eql(u8, token, field.name)) {
                        // Per the issue report, the code generated for this line is what
                        // overwrites the saved RBP on the stack.
                        return @unionInit(UnionType, field.name, value);
                    }
                }

                return null;
            }
        }
    };
}
$ uname -a
Linux penguin 6.6.32-02877-gde0d50d4a56c #1 SMP PREEMPT_DYNAMIC Mon, 5 Aug 2024 22:06:23 +0000 x86_64 GNU/Linux

Running the code produces:

Bus error at address 0x0
/home/rremer/Projects/zig/iteropt/bus-error.zig:10:24: 0x1037cd4 in main (bus-error)
    std.debug.assert(it.index == 4);
                       ^
Unwind error at address `exe:0x1037cd4` (error.InvalidCFA), trace may be incomplete

Aborted (core dumped)

Running with -O ReleaseFast seems to work around the issue, producing the correct "succeeded" result. In addition, someone else was able to determine that it runs fine on Asahi Linux on ARM, but it fails on macOS Sonoma (Apple M1, aarch64) and on Arch Linux x86_64 with a 6.10 kernel.

Debugging the issue, I can see that when the error above is generated, it is because of an attempt to dereference a corrupted RBP. Diving deeper, I determined that RBP - as expected - is being restored from the stack before exiting the .next() function, but that memory on the stack has the wrong value when it is restored. Further, I tracked down the instruction that corrupts the stack and found it to be on the line with @unionInit. The generated code that causes the issue can be seen here:

Screenshot 2024-09-08 2 44 27 AM

When that instruction runs, it writes to the high word of the pushed RBP value. So when RBP is popped at 0x1038100, the wrong value ends up in RBP. I don't know anything about how Zig does codegen, but [rbp+0x6] seems very suspicious. Presumably, the pushed RBP should be right after the return address for the call, so all RBP relative addressing for local variables should be [rbp-...] (never +).

Some additional discussion can be found in the original issue #21307.

Expected Behavior

Should successfully run without error and print:

succeeded
mlugg commented 2 months ago

Reduction:

/// Entry point of the reduced reproduction: writes a local, calls `unionNop`,
/// then reads the local back and passes it to `ignore`.
pub fn main() !void {
    var x: u8 = undefined;
    x = 0;
    unionNop();
    // NOTE(review): presumably `x` is read after the call so the caller keeps live
    // stack state across `unionNop` — confirm.
    ignore(x);
}

/// Tagged union whose `u64` payload field is declared with `align(1)`, i.e. the
/// under-aligned union type named in the issue title.
const MyUnion = union(enum) {
    a: u64 align(1),
    b: void,
};

/// Initializes a local `MyUnion` through `@unionInit` with the result of `getVal()`.
/// The value is never read afterwards; only the store matters for the reproduction.
pub fn unionNop() void {
    var x: MyUnion = undefined;
    x = @unionInit(MyUnion, "a", getVal());
}

/// Returns the constant 123.
/// NOTE(review): presumably a separate function so the payload is not
/// comptime-known at the `@unionInit` call site — confirm.
fn getVal() u64 {
    return 123;
}

/// Accepts a `u8` and does nothing with it.
fn ignore(_: u8) void {}

This triggers a segfault rather than a bus error, but it's the same issue.

mlugg commented 2 months ago

Okay, this is our bug, not upstream's.

Here's some reduced LLVM IR which is quite similar to what we emit:

; ModuleID = 'foo'
source_filename = "foo"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: nounwind
define dso_local void @_start() #0 {
0:
  ; One-byte stack slot (presumably the `x` local of the Zig reduction's main).
  %1 = alloca [1 x i8], align 1
  call fastcc void @foo.unionNop()
  ; Read the slot back after the call and pass it to @foo.ignore.
  %2 = load i8, ptr %1, align 1
  call fastcc void @foo.ignore(i8 %2)
  br label %3
3:
  ; Branches to itself, so this function never returns.
  br label %3
}

; Function Attrs: nounwind
define internal fastcc void @foo.unionNop() unnamed_addr #0 {
0:
  ; Only 9 bytes are reserved for the union, with alignment 1.
  %1 = alloca [9 x i8], align 1
  ; The pointer is indexed as the non-packed struct { i8, i64 }, field 1. Per the
  ; analysis in this thread, LLVM assumes 7 bytes of padding after the i8, so the
  ; object would need to be 16 bytes for this field to be in bounds.
  %2 = getelementptr inbounds { i8, i64 }, ptr %1, i64 0, i32 1
  ; 8-byte store through that pointer: this is the write past the 9-byte alloca.
  store i64 123, ptr %2, align 1
  ret void
}

; Function Attrs: nounwind
define internal fastcc void @foo.ignore(i8 %0) unnamed_addr #0 {
1:
  ; No-op: the i8 argument is unused.
  ret void
}

attributes #0 = { nounwind }

The error here is visible in unionNop -- we lower the union to a non-packed structure type, so LLVM assumes 7 bytes of padding between the tag and payload, but we've only alloc'd 9 bytes rather than the 16 that would require. In this case, that causes us to corrupt the return address on the stack; in the original repro, we corrupt some other random state.