ziglang / zig

General-purpose programming language and toolchain for maintaining robust, optimal, and reusable software.
https://ziglang.org
MIT License
34.63k stars 2.53k forks source link

`threadlocal` loaded too early and not updating on new thread #21739

Open bhansconnect opened 1 week ago

bhansconnect commented 1 week ago

Zig Version

0.13.0

Steps to Reproduce and Observed Behavior

zig run main.zig on this file on aarch64 unix or x86_64 unix (I specifically tested on aarch64 macOS and x86_64 Linux).

output:

info: thread 1 addr: usize@<ADDRESS1>
info: Error -> thread 2 addr: usize@<ADDRESS1>
info: Workaround -> thread 2 addr: usize@<ADDRESS2>

main.zig

const std = @import("std");
const builtin = @import("builtin");

// I'm working on a coroutine library.
// Zig seems to load threadlocals incorrectly when coroutines switch threads.
// I think it loads them too early then returns.

// The thread-local whose address is under test; each OS thread should see a
// distinct address. The initializer 7 is arbitrary — only the address matters.
threadlocal var thread_local: usize = 7;

// Saved callee-saved register state for the two coroutine contexts.
// Slot layout is architecture-specific; see switch_context and context_size.
var main_context: [context_size]usize = std.mem.zeroes([context_size]usize);
var repro_context: [context_size]usize = std.mem.zeroes([context_size]usize);

/// Entry point. Spawns a helper thread that sets up `repro_context` and runs
/// `repro` until its first suspend, then resumes that same context here on the
/// main thread — so `repro` straddles two OS threads across one suspension.
pub fn main() !void {
    // Setup the context on one thread. `thread` only returns after `repro`
    // switches back to `main_context` at its first suspension point.
    const t = try std.Thread.spawn(.{}, thread, .{});
    t.join();

    // Run the context on another thread: this resumes `repro` just after its
    // first switch_context call, now executing on the main thread. `repro`'s
    // final switch back to `main_context` returns control here to exit.
    switch_context(&main_context, &repro_context);
}

/// Body of the spawned thread: initializes `repro_context` to start at `repro`
/// with its own stack, then switches into it. Control returns here (and the
/// thread exits) when `repro` performs its first switch back to `main_context`.
fn thread() void {
    setup_repro_context();
    switch_context(&main_context, &repro_context);
}

/// Coroutine body demonstrating the bug. It starts on the spawned thread,
/// suspends once, and is resumed on the main thread, so the two `&thread_local`
/// loads should produce different addresses. Observed: the compiler computes
/// the TLS address once for the whole function, so both prints show the
/// starting thread's address.
fn repro() void {
    // This collects the thread local address on the starting thread.
    const thread1_addr = &thread_local;
    // We context switch to suspend this execution
    switch_context(&repro_context, &main_context);
    // When we get back, we are on a different thread.
    // This should load a different address.
    // That is not happening.
    const thread2_addr = &thread_local;
    std.log.info("thread 1 addr: {*}", .{thread1_addr});
    std.log.info("Error -> thread 2 addr: {*}", .{thread2_addr});
    workaround();

    // Go back to main to exit the program.
    switch_context(&repro_context, &main_context);
}

/// Workaround: taking the TLS address inside a separate `noinline` function
/// forces a fresh address computation on whichever thread is running now.
noinline fn workaround() void {
    // Here it does work due to the separate function causing the address to actually be loaded.
    const thread2_addr = &thread_local;
    std.log.info("Workaround -> thread 2 addr: {*}", .{thread2_addr});
}

// Put everything inline for simplicity.
// This only works on aarch64 and x86_64 unix.
/// Saves the running context's callee-saved registers (plus fp/lr/sp on
/// aarch64, rbp/rsp on x86_64) into `current`, restores them from `target`,
/// then `ret`s — transferring control to whatever return address `target`
/// holds. Defined in the global assembly below; exported under both
/// `switch_context` and `_switch_context` to cover the differing symbol
/// prefix conventions (e.g. macOS vs Linux).
extern fn switch_context(current: [*]u64, target: [*]u64) void;
// NOTE: this is global (module-level) assembly — clobber lists and `volatile`
// do not apply here, so the compiler has no visibility into the thread/stack
// switch it performs (which is the crux of this issue).
comptime {
    switch (builtin.cpu.arch) {
        .aarch64 => {
            // AArch64: x19-x28, d8-d15, fp, lr, sp -> 21 slots (see context_size).
            asm (
                \\ .global _switch_context
                \\ _switch_context:
                \\ .global switch_context
                \\ switch_context:
                \\ 
                \\ # Save all callee saved registers to current context.
                \\ mov x9, sp
                \\ # General
                \\ stp x19, x20, [x0, 0*8]
                \\ stp x21, x22, [x0, 2*8]
                \\ stp x23, x24, [x0, 4*8]
                \\ stp x25, x26, [x0, 6*8]
                \\ stp x27, x28, [x0, 8*8]
                \\ # Double
                \\ stp d8,  d9,  [x0, 10*8]
                \\ stp d10, d11, [x0, 12*8]
                \\ stp d12, d13, [x0, 14*8]
                \\ stp d14, d15, [x0, 16*8]
                \\ # Special
                \\ stp fp,  lr,  [x0, 18*8]
                \\ str x9,       [x0, 20*8]
                \\ 
                \\ 
                \\ # Load target context.
                \\ # General
                \\ ldp x19, x20, [x1, 0*8]
                \\ ldp x21, x22, [x1, 2*8]
                \\ ldp x23, x24, [x1, 4*8]
                \\ ldp x25, x26, [x1, 6*8]
                \\ ldp x27, x28, [x1, 8*8]
                \\ # Double
                \\ ldp d8,  d9,  [x1, 10*8]
                \\ ldp d10, d11, [x1, 12*8]
                \\ ldp d12, d13, [x1, 14*8]
                \\ ldp d14, d15, [x1, 16*8]
                \\ # Special
                \\ ldp fp,  lr,  [x1, 18*8]
                \\ ldr x9,       [x1, 20*8]
                \\ mov sp, x9
                \\ 
                \\ ret
            );
        },
        .x86_64 => {
            if (builtin.os.tag == .windows) {
                // Windows x64 has different callee-saved registers and TLS
                // conventions; this repro does not support it.
                @compileError("Unsupported os");
            }
            // x86_64 SysV: r12-r15, rbx, rbp, rsp -> 7 slots (see context_size).
            asm (
                \\ .global _switch_context
                \\ _switch_context:
                \\ .global switch_context
                \\ switch_context:
                \\ 
                \\ # Save all callee saved registers to current context.
                \\ # General
                \\ movq %r12, 0x00(%rdi)
                \\ movq %r13, 0x08(%rdi)
                \\ movq %r14, 0x10(%rdi)
                \\ movq %r15, 0x18(%rdi)
                \\ movq %rbx, 0x20(%rdi)
                \\ # Special
                \\ movq %rbp, 0x28(%rdi)
                \\ movq %rsp, 0x30(%rdi)
                \\ 
                \\ 
                \\ # Load target context.
                \\ # General
                \\ movq 0x00(%rsi), %r12
                \\ movq 0x08(%rsi), %r13
                \\ movq 0x10(%rsi), %r14
                \\ movq 0x18(%rsi), %r15
                \\ movq 0x20(%rsi), %rbx
                \\ # Special
                \\ movq 0x28(%rsi), %rbp
                \\ movq 0x30(%rsi), %rsp
                \\ 
                \\ ret
            );
        },
        else => @compileError("Unsupported cpu architecture"),
    }
}

// Number of usize slots saved per context; must match the store/load offsets
// in the assembly above (aarch64: x19-x28 + d8-d15 + fp/lr/sp = 21;
// x86_64: r12-r15 + rbx + rbp + rsp = 7).
const context_size =
    switch (builtin.cpu.arch) {
    .aarch64 => 21,
    .x86_64 => 7,
    else => @compileError("Unsupported cpu architecture"),
};

// Required stack alignment at context entry (16 on both targeted ABIs).
const STACK_ALIGN = 16;
// Backing stack for the repro coroutine: 16 MiB, zero-initialized.
var repro_stack: [16 * 1024 * 1024]u8 = std.mem.zeroes([16 * 1024 * 1024]u8);

/// Initializes `repro_context` so that the first switch into it begins
/// executing `repro` on an aligned stack carved out of `repro_stack`.
fn setup_repro_context() void {
    // Start from the high end of the buffer — the stack grows downward.
    var sp = @as([*]u8, @ptrCast(&repro_stack)) + repro_stack.len;

    switch (builtin.cpu.arch) {
        .aarch64 => {
            // Ensure stack is aligned.
            sp = @ptrFromInt(@intFromPtr(sp) & ~@as(usize, (STACK_ALIGN - 1)));

            // Slot indices must match the ldp/ldr offsets in the assembly:
            // fp at 18*8, lr at 19*8, sp at 20*8.
            const frame_pointer_index = 18;
            const return_pointer_index = 19;
            const stack_pointer_index = 20;
            repro_context[stack_pointer_index] = @intFromPtr(sp);
            repro_context[frame_pointer_index] = @intFromPtr(sp);
            // `lr` is restored to &repro, so switch_context's trailing `ret`
            // jumps into `repro`.
            repro_context[return_pointer_index] = @intFromPtr(&repro);
        },
        .x86_64 => {
            // Makes space to store the return address on the stack.
            sp -= @sizeOf(usize);
            // Ensure stack is aligned.
            sp = @ptrFromInt(@intFromPtr(sp) & ~@as(usize, (STACK_ALIGN - 1)));

            // switch_context's `ret` pops this slot, transferring control to
            // `repro`.
            const return_address_ptr = @as(*usize, @alignCast(@ptrCast(sp)));
            return_address_ptr.* = @intFromPtr(&repro);

            // Slot indices match the movq offsets: rbp at 0x28, rsp at 0x30.
            const frame_pointer_index = 5;
            const stack_pointer_index = 6;
            repro_context[stack_pointer_index] = @intFromPtr(sp);
            repro_context[frame_pointer_index] = @intFromPtr(sp);
        },
        else => @compileError("Unsupported cpu architecture"),
    }
}

Expected Behavior

The address of a threadlocal should be loaded every single time it is used. Zig only loads the address once for the entire repro function. This leads to printing the incorrect address. Even without the first print, Zig will load the address at the start of the function. This means it still incorrectly prints the thread-one address.

Expected:

info: thread 1 addr: usize@<ADDRESS1>
info: Error -> thread 2 addr: usize@<ADDRESS2>
info: Workaround -> thread 2 addr: usize@<ADDRESS2>

Just to be more explicit, the change in the output is that address on the middle line matches the last line instead of the first line.

alexrp commented 1 week ago

I think the compiler (LLVM, really) is well within its rights to assume that normal-looking code won't just suddenly switch to running on a different thread out of nowhere. You have not in any way told the compiler that such a thing could happen, so it's completely reasonable for it to cache computations of threadlocal variable addresses. I'd bet you could also run into some fun bugs around atomics and memory ordering here.

What's needed here is a way to communicate to the compiler that it shouldn't do that caching. See https://github.com/llvm/llvm-project/issues/98479 and https://github.com/llvm/llvm-project/issues/111257. This is not a problem we can solve on the Zig side without LLVM having some mechanism for this.

rohlem commented 1 week ago

Totally out of my depth, but for asm writing to memory there's this memory clobber annotation I've seen around. Maybe a similar solution could be to introduce a type of threadlocal (address) clobber for asm statements? (Which probably needs to include LLVM support for our LLVM backend.)

Or alternatively, maybe you could find a way to load the TLS address yourself in an asm block (might need to be volatile asm?) so the compilation pipeline doesn't interfere with your intent? That would still mean normal userland uses of threadlocal are broken in a function that transitively contains context switching, but maybe a helper function load_tls("name") is a desirable workaround to re-implement a library solution in userland?

bhansconnect commented 1 week ago

> I think the compiler (LLVM, really) is well within its rights to assume that normal-looking code won't just suddenly switch to running on a different thread out of nowhere.

Totally fair. For some reason, I thought this worked in clang. But it doesn't: https://godbolt.org/z/T5rTcfE3Y So it's not a Zig-specific issue.

Looks like the way to make clang understand this is to add an extra layer of indirection with a volatile pointer. So I guess I should expect the same from zig.


I guess what surprises me the most, and looks to be a difference between Zig and clang, is that the threadlocal seems to always be loaded at the beginning of the function in Zig. This still fails in Zig:

fn repro() void {

    // We context switch to suspend this execution
    switch_context(&repro_context, &main_context);

    // We load the address after the context switch (but zig actually loads it before)
    const thread2_addr = &thread_local;
    // NOTE(review): `thread1_addr` below is not declared in this version of the
    // snippet — presumably a leftover from the original repro above; the first
    // log line would not compile as written. Verify with the reporter.
    std.log.info("thread 1 addr: {*}", .{thread1_addr});
    std.log.info("Error -> thread 2 addr: {*}", .{thread2_addr});
    workaround();

    // Go back to main to exit the program.
    switch_context(&repro_context, &main_context);
}

Doing the same in clang will generate the expected results. It will not load the threadlocal until after returning from switch_context. So I would arguably say that this part is still a bug.


clobber for asm statements?

global assembly doesn't get clobbers.

might need to be volatile asm?

volatile doesn't mean anything for global assembly.