Cloudef / zig-aio

io_uring like asynchronous API and coroutine powered IO tasks for zig
https://cloudef.github.io/zig-aio/
MIT License
175 stars 6 forks source link

Inconsistency in coro io.Timeout, sometimes it pauses execution indefinitely #22

Closed Bob-van closed 2 weeks ago

Bob-van commented 3 weeks ago

I have been experimenting with the coro backend. I wrote some basic tests and encountered one that behaves weird. It is non-deterministic, which is expected to a async backed, but this inconsistency is really interesting. The weird thing is that it sometimes works, and many times just fails in numerous ways. I could be using it wrong, but don't think i am this time.

Compilation is on 0.13 branch, with windows as platform (haven't tried elsewhere yet), tested in both Debug and Release builds. Ran at std.Thread.getCpuCount() = 16

const std = @import("std");
const aio = @import("aio");
const coro = @import("coro");

var scheduler: coro.Scheduler = undefined;

fn protection(comptime func: anytype, args: anytype) void {
    //i know the task can capture error, this is for ease of debugging
    @call(.always_inline, func, args) catch |err| {
        logErr("Task FAILED with: {s}\n", .{@errorName(err)});
    };
}

pub fn logInfo(comptime format: []const u8, args: anytype) void {
    std.io.getStdOut().writer().print(format, args) catch return;
}

pub fn logErr(comptime format: []const u8, args: anytype) void {
    std.io.getStdErr().writer().print(format, args) catch return;
}

pub fn delay(ns: u128) !void {
    try coro.io.single(aio.Timeout{ .ns = ns });
}

pub fn main() !void {
    logInfo("\nStarted!\n", .{});

    var gpa = std.heap.GeneralPurposeAllocator(.{ .thread_safe = true }){};
    const allocator = gpa.allocator();
    defer _ = gpa.deinit();

    const tmp: []usize = try allocator.alloc(usize, 500);
    defer allocator.free(tmp);

    //in theory not necessary, but for the good meassure
    for (tmp) |*index| {
        index.* = 0;
    }

    scheduler = try coro.Scheduler.init(allocator, .{});
    defer scheduler.deinit();

    _ = try scheduler.spawn(protection, .{ spawner, .{tmp} }, .{});

    try scheduler.run(.wait);

    logInfo("\nProgram finished correctly!\n", .{});
}

pub fn spawner(tmp: []usize) !void {
    for (0..500) |i| {
        _ = try scheduler.spawn(protection, .{ logger, .{ i, tmp } }, .{});
    }
    try delay(std.time.ns_per_s * 25); // logging takes 20s + 5s for safety
    for (tmp, 0..) |value, index| {
        logInfo("{}. {}\n", .{ index + 1, value });
    }
    //just test for protection layer
    return error.Unexpected;
}

pub fn logger(index: usize, tmp: []usize) !void {
    for (0..100) |i| {
        try delay(std.time.ns_per_ms * 200);
        tmp[index] = i + 1;
    }
    logInfo("{}. Finished\n", .{index + 1});
}
steeve commented 3 weeks ago

I actually came here to report that same problem, here is my debug log:

debug(coro): spawned: 10094c190:active
debug(coro): spawned: 100a50190:active
TASK 1 BEGIN
TASK 1 END
debug(coro): yielding: 100a50190:completed
debug(coro): spawned: 100b54190:active
TASK 2 BEGIN
debug(aio_uringlator): queue: 0: aio.ops.Operation.timeout, unlinked (null)
debug(coro): yielding: 100b54190:io
debug(coro): complete: 100a50190:completed, wait
debug(coro): deinit: 100a50190:completed
COPCOUCOU asava 64
debug(coro): complete: 100b54190:io, wait
debug(coro): yielding: 10094c190:waiting_frame
debug(aio_uringlator): perform: 0: aio.ops.Operation.timeout
debug(aio_uringlator): finish: 0 error.Success
debug(aio_uringlator): complete: 0: aio.ops.Operation.timeout [OK]
debug(coro): waking up: 100b54190:io
TASK 2 END
debug(coro): yielding: 100b54190:completed

tasks:

pub fn sleep(ns: u64) void {
    coro.io.single(aio.Timeout{
        .ns = @intCast(ns),
    }) catch @panic("Unable to sleep");
}
....
    var frame = try asyncc(struct {
        fn call(i: i32) i32 {
            log("TASK 1 BEGIN\n", .{});
            defer log("TASK 1 END\n", .{});
            sleep(1 * std.time.ns_per_s);
            return 42 + i;
        }
    }.call, .{22});

    var frame2 = try asyncc(struct {
        fn call(i: i32) i32 {
            log("TASK 2 BEGIN\n", .{});
            defer log("TASK 2 END\n", .{});
            sleep(1 * std.time.ns_per_s);
            return 42 + i;
        }
    }.call, .{22});
steeve commented 3 weeks ago

Actually, I got the same problem by spawning 10 coroutines, without sleeping, just doing IO through coro.io

Bob-van commented 3 weeks ago

Can you specify your platform and branch, so we can know whether its platform / branch specific?

steeve commented 3 weeks ago

Here is a small repro, on current master (39706e46d60886ce53b24a8d5c35746e98e614df):

const std = @import("std");
const aio = @import("aio");
const coro = @import("coro");

pub fn sleep(ns: u64) void {
    coro.io.single(aio.Timeout{
        .ns = @intCast(ns),
    }) catch @panic("Unable to sleep");
}

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();

    var scheduler = try coro.Scheduler.init(gpa.allocator(), .{});
    defer scheduler.deinit();
    var pool = try coro.ThreadPool.init(gpa.allocator(), .{});
    defer pool.deinit();

    _ = try scheduler.spawn(asyncMain, .{&scheduler}, .{});
    try scheduler.run(.wait);
}

pub fn asyncMain(scheduler: *coro.Scheduler) !void {
    std.debug.print("begin asyncMain\n", .{});
    defer std.debug.print("end asyncMain\n", .{});
    var frame = try scheduler.spawn(sleep, .{1 * std.time.ns_per_ms}, .{});
    std.debug.print("frame.isComplete = {any}\n", .{frame.isComplete()});
    frame.complete(.wait);
}

It hangs on the first frame.complete:

$ ~/c/g/z/zig-aio (master)> ./zig/zig build test
begin asyncMain
frame.isComplete = false
steeve commented 3 weeks ago

With debug:

~/c/g/z/zig-aio (master)> ./zig/zig build pouet
debug(coro): spawned: 102c4c190:active
begin asyncMain
debug(coro): spawned: 102d50190:active
debug(aio_uringlator): queue: 0: aio.ops.Operation.timeout, unlinked (null)
debug(coro): yielding: 102d50190:io
frame.isComplete = false
debug(coro): complete: 102d50190:io, wait
debug(coro): yielding: 102c4c190:waiting_frame
debug(aio_uringlator): perform: 0: aio.ops.Operation.timeout
debug(aio_uringlator): finish: 0 error.Success
debug(aio_uringlator): complete: 0: aio.ops.Operation.timeout [OK]
debug(coro): waking up: 102d50190:io
debug(coro): yielding: 102d50190:completed
Cloudef commented 2 weeks ago

Thanks for the sample, I'll check it out

steeve commented 2 weeks ago

Note that this is not specific to aio.Timeout in my experience. I had it append with aio.Write, too.

Cloudef commented 2 weeks ago

Can you try the io_uring backend?

steeve commented 2 weeks ago

I did, same

Bob-van commented 2 weeks ago

I have tested my "example code" on linux posix and io_uring implementations. Compilation is on 0.13 branch, tested in both Debug and Release builds. Behaves the same . . .

Bob-van commented 2 weeks ago

So to sum it up, the "bug" appears in both master and 0.13 branch on all platforms regardless of optimalizations.

steeve commented 2 weeks ago

Looking at the code, I'm seeing that Frame.complete will add the current coroutine to the waiters of the target frame:

pub fn complete(self: *@This(), mode: CompleteMode, comptime Result: type) Result {
    std.debug.assert(!self.canceled);
    debug("complete: {}, {s}", .{ self, @tagName(mode) });

    if (current()) |frame| {
        while (self.status != .completed) {
            if (mode == .cancel and tryCancel(self)) break;
            self.waiters.prepend(&frame.wait_link);
            defer self.waiters.remove(&frame.wait_link);
            yield(.waiting_frame);
        }

Waiters are then woken up via Frame.wakeupWaiters, but I only see it called in the deinit, which is never called because the current coroutine is yielded in a .waiting_frame status. So Frame.deinit is never called.

Digging further.

steeve commented 2 weeks ago

From what I could gather so far:

Manually calling frame.wakeUpWaiters in Frame.yield does make the calling coroutine continue, but then I get a crash when the root coroutine finishes

Cloudef commented 2 weeks ago

I'm currently very busy with real life stuff, but I should be able to take a look at least during weekend.

steeve commented 2 weeks ago

I'm currently very busy with real life stuff, but I should be able to take a look at least during weekend.

No worries take your time! Thank you for this great project

Cloudef commented 2 weeks ago

I've identified the issue. While adding some tests I found some other weirdness with io_uring that I'm currently investigating.

Cloudef commented 1 week ago

@Bob-van I moved your original example to #31