ziglang / zig

General-purpose programming language and toolchain for maintaining robust, optimal, and reusable software.
https://ziglang.org
MIT License
33.61k stars 2.46k forks source link

intermittent TlsConnectionTruncated #14573

Open marler8997 opened 1 year ago

marler8997 commented 1 year ago

Zig Version

0.11.0-dev.1507+6f13a725a

Steps to Reproduce and Observed Behavior

I found this intermittent issue when trying to use the new std TLS implementation for zigup. I've created a small application that will intermittently reproduce the issue. It's extremely intermittent. Once I had 30 successful runs before I saw it, but usually I see it before 10 runs, sometimes after the first few runs as well. Seems to happen on CI pretty often though (see https://github.com/marler8997/zigup/actions/runs/4095377532/jobs/7066880134), it's odd. It also seems to be sensitive to the "read buffer size".

Here's the program:

const std = @import("std");

// play around with this size to see different kinds of errors
var buf: [8192]u8 = undefined;

pub fn main() !void {
    const host = "ziglang.org";
    const port = 443;
    const resource = "https://ziglang.org/download/0.7.0/zig-linux-x86_64-0.7.0.tar.xz";
    const content_len = 37154432; // hardcoded, always the same

    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);

    const stream = try std.net.tcpConnectToHost(arena.allocator(), host, port);
    defer stream.close();

    var ca_bundle = std.crypto.Certificate.Bundle{ };
    defer ca_bundle.deinit(arena.allocator());

    try ca_bundle.rescan(arena.allocator());

    var client = try std.crypto.tls.Client.init(stream, ca_bundle, host);

    const request = try std.fmt.bufPrint(
        &buf,
           "GET {s} HTTP/1.1\r\n"
        ++ "Host: {s}\r\n"
        ++ "Connection: close\r\n"
        ++ "\r\n",
        .{ resource, host});
    std.log.info("sending request\n----\n{s}----\n", .{request});
    try client.writeAll(stream, request);

    var total_read: usize = 0;
    const end_of_headers = blk: {
        while (true) {
            const len = try client.read(stream, buf[total_read..]);
            std.debug.assert(len != 0);
            total_read += len;

            if (std.mem.indexOf(u8, buf[0 .. total_read], "\r\n\r\n")) |eoh| break :blk eoh;
        }
    };
    std.log.info("received headers\n----\n{s}\n----\n", .{buf[0 .. end_of_headers]});

    const total_expected = end_of_headers + 4 + content_len;
    while (true) {
        const len = try client.read(stream, &buf);
        if (len == 0)
            break;
        std.log.info("got {} bytes", .{len});
        total_read += len;
    }

    if (total_read != total_expected) {
        std.log.err("expected {} bytes but got {}", .{total_expected, total_read});
        std.os.exit(0xff);
    }
    std.log.info("Success!", .{});
}

This program sends a simple HTTP request to download the zig 0.7.0 linux tarball release. A non-successful run (can take running a bunch of times to see) looks like this:

$ zig build-exe tlsbug.zig
$ ./tlsbug
info: sending request
----
GET https://ziglang.org/download/0.7.0/zig-linux-x86_64-0.7.0.tar.xz HTTP/1.1
Host: ziglang.org
Connection: close

----

info: received headers
----
HTTP/1.1 200 OK
Content-Type: application/x-xz
Content-Length: 37154432
Connection: close
Date: Fri, 03 Feb 2023 23:12:59 GMT
x-amz-meta-s3cmd-attrs: atime:1604890484/ctime:1604890546/gid:100/gname:users/md5:b29ab9c96c8f7963b36e11511f75447a/mode:33188/mtime:1604890546/uid:1000/uname:andy
Cache-Control: public, max-age=31536000, immutable
Last-Modified: Mon, 09 Nov 2020 03:18:34 GMT
ETag: "97e705bb6119a7c60a3e589506224e19-3"
Server: AmazonS3
X-Cache: Hit from cloudfront
Via: 1.1 b2d3922a177f6cecf9222a78a0a1ad32.cloudfront.net (CloudFront)
X-Amz-Cf-Pop: SEA19-C3
X-Amz-Cf-Id: Hok7dxOfwRHyeB9jTuDGMpV01C1wn9T-29QGVKGvBajduqZa46XlNA==
Age: 162960
----

info: got 8192 bytes
[ ... snipped a bunch of duplicate lines ... ]
info: got 8192 bytes
error: TlsConnectionTruncated
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/crypto/tls/Client.zig:917:13: 0x3670f8 in readvAdvanced__anon_7479 (tlsbug)
            return error.TlsConnectionTruncated;
            ^
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/crypto/tls/Client.zig:831:19: 0x36645b in readvAtLeast__anon_7478 (tlsbug)
        var amt = try c.readvAdvanced(stream, iovecs[vec_i..]);
                  ^
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/crypto/tls/Client.zig:792:5: 0x36623f in readAtLeast__anon_7476 (tlsbug)
    return readvAtLeast(c, stream, &iovecs, len);
    ^
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/crypto/tls/Client.zig:797:5: 0x3661c1 in read__anon_7475 (tlsbug)
    return readAtLeast(c, stream, buffer, 1);
    ^
/home/marler8997/git/ziget/tlsbug.zig:47:21: 0x36fd12 in main (tlsbug)
        const len = try client.read(stream, &buf);
                    ^

If I change the buffer size to 8192 * 10, then I get a different error:

$ ./tlsbug
[ .. output snipped ... ]
info: got 81920 bytes
thread 3123151 panic: reached unreachable code
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/debug.zig:281:14: 0x308b9c in assert (tlsbug)
    if (!ok) unreachable; // assertion failure
             ^
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/mem.zig:203:30: 0x301f57 in copy__anon_4376 (tlsbug)
    assert(dest.len >= source.len);
                             ^
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/crypto/tls/Client.zig:993:30: 0x368964 in readvAdvanced__anon_7479 (tlsbug)
            mem.copy(u8, frag[0..in], first);
                             ^
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/crypto/tls/Client.zig:831:53: 0x36643a in readvAtLeast__anon_7478 (tlsbug)
        var amt = try c.readvAdvanced(stream, iovecs[vec_i..]);
                                                    ^
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/crypto/tls/Client.zig:792:24: 0x366212 in readAtLeast__anon_7476 (tlsbug)
    return readvAtLeast(c, stream, &iovecs, len);
                       ^
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/crypto/tls/Client.zig:797:23: 0x366194 in read__anon_7475 (tlsbug)
    return readAtLeast(c, stream, buffer, 1);
                      ^
/home/marler8997/git/ziget/tlsbug.zig:48:36: 0x36fcc8 in main (tlsbug)
        const len = try client.read(stream, &buf);
                                   ^
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/start.zig:616:37: 0x2ff1a9 in posixCallMainAndExit (tlsbug)
            const result = root.main() catch |err| {
                                    ^
/home/marler8997/zig/0.11.0-dev.1507+6f13a725a/files/lib/std/start.zig:376:5: 0x2fec11 in _start (tlsbug)
    @call(.never_inline, posixCallMainAndExit, .{});
    ^
Aborted (core dumped)

Another thing to note is that in a wireshark trace, nothing coming from the server seems "out of the ordinary" as far as I can see. The server isn't trying to close the connection, it seems to just be sending the data and then suddenly the client closes the connection with "RST" packets.

Expected Behavior

The commands above to build and then run tlsbug.zig should end with output that looks like this:

[ ... output snipped ... ]
info: got 81920 bytes
info: got 61722 bytes
info: Success!

The program shouldn't intermittently fail.

hryx commented 1 year ago

On 0.12.0-dev.5+fc6e57568 (macOS x64_64) I see a out-of-bounds panic instead of an error:

----

info: got 8192 bytes
info: got 8192 bytes
info: got 8192 bytes
thread 4669890 panic: index out of bounds: index 18427, len 16645
/Users/stevie/bin/lib/std/crypto/tls/Client.zig:1251:81: 0x1048d390a in finishRead2 (main)
        @memcpy(c.partially_read_buffer[c.partial_ciphertext_idx + first.len ..][0..frag1.len], frag1);
                                                                                ^
/Users/stevie/bin/lib/std/crypto/tls/Client.zig:1071:35: 0x1048cdb92 in readvAdvanced__anon_4373 (main)
                return finishRead2(c, first, frag1, vp.total);
                                  ^
/Users/stevie/bin/lib/std/crypto/tls/Client.zig:900:38: 0x1048d98b5 in readvAtLeast__anon_4372 (main)
        var amt = try c.readvAdvanced(stream, iovecs[vec_i..]);
                                     ^
/Users/stevie/bin/lib/std/crypto/tls/Client.zig:861:24: 0x1048d9d70 in readAtLeast__anon_4370 (main)
    return readvAtLeast(c, stream, &iovecs, len);
                       ^
/Users/stevie/bin/lib/std/crypto/tls/Client.zig:866:23: 0x1048d9e42 in read__anon_4369 (main)
    return readAtLeast(c, stream, buffer, 1);
                      ^
/Users/stevie/tmp/14573/main.zig:46:36: 0x1048da6d0 in main (main)
        const len = try client.read(stream, &buf);
                                   ^
/Users/stevie/bin/lib/std/start.zig:574:37: 0x104987300 in main (main)
            const result = root.main() catch |err| {

The panic is consistent for me, but the number of "info: got XXX bytes" logs that happens before the panic varies wildly between runs. I'm seeing everything between zero and 45 read chunks before the failure.

With double the read buffer size (8192 * 2), the request succeeds consistently. With anything smaller I get the above.

Parzival-3141 commented 12 months ago

I'm also experiencing the issue @hryx mentioned on macOS ARM M2 0.12.0-dev.294+4d1432299 This issue is a bit over my head, but I've narrowed it down a bit. The repro seems to always fail in this call to finishRead2() https://github.com/ziglang/zig/blob/d2014fe9713794f6cc0830301a1110d5e92d0ff0/lib/std/crypto/tls/Client.zig#L1066-L1072

frag1 is too large here (line 1251), causing the out-of-bounds error. It seems to grow before every invocation. https://github.com/ziglang/zig/blob/d2014fe9713794f6cc0830301a1110d5e92d0ff0/lib/std/crypto/tls/Client.zig#L1245-L1251

And it's set here https://github.com/ziglang/zig/blob/d2014fe9713794f6cc0830301a1110d5e92d0ff0/lib/std/crypto/tls/Client.zig#L1001-L1010

ianic commented 1 month ago

Duplicate of #15226, fixed by #20587. Can be closed now.

marler8997 commented 1 month ago

I just tried this on 0.13.0, set the buffer size to 300 and I got a panic:

info: sending request
----
GET https://ziglang.org/download/0.7.0/zig-linux-x86_64-0.7.0.tar.xz HTTP/1.1
Host: ziglang.org
Connection: close

----

thread 16080 panic: index out of bounds: index 1, len 1
C:\tools\zig\0.13.0\files\lib\std\crypto\tls\Client.zig:931:29: 0xcb55e1 in readvAtLeast__anon_5018 (bug.exe.obj)
        while (amt >= iovecs[vec_i].len) {
                            ^
C:\tools\zig\0.13.0\files\lib\std\crypto\tls\Client.zig:889:24: 0xcb5881 in readAtLeast__anon_5016 (bug.exe.obj)
    return readvAtLeast(c, stream, &iovecs, len);
                       ^
C:\tools\zig\0.13.0\files\lib\std\crypto\tls\Client.zig:894:23: 0xcb5931 in read__anon_5015 (bug.exe.obj)
    return readAtLeast(c, stream, buffer, 1);
                      ^
C:\temp\bug.zig:38:40: 0xcb5da4 in main (bug.exe.obj)
            const len = try client.read(stream, buf[total_read..]);
                                       ^
C:\tools\zig\0.13.0\files\lib\std\start.zig:363:53: 0xd60d7c in WinStartup (bug.exe.obj)
    std.os.windows.ntdll.RtlExitUserProcess(callMain());
                                                    ^
???:?:?: 0x7ffa234a257c in ??? (KERNEL32.DLL)
???:?:?: 0x7ffa24b0af27 in ??? (ntdll.dll)

The TLS implementation is pretty riddled with bugs atm, I noticed multiple when I was working on fixing this.

ianic commented 1 month ago

In this case, buffer size 300, example from the first post in this thread, in line:

C:\temp\bug.zig:38:40: 0xcb5da4 in main (bug.exe.obj)
            const len = try client.read(stream, buf[total_read..]);

read is called with zero size buffer. total_read is already 300. read in tls.Client is not guarded against calling with zero size buffer. If I make buf large enough to get all headers (666 in this case) then zig master succeeds and 0.13 fails with the panic fixed in #20587.