google / wuffs

Wrangling Untrusted File Formats Safely
Other
4.16k stars 131 forks source link

Errors when decoding gzip with many short reads and occasional harvesting of the output buffer #52

Closed joelgibson closed 3 years ago

joelgibson commented 3 years ago

Hi, I've been playing around with the library and come across a strange problem (which could be entirely due to me holding it wrong). It came up in the context of using the gzip decoder during some buffered I/O, I've tried to replicate the conditions in the following C program below (please excuse the whacky constants and sloppy coding while I was trying to get to a minimal example).

The rough conditions for this error are (a) there are short reads, and (b) the destination buffer is sometimes emptied and sometimes not. In the example below, I decode 5 bytes at a time, and on every 3rd loop empty the destination buffer to the console. The decoding runs off the rails and gets the wrong answer (and a bad checksum error at the end). If I empty the destination buffer on every loop instead (or simply reset dst.meta = {0} on every loop), the decode runs correctly. Alternatively, if I leave the destination buffer alone, letting it fill up, the decode runs correctly.

Perhaps I am misunderstanding something about the contract to do with coroutine suspension: is the caller allowed to modify the dst buffer after a short read? My assumption was that while a coroutine was not running, the caller is free to compact buffers as they please. My other assumption was that after a coroutine suspends, it has given away the right to access any of the bytes in the dst buffer again.

Cheers, Joel

#include <stdint.h>
#include <stdio.h>
#define WUFFS_IMPLEMENTATION
#include "wuffs-v0.3.c"

// First 50 Fibonacci numbers, gzipped using Python's gzip module.
uint8_t inbuf[183] = {
  0x1f, 0x8b, 0x08, 0x00, 0x07, 0x22, 0x16, 0x61, 0x02, 0xff, 0x15, 0x8f, 0xc1, 0x11, 0x00, 0x31, 0x08, 0x02, 0xff, 0x54, 0x23, 0x6a, 0x44, 0xfb, 0x6f,
  0xec, 0xb8, 0xc9, 0xcb, 0x59, 0x09, 0x6b, 0x80, 0x7e, 0x89, 0xc2, 0xc3, 0x82, 0x85, 0x24, 0xaa, 0xf1, 0x3c, 0x1d, 0xd8, 0x8d, 0xac, 0x42, 0x49, 0x18,
  0x06, 0x6e, 0x05, 0xbe, 0x13, 0xf2, 0x6d, 0xa3, 0xb9, 0xc4, 0x68, 0x1e, 0x18, 0xd7, 0x03, 0x4a, 0xf4, 0x57, 0x3b, 0x4f, 0xe8, 0xa9, 0x59, 0xe8, 0x45,
  0x9a, 0x26, 0xeb, 0x0a, 0xbc, 0x71, 0x02, 0x45, 0xad, 0xd7, 0x1e, 0x3b, 0xf3, 0xb0, 0x95, 0xd1, 0xe1, 0xde, 0x9e, 0x9c, 0x73, 0xb9, 0xb6, 0xe2, 0x50,
  0x2f, 0xfb, 0x69, 0xf1, 0x14, 0xb9, 0x2e, 0xbd, 0x4c, 0xf5, 0x5f, 0xd4, 0x57, 0x61, 0x88, 0x6c, 0x9a, 0x53, 0xa8, 0x8b, 0x5d, 0x3a, 0x3a, 0xe5, 0xc8,
  0xad, 0x35, 0xc2, 0xca, 0xc6, 0xde, 0x9e, 0xf7, 0x36, 0xd8, 0x96, 0x1a, 0x9d, 0x0b, 0x6f, 0xd0, 0x66, 0xd7, 0x5d, 0x82, 0x4c, 0x62, 0xe5, 0xf3, 0xe8,
  0xfa, 0x0b, 0x8b, 0x59, 0x64, 0x6b, 0x8a, 0xf4, 0x84, 0x3c, 0xd9, 0xfc, 0x85, 0x0a, 0xbd, 0xa1, 0x67, 0x3f, 0x0d, 0x24, 0xad, 0xda, 0xd2, 0x87, 0x0f,
  0x12, 0x69, 0x4a, 0xa8, 0x3b, 0x01, 0x00, 0x00,
};
wuffs_base__io_buffer src = {
  .data = {
    .ptr = inbuf,
    .len = sizeof(inbuf),
  },
  .meta = {0}
};

// "Infinite" output buffer (we won't use much of it).
uint8_t outbuf[10 * 1024] = {0};
wuffs_base__io_buffer dst = {
  .data = {
    .ptr = outbuf,
    .len = sizeof(outbuf),
  },
  .meta = {0}
};

int main(void) {
  uint8_t workbuf[1];
  wuffs_base__status status;

  // Init decoder
  wuffs_gzip__decoder dec;
  status = wuffs_gzip__decoder__initialize(&dec, sizeof dec, WUFFS_VERSION, 0);
  if (!wuffs_base__status__is_ok(&status)) {
    printf("Could not init decoder: %s\n", status.repr);
    return 1;
  }

  // Decode 5 bytes at a time. On every third loop, print the output and empty the dst buffer.
  for (int i = 0; i < sizeof(inbuf); i += 5) {
    if (i >= sizeof(inbuf)) {
      i = sizeof(inbuf);
      src.meta.closed = 1;
    }

    src.meta.wi = i;
    status = wuffs_gzip__decoder__transform_io(&dec, &dst, &src, (wuffs_base__slice_u8){.ptr = workbuf, .len = sizeof(workbuf)});
    if (status.repr != wuffs_base__suspension__short_read) {
      printf("Expecting a short read, was %s\n", status.repr);
      return 1;
    }

    // Change to i % 5 == 0 to have things run correctly.
    if (i % 15 == 0) {
      fwrite(dst.data.ptr + dst.meta.ri, sizeof(uint8_t), dst.meta.wi - dst.meta.ri, stdout);
      dst.meta.ri = dst.meta.wi;
      wuffs_base__io_buffer__compact(&dst);
    }
  }

  // Check whether decoding was successful.
  src.meta.wi = sizeof(inbuf);
  status = wuffs_gzip__decoder__transform_io(&dec, &dst, &src, (wuffs_base__slice_u8){.ptr = workbuf, .len = sizeof(workbuf)});
  if (status.repr != NULL) {
    printf("Decode unsuccessful: %s\n", status.repr);
    return 1;
  }

  return 0;
}
nigeltao commented 3 years ago

You're not holding it wrong. Thanks for the great repro!