webrecorder / warcio.js

JS Streaming WARC IO optimized for Browser and Node
MIT License
30 stars 6 forks source link

Parsing fails with TypeError [ERR_INVALID_ARG_TYPE]: The "input" argument must be an instance of ArrayBuffer or ArrayBufferView. Received null #33

Closed phiresky closed 2 years ago

phiresky commented 2 years ago

With the file fr-base.warc from your fr.wacz file, the following code:

// Minimal reproduction script: parse a WARC file with warcio and fully read
// each record as it is yielded.
const { WARCParser } = require("warcio");
const fs = require("fs");

// Stream the WARC file from disk instead of loading it into memory at once.
const warcStream = fs.createReadStream("fr-base.warc");

/**
 * Iterates over every record produced by the parser, logs the record, and
 * then reads it to the end before moving on to the next one.
 *
 * @returns {Promise<void>} Settles once all records have been read; rejects
 *   if iterating or reading a record throws.
 */
async function test() {
  const parser = new WARCParser(warcStream);

  for await (const x of parser) {
    // Log first so the record is printed even if reading it fails afterwards.
    console.log(x);
    // NOTE(review): `true` presumably requests decoded content (the failing
    // trace passes through `dechunk`) — confirm against the warcio docs.
    await x.readFully(true);
  }
}

// Intentionally not awaited or caught: a rejection surfaces as Node's
// unhandled-rejection error output.
test();

fails with the following output (the record is logged first, then the error is thrown):

WARCRecord {
  warcHeaders: StatusAndHeaders {
    statusline: 'WARC/1.0',
    headers: Headers { [Symbol(map)]: [Object: null prototype] }
  },
  _reader: LimitReader {
    sourceIter: AsyncIterReader {
      compressed: 'gzip',
      opts: [Object],
      inflator: [NoConcatInflator],
      _sourceIter: [Object [AsyncGenerator]],
      lastValue: <Buffer 1f 8b 08 00 00 00 00 00 02 13 b5 53 5d 6f da 30 14 7d 47 e2 3f 58 7d d8 c3 86 c1 90 0f a8 3b 36 31 be 86 4a 4a 21 a9 5a f5 cd 75 9c 60 2d d8 99 ed d0 ... 35581 more bytes>,
      errored: false,
      _savedChunk: [Uint8Array],
      _rawOffset: 1013665,
      _readOffset: 2320073,
      numChunks: 229
    },
    length: 730,
    limit: 0,
    skip: 0
  },
  _contentReader: null,
  payload: null,
  httpHeaders: StatusAndHeaders {
    statusline: 'HTTP/1.1 200 OK',
    headers: Headers { [Symbol(map)]: [Object: null prototype] }
  },
  consumed: false
}
node:internal/errors:464
    ErrorCaptureStackTrace(err);
    ^

TypeError [ERR_INVALID_ARG_TYPE]: The "input" argument must be an instance of ArrayBuffer or ArrayBufferView. Received null
    at new NodeError (node:internal/errors:371:5)
    at TextDecoder.decode (node:internal/encoding:413:15)
    at AsyncIterReader.dechunk (./node_modules/warcio/src/readers.js:153:31)
    at processTicksAndRejections (node:internal/process/task_queues:96:5)
    at async AsyncIterReader._loadNext (./node_modules/warcio/src/readers.js:139:17)
    at async AsyncIterReader._next (./node_modules/warcio/src/readers.js:231:17)
    at async AsyncIterReader.[Symbol.asyncIterator] (./node_modules/warcio/src/readers.js:293:21)
    at async WARCRecord.[Symbol.asyncIterator] (./node_modules/warcio/src/warcrecord.js:239:22)
    at async Function.readFully (./node_modules/warcio/src/readers.js:53:22)
    at async WARCRecord.readFully (./node_modules/warcio/src/warcrecord.js:181:22) {
  code: 'ERR_INVALID_ARG_TYPE'
}

When calling await x.readFully(false) instead of await x.readFully(true), parsing does not fail.