ironSource / parquetjs

fully asynchronous, pure JavaScript implementation of the Parquet file format
MIT License
349 stars 176 forks source link

Problems with reader and deep schemas #42

Open ZJONSSON opened 6 years ago

ZJONSSON commented 6 years ago

Here is an example of a schema that is three levels deep. Shredding and materializing a single record works fine; however, writing a parquet file and reading it back results in an error:

const parquet = require('parquetjs');

// Nested schema: group `a` contains group `b`, which contains the UTF8 leaf `c`.
const schema = new parquet.ParquetSchema({
  a: {
    fields: {
      b: {
        fields: {
          c: { type: 'UTF8' },
        },
      },
    },
  },
});

// Single sample record shaped like the schema above.
const rec = { a: { b: { c: 'this is a test' } } };

/**
 * Reproduction driver for the deep-schema read failure.
 *
 * Step 1 shreds `rec` into an in-memory buffer and materializes it back,
 * logging the result. Step 2 writes `rec` to 'test.parquet', reopens the
 * file, and logs every record the cursor yields.
 *
 * @returns {Promise<void>} Resolves once the reader has been closed; rejects
 *   with whatever error the writer, reader, or cursor raised.
 */
async function main() {
  // shred & materialize:
  console.log('shread & materialize:');
  const buf = {};
  parquet.ParquetShredder.shredRecord(schema, rec, buf);
  console.log(parquet.ParquetShredder.materializeRecords(schema, buf));

  // writer and reader
  console.log('writer & reader:');
  const writer = await parquet.ParquetWriter.openFile(schema, 'test.parquet');
  await writer.appendRow(rec);
  await writer.close();

  const reader = await parquet.ParquetReader.openFile('test.parquet');
  try {
    const cursor = reader.getCursor();
    // The loop stops at the first falsy value returned by cursor.next().
    let record = await cursor.next();
    while (record) {
      console.log(record);
      record = await cursor.next();
    }
  } finally {
    // Close the reader even when cursor.next() throws, so it is not left open.
    await reader.close();
  }
}

// Run the reproduction; log the resolved value on success or the error on failure.
main().then(
  (result) => console.log(result),
  (error) => console.log(error),
);

Output is:

shread & materialize:
[ { a: { b: [Object] } } ]
writer & reader:
TypeError: Cannot read property 'rLevelMax' of undefined
    at ParquetEnvelopeReader.readColumnChunk (/home/zjonsson/git/parquetjs/lib/reader.js:344:24)
    at <anonymous>
ZJONSSON commented 6 years ago

The problem seems to be with how the reader reconstructs the schema from the parquet file.

If I log the original fields from the schema (i.e. console.log(schema.fields)), I get:

{
  "a": {
    "name": "a",
    "path": [
      "a"
    ],
    "repetitionType": "REQUIRED",
    "rLevelMax": 0,
    "dLevelMax": 0,
    "isNested": true,
    "fieldCount": 1,
    "fields": {
      "b": {
        "name": "b",
        "path": [
          "a",
          "b"
        ],
        "repetitionType": "REQUIRED",
        "rLevelMax": 0,
        "dLevelMax": 0,
        "isNested": true,
        "fieldCount": 1,
        "fields": {
          "c": {
            "name": "c",
            "path": [
              "a",
              "b",
              "c"
            ],
            "repetitionType": "REQUIRED",
            "rLevelMax": 0,
            "dLevelMax": 0,
            "isNested": true,
            "fieldCount": 1,
            "fields": {
              "d": {
                "name": "d",
                "primitiveType": "BYTE_ARRAY",
                "originalType": "UTF8",
                "path": [
                  "a",
                  "b",
                  "c",
                  "d"
                ],
                "repetitionType": "REQUIRED",
                "encoding": "PLAIN",
                "compression": "UNCOMPRESSED",
                "rLevelMax": 0,
                "dLevelMax": 0
...

However if I look at the schema created by the reader (i.e. console.log(reader.schema.fields)) I get:

{
  "a": {
    "name": "a",
    "path": [
      "a"
    ],
    "repetitionType": "REQUIRED",
    "rLevelMax": 0,
    "dLevelMax": 0,
    "isNested": true,
    "fieldCount": 1,
    "fields": {
      "b": {
        "name": "b",
        "path": [
          "a",
          "b"
        ],
        "repetitionType": "REQUIRED",
        "rLevelMax": 0,
        "dLevelMax": 0,
        "isNested": true,
        "fieldCount": 0,
        "fields": {}
      }
    }
  },
  "c": {
    "name": "c",
    "path": [
      "c"
    ],
    "repetitionType": "REQUIRED",
    "rLevelMax": 0,
    "dLevelMax": 0,
    "isNested": true,
    "fieldCount": 1,
    "fields": {
      "d": {
        "name": "d",
        "primitiveType": "BYTE_ARRAY",
        "originalType": "UTF8",
        "path": [
          "c",
          "d"
        ],
        "repetitionType": "REQUIRED",
        "encoding": "PLAIN",
        "compression": "UNCOMPRESSED",
        "rLevelMax": 0,
        "dLevelMax": 0
      }
    }
  }
}
mlevkovsky commented 5 years ago

Same here for me. As a note, this error only happens when you read the whole row with cursor.next(). If you pass in the columns you want, this error doesn't happen.