apache / parquet-java

Apache Parquet Java
https://parquet.apache.org/
Apache License 2.0
2.49k stars 1.37k forks source link

PARQUET-2468: ParquetMetadata must convert to json #1349

Closed RustedBones closed 1 month ago

RustedBones commented 1 month ago

ParquetMetadata JSON serialization failed on UNENCRYPTED files.

Add required configuration to the metadata ObjectMapper:

Also add a custom serializer for Statistics.

{
  "fileMetaData" : {
    "schema" : {
      "name" : "test",
      "repetition" : "REPEATED",
      "logicalTypeAnnotation" : null,
      "id" : null,
      "fields" : [ {
        "name" : "some_null_field",
        "repetition" : "OPTIONAL",
        "logicalTypeAnnotation" : null,
        "id" : null,
        "primitive" : true,
        "primitiveTypeName" : "BINARY",
        "typeLength" : 0,
        "decimalMetadata" : null,
        "originalType" : null
      } ],
      "columns" : [ {
        "path" : [ "some_null_field" ],
        "type" : "BINARY",
        "primitiveType" : {
          "name" : "some_null_field",
          "repetition" : "OPTIONAL",
          "logicalTypeAnnotation" : null,
          "id" : null,
          "primitive" : true,
          "primitiveTypeName" : "BINARY",
          "typeLength" : 0,
          "decimalMetadata" : null,
          "originalType" : null
        },
        "maxRepetitionLevel" : 0,
        "maxDefinitionLevel" : 1,
        "typeLength" : 0
      } ],
      "paths" : [ [ "some_null_field" ] ],
      "fieldCount" : 1,
      "primitive" : false,
      "originalType" : null
    },
    "keyValueMetaData" : { },
    "createdBy" : null,
    "fileDecryptor" : null,
    "encryptionType" : "UNENCRYPTED"
  },
  "blocks" : [ {
    "columns" : [ {
      "rowGroupOrdinal" : -1,
      "encodingStats" : null,
      "columnIndexReference" : null,
      "offsetIndexReference" : null,
      "bloomFilterOffset" : -1,
      "bloomFilterLength" : -1,
      "dictionaryPageOffset" : 0,
      "valueCount" : 0,
      "totalSize" : 0,
      "totalUncompressedSize" : 0,
      "statistics" : {
        "min" : "\u0000",
        "max" : "\u0000\u0000",
        "null_count" : 0
      },
      "sizeStatistics" : null,
      "firstDataPageOffset" : 0,
      "primitiveType" : {
        "name" : "fake_type",
        "repetition" : "OPTIONAL",
        "logicalTypeAnnotation" : null,
        "id" : null,
        "primitive" : true,
        "primitiveTypeName" : "BINARY",
        "typeLength" : 0,
        "decimalMetadata" : null,
        "originalType" : null
      },
      "startingPos" : 0,
      "codec" : "GZIP",
      "encodings" : [ ],
      "encrypted" : false,
      "type" : "BINARY",
      "path" : [ "foo" ]
    } ],
    "rowCount" : 0,
    "totalByteSize" : 0,
    "path" : null,
    "ordinal" : 0,
    "rowIndexOffset" : -1,
    "compressedSize" : 0,
    "startingPos" : 0
  } ]
}
wgtmac commented 1 month ago

Thanks for the reply! @Fokko

It seems that we need a minor release for this. +1 on my side.