And here it is: https://github.com/noxify/parquetjs_bug
I was able to get the same error via parquet-tools. I tried to document all steps to make it reproducible. If you need more information or something else, please let me know.
I spent a few minutes on a short debug session to check whether the `writeIndex` method is called with my current setup (to make sure it's not a user problem ;) ). For me, everything looks good; you can find the logging output below.
Update 1:
I added some `console.log` calls inside the `writeIndex` method, and `column.meta_data?.columnIndex` is filled. The output for each column follows.
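The logging looked roughly like this (a hypothetical sketch, not the actual parquetjs source; the stand-in values mirror the output below):

```js
// Hypothetical sketch of the debug logging added inside writeIndex
// (not the parquetjs source): print the column metadata next to the
// index built for it, so the two can be compared side by side.
function logWriteIndexInput(columnMeta, columnIndex) {
  console.log({ columnMeta, columnIndex });
}

// Example call with stand-in values shaped like the real output:
logWriteIndexInput(
  { type: 6, path_in_schema: ['change_id'], codec: 0 },
  { null_pages: null, min_values: [], max_values: [], boundary_order: null, null_counts: null }
);
```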
// change
{
columnMeta: {
type: 6,
encodings: [ 3, 0 ],
path_in_schema: [ 'change_id' ],
codec: 0,
num_values: { buffer: , offset: 0 },
total_uncompressed_size: { buffer: , offset: 0 },
total_compressed_size: { buffer: , offset: 0 },
key_value_metadata: null,
data_page_offset: { buffer: , offset: 0 },
index_page_offset: null,
dictionary_page_offset: null,
statistics: {
max: ,
min: ,
null_count: [Object],
distinct_count: [Object],
max_value: ,
min_value:
},
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: { page_locations: [Array] },
columnIndex: {
null_pages: null,
min_values: [Array],
max_values: [Array],
boundary_order: null,
null_counts: null
}
},
columnIndex: {
null_pages: null,
min_values: [ ],
max_values: [ ],
boundary_order: null,
null_counts: null
}
}
{
columnMeta: {
type: 6,
encodings: [ 3, 0 ],
path_in_schema: [ 'status' ],
codec: 0,
num_values: { buffer: , offset: 0 },
total_uncompressed_size: { buffer: , offset: 0 },
total_compressed_size: { buffer: , offset: 0 },
key_value_metadata: null,
data_page_offset: { buffer: , offset: 0 },
index_page_offset: null,
dictionary_page_offset: null,
statistics: {
max: ,
min: ,
null_count: [Object],
distinct_count: [Object],
max_value: ,
min_value:
},
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: { page_locations: [Array] },
columnIndex: {
null_pages: null,
min_values: [Array],
max_values: [Array],
boundary_order: null,
null_counts: null
}
},
columnIndex: {
null_pages: null,
min_values: [ ],
max_values: [ ],
boundary_order: null,
null_counts: null
}
}
{
columnMeta: {
type: 6,
encodings: [ 3, 0 ],
path_in_schema: [ 'approval_status' ],
codec: 0,
num_values: { buffer: , offset: 0 },
total_uncompressed_size: { buffer: , offset: 0 },
total_compressed_size: { buffer: , offset: 0 },
key_value_metadata: null,
data_page_offset: { buffer: , offset: 0 },
index_page_offset: null,
dictionary_page_offset: null,
statistics: {
max: ,
min: ,
null_count: [Object],
distinct_count: [Object],
max_value: ,
min_value:
},
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: { page_locations: [Array] },
columnIndex: {
null_pages: null,
min_values: [Array],
max_values: [Array],
boundary_order: null,
null_counts: null
}
},
columnIndex: {
null_pages: null,
min_values: [ ],
max_values: [ ],
boundary_order: null,
null_counts: null
}
}
// change_activity
{
columnMeta: {
type: 6,
encodings: [ 3, 0 ],
path_in_schema: [ 'change_id' ],
codec: 0,
num_values: { buffer: , offset: 0 },
total_uncompressed_size: { buffer: , offset: 0 },
total_compressed_size: { buffer: , offset: 0 },
key_value_metadata: null,
data_page_offset: { buffer: , offset: 0 },
index_page_offset: null,
dictionary_page_offset: null,
statistics: {
max: ,
min: ,
null_count: [Object],
distinct_count: [Object],
max_value: ,
min_value:
},
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: { page_locations: [Array] },
columnIndex: {
null_pages: null,
min_values: [Array],
max_values: [Array],
boundary_order: null,
null_counts: null
}
},
columnIndex: {
null_pages: null,
min_values: [ ],
max_values: [ ],
boundary_order: null,
null_counts: null
}
}
{
columnMeta: {
type: 6,
encodings: [ 3, 0 ],
path_in_schema: [ 'type' ],
codec: 0,
num_values: { buffer: , offset: 0 },
total_uncompressed_size: { buffer: , offset: 0 },
total_compressed_size: { buffer: , offset: 0 },
key_value_metadata: null,
data_page_offset: { buffer: , offset: 0 },
index_page_offset: null,
dictionary_page_offset: null,
statistics: {
max: ,
min: ,
null_count: [Object],
distinct_count: [Object],
max_value: ,
min_value:
},
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: { page_locations: [Array] },
columnIndex: {
null_pages: null,
min_values: [Array],
max_values: [Array],
boundary_order: null,
null_counts: null
}
},
columnIndex: {
null_pages: null,
min_values: [ ],
max_values: [ ],
boundary_order: null,
null_counts: null
}
}
{
columnMeta: {
type: 6,
encodings: [ 3, 0 ],
path_in_schema: [ 'description' ],
codec: 0,
num_values: { buffer: , offset: 0 },
total_uncompressed_size: { buffer: , offset: 0 },
total_compressed_size: { buffer: , offset: 0 },
key_value_metadata: null,
data_page_offset: { buffer: , offset: 0 },
index_page_offset: null,
dictionary_page_offset: null,
statistics: {
max: ,
min: ,
null_count: [Object],
distinct_count: [Object],
max_value: ,
min_value:
},
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: { page_locations: [Array] },
columnIndex: {
null_pages: null,
min_values: [Array],
max_values: [Array],
boundary_order: null,
null_counts: null
}
},
columnIndex: {
null_pages: null,
min_values: [ ],
max_values: [ ],
boundary_order: null,
null_counts: null
}
}
{
columnMeta: {
type: 6,
encodings: [ 3, 0 ],
path_in_schema: [ 'thenumber' ],
codec: 0,
num_values: { buffer: , offset: 0 },
total_uncompressed_size: { buffer: , offset: 0 },
total_compressed_size: { buffer: , offset: 0 },
key_value_metadata: null,
data_page_offset: { buffer: , offset: 0 },
index_page_offset: null,
dictionary_page_offset: null,
statistics: {
max: ,
min: ,
null_count: [Object],
distinct_count: [Object],
max_value: ,
min_value:
},
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: { page_locations: [Array] },
columnIndex: {
null_pages: null,
min_values: [Array],
max_values: [Array],
boundary_order: null,
null_counts: null
}
},
columnIndex: {
null_pages: null,
min_values: [ ],
max_values: [ ],
boundary_order: null,
null_counts: null
}
}
Update 2:
Here is the log for `column` just before the `writeSection` method is executed (inside the `columnIndex` condition):
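A sketch of where this log sits (hypothetical, not the actual parquetjs source; `logColumnBeforeIndexWrite` is an illustrative name):

```js
// Hypothetical sketch (not the parquetjs source): log the column just
// before its ColumnIndex would be serialized, inside the same
// columnIndex guard condition described above.
function logColumnBeforeIndexWrite(column) {
  if (column.meta_data?.columnIndex) {
    console.log({ column });
    // ...writeSection(...) would serialize the thrift ColumnIndex here
  }
}
```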
// change
{
column: {
file_path: null,
file_offset: { buffer: , offset: 0 },
meta_data: {
type: 6,
encodings: [Array],
path_in_schema: [Array],
codec: 0,
num_values: [Object],
total_uncompressed_size: [Object],
total_compressed_size: [Object],
key_value_metadata: null,
data_page_offset: [Object],
index_page_offset: null,
dictionary_page_offset: null,
statistics: [Object],
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: [Object]
},
offset_index_offset: null,
offset_index_length: null,
column_index_offset: { buffer: , offset: 0 },
column_index_length: 15,
crypto_metadata: null,
encrypted_column_metadata: null
}
}
{
column: {
file_path: null,
file_offset: { buffer: , offset: 0 },
meta_data: {
type: 6,
encodings: [Array],
path_in_schema: [Array],
codec: 0,
num_values: [Object],
total_uncompressed_size: [Object],
total_compressed_size: [Object],
key_value_metadata: null,
data_page_offset: [Object],
index_page_offset: null,
dictionary_page_offset: null,
statistics: [Object],
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: [Object]
},
offset_index_offset: null,
offset_index_length: null,
column_index_offset: { buffer: , offset: 0 },
column_index_length: 19,
crypto_metadata: null,
encrypted_column_metadata: null
}
}
{
column: {
file_path: null,
file_offset: { buffer: , offset: 0 },
meta_data: {
type: 6,
encodings: [Array],
path_in_schema: [Array],
codec: 0,
num_values: [Object],
total_uncompressed_size: [Object],
total_compressed_size: [Object],
key_value_metadata: null,
data_page_offset: [Object],
index_page_offset: null,
dictionary_page_offset: null,
statistics: [Object],
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: [Object]
},
offset_index_offset: null,
offset_index_length: null,
column_index_offset: { buffer: , offset: 0 },
column_index_length: 23,
crypto_metadata: null,
encrypted_column_metadata: null
}
}
//change_activity
{
column: {
file_path: null,
file_offset: { buffer: , offset: 0 },
meta_data: {
type: 6,
encodings: [Array],
path_in_schema: [Array],
codec: 0,
num_values: [Object],
total_uncompressed_size: [Object],
total_compressed_size: [Object],
key_value_metadata: null,
data_page_offset: [Object],
index_page_offset: null,
dictionary_page_offset: null,
statistics: [Object],
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: [Object]
},
offset_index_offset: null,
offset_index_length: null,
column_index_offset: { buffer: , offset: 0 },
column_index_length: 15,
crypto_metadata: null,
encrypted_column_metadata: null
}
}
{
column: {
file_path: null,
file_offset: { buffer: , offset: 0 },
meta_data: {
type: 6,
encodings: [Array],
path_in_schema: [Array],
codec: 0,
num_values: [Object],
total_uncompressed_size: [Object],
total_compressed_size: [Object],
key_value_metadata: null,
data_page_offset: [Object],
index_page_offset: null,
dictionary_page_offset: null,
statistics: [Object],
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: [Object]
},
offset_index_offset: null,
offset_index_length: null,
column_index_offset: { buffer: , offset: 0 },
column_index_length: 31,
crypto_metadata: null,
encrypted_column_metadata: null
}
}
{
column: {
file_path: null,
file_offset: { buffer: , offset: 0 },
meta_data: {
type: 6,
encodings: [Array],
path_in_schema: [Array],
codec: 0,
num_values: [Object],
total_uncompressed_size: [Object],
total_compressed_size: [Object],
key_value_metadata: null,
data_page_offset: [Object],
index_page_offset: null,
dictionary_page_offset: null,
statistics: [Object],
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: [Object]
},
offset_index_offset: null,
offset_index_length: null,
column_index_offset: { buffer: , offset: 0 },
column_index_length: 77,
crypto_metadata: null,
encrypted_column_metadata: null
}
}
{
column: {
file_path: null,
file_offset: { buffer: , offset: 0 },
meta_data: {
type: 6,
encodings: [Array],
path_in_schema: [Array],
codec: 0,
num_values: [Object],
total_uncompressed_size: [Object],
total_compressed_size: [Object],
key_value_metadata: null,
data_page_offset: [Object],
index_page_offset: null,
dictionary_page_offset: null,
statistics: [Object],
encoding_stats: null,
bloom_filter_offset: null,
offsetIndex: [Object]
},
offset_index_offset: null,
offset_index_length: null,
column_index_offset: { buffer: , offset: 0 },
column_index_length: 27,
crypto_metadata: null,
encrypted_column_metadata: null
}
}
@wilwade If you want, we can have a call to debug it - let me know, or just ping me on Discord (nickname: noxy88). Then you don't have to set up everything yourself.
@noxify Thanks. I'll try to poke at it a bit and will likely reach out to you later this week.
@noxify
A few quick questions to make sure I am understanding everything correctly. Which of these is the error path? I think it is 2, but want to triple check.
There is a difference in the file structure: if you use `df_change.coalesce(1).write.parquet...`, it should generate a single file that is much closer to the parquetjs one.
parquet-tools: I'm having issues getting it running easily - what did you use?
Index setting: if you set the `pageIndex` option flag, does it all work?
```js
const writer = await parquet.ParquetWriter.openFile(
  schema,
  '../generated_files/parquetjs/change.parquet',
  { pageIndex: true }
)
```
Hi @wilwade,
Yeah, option 2 is the one which produces the error.
I can provide a single parquet file generated by pyspark if it helps.
I didn't know about the `pageIndex` option - I will test it tomorrow.
Regarding parquet-tools: I just installed it via `pip install parquet-tools`, as described here: https://github.com/ktrueda/parquet-tools
Will send you an update tomorrow with the results :)
Couldn't sleep without testing it 🙈
I updated the mentioned repo with the latest changes. I added `.repartition(1)` and generated two new parquet files (via pyspark).
Here is the column-index result:
parquet-tools column-index generated_files/pyspark/change_single.parquet/part-00000-dbad2470-9e44-4595-b142-7b72aab249f0-c000.snappy.parquet
row group 0:
column index for column change_id:
Boudary order: ASCENDING
null count min max
page-0 0 C-01 C-02
offset index for column change_id:
offset compressed size first row index
page-0 4 47 0
column index for column status:
Boudary order: ASCENDING
null count min max
page-0 0 closed closed
offset index for column status:
offset compressed size first row index
page-0 82 33 0
column index for column approval_status:
Boudary order: ASCENDING
null count min max
page-0 0 approved approved
offset index for column approval_status:
offset compressed size first row index
page-0 148 33 0
The result for the parquet file generated by parquetjs with `{ pageIndex: true }` is the following:
parquet-tools column-index generated_files/parquetjs/change_pageIndex.parquet
row group 0:
column index for column change_id:
java.io.IOException: can not read class org.apache.parquet.format.ColumnIndex: Required field 'null_pages' was not present! Struct: ColumnIndex(null_pages:null, min_values:[43 2D 30 31], max_values:[43 2D 30 32], boundary_order:null)
I have checked the different meta information (via parquet-tools):
// parquetjs generated - without pageIndex option
parquet-tools meta generated_files/parquetjs/change_activity.parquet
file: generated_files/parquetjs/change_activity.parquet
creator: @dsnp/parquetjs
file schema: root
--------------------------------------------------------------------------------
change_id: OPTIONAL BINARY L:STRING R:0 D:1
type: OPTIONAL BINARY L:STRING R:0 D:1
description: OPTIONAL BINARY L:STRING R:0 D:1
thenumber: OPTIONAL BINARY L:STRING R:0 D:1
row group 1: RC:2 TS:976 OFFSET:4
--------------------------------------------------------------------------------
change_id: BINARY UNCOMPRESSED DO:0 FPO:4 SZ:70/70/1,00 VC:2 ENC:PLAIN,RLE ST:[min: C-01, max: C-02, num_nulls: 0]
type: BINARY UNCOMPRESSED DO:0 FPO:135 SZ:118/118/1,00 VC:2 ENC:PLAIN,RLE ST:[min: Phase Change, max: Phase Change, num_nulls: 0]
description: BINARY UNCOMPRESSED DO:0 FPO:342 SZ:258/258/1,00 VC:2 ENC:PLAIN,RLE ST:[min: "Closed" to "Verification", max: "Plan and Schedule" to "Authorization (CAB)", num_nulls: 0]
thenumber: BINARY UNCOMPRESSED DO:0 FPO:788 SZ:106/106/1,00 VC:2 ENC:PLAIN,RLE ST:[min: activity_1, max: activity_2, num_nulls: 0]
// parquetjs generated - with pageIndex option
parquet-tools meta generated_files/parquetjs/change_activity_pageIndex.parquet
file: generated_files/parquetjs/change_activity_pageIndex.parquet
creator: @dsnp/parquetjs
file schema: root
--------------------------------------------------------------------------------
change_id: OPTIONAL BINARY L:STRING R:0 D:1
type: OPTIONAL BINARY L:STRING R:0 D:1
description: OPTIONAL BINARY L:STRING R:0 D:1
thenumber: OPTIONAL BINARY L:STRING R:0 D:1
row group 1: RC:2 TS:976 OFFSET:4
--------------------------------------------------------------------------------
change_id: BINARY UNCOMPRESSED DO:0 FPO:4 SZ:70/70/1,00 VC:2 ENC:PLAIN,RLE ST:[min: C-01, max: C-02, num_nulls: 0]
type: BINARY UNCOMPRESSED DO:0 FPO:135 SZ:118/118/1,00 VC:2 ENC:PLAIN,RLE ST:[min: Phase Change, max: Phase Change, num_nulls: 0]
description: BINARY UNCOMPRESSED DO:0 FPO:342 SZ:258/258/1,00 VC:2 ENC:PLAIN,RLE ST:[min: "Closed" to "Verification", max: "Plan and Schedule" to "Authorization (CAB)", num_nulls: 0]
thenumber: BINARY UNCOMPRESSED DO:0 FPO:788 SZ:106/106/1,00 VC:2 ENC:PLAIN,RLE ST:[min: activity_1, max: activity_2, num_nulls: 0]
// pyspark generated - with repartition(1)
parquet-tools meta generated_files/pyspark/change_activity_single.parquet/part-00000-9084972a-feef-45ba-be0b-0930443430dd-c000.snappy.parquet
file: generated_files/pyspark/change_activity_single.parquet/part-00000-9084972a-feef-45ba-be0b-0930443430dd-c000.snappy.parquet
creator: parquet-mr version 1.12.3 (build f8dced182c4c1fbdec6ccb3185537b5a01e6ed6b)
extra: org.apache.spark.version = 3.4.1
extra: org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"change_id","type":"string","nullable":true,"metadata":{}},{"name":"type","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"thenumber","type":"string","nullable":true,"metadata":{}}]}
file schema: spark_schema
--------------------------------------------------------------------------------
change_id: OPTIONAL BINARY L:STRING R:0 D:1
type: OPTIONAL BINARY L:STRING R:0 D:1
description: OPTIONAL BINARY L:STRING R:0 D:1
thenumber: OPTIONAL BINARY L:STRING R:0 D:1
row group 1: RC:2 TS:276 OFFSET:4
--------------------------------------------------------------------------------
change_id: BINARY SNAPPY DO:0 FPO:4 SZ:46/44/0,96 VC:2 ENC:PLAIN,RLE,BIT_PACKED ST:[min: C-01, max: C-02, num_nulls: 0]
type: BINARY SNAPPY DO:50 FPO:87 SZ:70/66/0,94 VC:2 ENC:RLE,PLAIN_DICTIONARY,BIT_PACKED ST:[min: Phase Change, max: Phase Change, num_nulls: 0]
description: BINARY SNAPPY DO:0 FPO:120 SZ:107/109/1,02 VC:2 ENC:PLAIN,RLE,BIT_PACKED ST:[min: "Closed" to "Verification", max: "Plan and Schedule" to "Authorization (CAB)", num_nulls: 0]
thenumber: BINARY SNAPPY DO:0 FPO:227 SZ:59/57/0,97 VC:2 ENC:PLAIN,RLE,BIT_PACKED ST:[min: activity_1, max: activity_2, num_nulls: 0]
I noticed that compression was enabled, so I disabled it / changed it to uncompressed:
// pyspark generated - with repartition(1)
// and conf.set("spark.sql.parquet.compression.codec", "uncompressed")
parquet-tools meta generated_files/pyspark/change_activity_single.parquet/part-00000-30597158-78bc-4eed-8cb3-768878c5d098-c000.parquet
file: generated_files/pyspark/change_activity_single.parquet/part-00000-30597158-78bc-4eed-8cb3-768878c5d098-c000.parquet
creator: parquet-mr version 1.12.3 (build f8dced182c4c1fbdec6ccb3185537b5a01e6ed6b)
extra: org.apache.spark.version = 3.4.1
extra: org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"change_id","type":"string","nullable":true,"metadata":{}},{"name":"type","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"thenumber","type":"string","nullable":true,"metadata":{}}]}
file schema: spark_schema
--------------------------------------------------------------------------------
change_id: OPTIONAL BINARY L:STRING R:0 D:1
type: OPTIONAL BINARY L:STRING R:0 D:1
description: OPTIONAL BINARY L:STRING R:0 D:1
thenumber: OPTIONAL BINARY L:STRING R:0 D:1
row group 1: RC:2 TS:277 OFFSET:4
--------------------------------------------------------------------------------
change_id: BINARY UNCOMPRESSED DO:0 FPO:4 SZ:45/45/1,00 VC:2 ENC:PLAIN,RLE,BIT_PACKED ST:[min: C-01, max: C-02, num_nulls: 0]
type: BINARY UNCOMPRESSED DO:49 FPO:84 SZ:66/66/1,00 VC:2 ENC:RLE,PLAIN_DICTIONARY,BIT_PACKED ST:[min: Phase Change, max: Phase Change, num_nulls: 0]
description: BINARY UNCOMPRESSED DO:0 FPO:115 SZ:109/109/1,00 VC:2 ENC:PLAIN,RLE,BIT_PACKED ST:[min: "Closed" to "Verification", max: "Plan and Schedule" to "Authorization (CAB)", num_nulls: 0]
thenumber: BINARY UNCOMPRESSED DO:0 FPO:224 SZ:57/57/1,00 VC:2 ENC:PLAIN,RLE,BIT_PACKED ST:[min: activity_1, max: activity_2, num_nulls: 0]
Let me know if you need more information or something else :)
Ok. I have the error reproducing locally now!
Note to future self: on macOS I needed `brew install parquet-cli` and then `parquet column-index [file]` to use column-index, as `pip install parquet-tools` didn't offer column-index (or a bunch of other things) for some reason. brew was easier than directly building https://github.com/apache/parquet-mr/blob/master/parquet-cli/README.md
Ok. So turns out the issue was fairly small. Here's the PR if you want to test it out: https://github.com/LibertyDSNP/parquetjs/pull/94
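For context on the error itself: the exception complains about `null_pages`, which the parquet-format thrift definition marks as required on `ColumnIndex` (along with `min_values`, `max_values`, and `boundary_order`; `null_counts` is optional). A spec-level illustration of a fully populated index for the `change_id` page above - not the code from the PR:

```js
// What a ColumnIndex looks like with all required fields present, per
// the parquet-format spec; the values mirror the change_id page from
// the pyspark column-index output earlier in this thread.
const columnIndex = {
  null_pages: [false],               // required: one flag per data page
  min_values: [Buffer.from('C-01')], // required
  max_values: [Buffer.from('C-02')], // required
  boundary_order: 1,                 // required: BoundaryOrder.ASCENDING
  null_counts: [0],                  // optional
};
console.log(columnIndex);
```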
Hi,
I currently have a problem with the generated parquet file: under (as yet) unknown circumstances, I get the following error:
I already analyzed it a bit and found a difference in the schema. To check the schema, I used pqrs (https://github.com/manojkarthick/pqrs). In the parquet file generated via parquetjs, the metadata information is empty. In the parquet file generated via pyspark, the metadata is filled:
Here is the snippet which I use to generate the parquet file:
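(The original snippet is not reproduced here; below is a minimal, hypothetical reconstruction based on the column names and values from the logs in this thread.)

```js
// Hypothetical reconstruction of a writer script like the one described;
// schema fields and row values are taken from the logs above.
const parquet = require('@dsnp/parquetjs');

async function writeChange() {
  const schema = new parquet.ParquetSchema({
    change_id: { type: 'UTF8', optional: true },
    status: { type: 'UTF8', optional: true },
    approval_status: { type: 'UTF8', optional: true },
  });

  const writer = await parquet.ParquetWriter.openFile(
    schema,
    'generated_files/parquetjs/change.parquet'
  );
  await writer.appendRow({ change_id: 'C-01', status: 'closed', approval_status: 'approved' });
  await writer.appendRow({ change_id: 'C-02', status: 'closed', approval_status: 'approved' });
  await writer.close();
}

writeChange().catch(console.error);
```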
Steps to reproduce
Currently I haven't found the exact code which produces the error. Running it directly in a Jupyter Notebook doesn't trigger the error. It could be somewhere in the calculations/joins which we call in the original script, but I have to analyze this.
Expected behaviour
The error isn't shown 🙈
Any other comments?
Tbh, I'm not sure if the missing metadata is the root cause of this issue.
In the next few days I will try to provide some example files with the relevant Python code to trigger the error, but I have to finish my work first to make sure our customer is happy :)
Update 1:
The current workaround is a notebook which reads the parquet file and saves it under a new name:
This solves the issue for now - but it's not really what I want :D
Update 2:
While trying to find someone else with the same issue, I found the following: https://repost.aws/questions/QUSdc0Pgo9RtSoHOSBwTi8PQ/hive-cannot-open-split-can-not-read-class-org-apache-parquet-format-columnindex