Parquet generated with complex structure is not readable by AWS Athena.
Create file with this code:
// Schema: an int "id" column plus an "animals" list of structs. Each animal's
// "characteristics" struct contains a second, nested "likes" list of structs.
var favouriteField = new StructField("favourite",
    new DataField<string>("favourite_name"));
var likeField = new StructField("like",
    new DataField<string>("like_name"),
    favouriteField);
var characteristicsField = new StructField("characteristics",
    new DataField<string>("overview"),
    new ListField("likes", likeField));
var animalField = new StructField("animal",
    new DataField<string>("animal_name"),
    characteristicsField);
var schema = new ParquetSchema(
    new DataField<int>("id"),
    new ListField("animals", animalField));

// Elements of the inner "likes" list.
var likes = new[]
{
    new Row("food", new Row("meat")),
    new Row("play", new Row("fetch")) // delete this line and the parquet is readable in Athena
};

// A single animal whose characteristics carry the two-element "likes" list.
var dog = new Row("Dog", new Row("4 legs, energetic", likes));

// One table row: id = 1 and an "animals" list containing only the dog.
var table = new Table(schema);
table.Add(1, new[] { dog });

// Remove any previous output before writing the new file.
File.Delete("file.parquet");
await table.WriteAsync("file.parquet");
Create Athena table and upload generated parquet to S3 location specified in CREATE TABLE statement
Running `SELECT * FROM greg_test` gives `HIVE_CURSOR_ERROR: Failed to read Parquet file`.
If I remove the play row from the nested list the data is readable in Athena.
If I read the definition and repetition levels, I can see that the repetition level is greater than the MaxRepetitionLevel for the fields like_name and favourite_name.
// Read back: write the same table to an in-memory stream, then print the
// definition and repetition levels of every data column in row group 0.
using var memoryStream = new MemoryStream();
await table.WriteAsync(memoryStream);

using ParquetReader reader = await ParquetReader.CreateAsync(memoryStream);
using ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0);

foreach (var field in reader.Schema.GetDataFields())
{
    Console.WriteLine($"Reading field {field.Name}...");
    Console.WriteLine($"MaxDefinitionLevel {field.MaxDefinitionLevel}");
    Console.WriteLine($"MaxRepetitionLevel {field.MaxRepetitionLevel}");

    var column = await rowGroupReader.ReadColumnAsync(field);

    // The header is always printed; the values only when the column has levels.
    Console.WriteLine("Column definition levels:");
    if (column.DefinitionLevels is { } definitionLevels)
    {
        foreach (var level in definitionLevels)
        {
            Console.WriteLine(level);
        }
    }

    Console.WriteLine("Column repetition levels:");
    if (column.RepetitionLevels is { } repetitionLevels)
    {
        foreach (var level in repetitionLevels)
        {
            Console.WriteLine(level);
        }
    }

    Console.WriteLine();
}
using Parquet;
using Parquet.Rows;
using Parquet.Schema;
namespace ParquetPackageWithListOfStructs
{
    /// <summary>
    /// Reproduction case: writes a table whose schema nests a list of structs
    /// inside another list of structs, then reads it back and prints the
    /// definition and repetition levels of every data column.
    /// </summary>
    public class ParquetBuilder
    {
        /// <summary>
        /// Writes the sample table to <c>file.parquet</c>, writes it again to an
        /// in-memory stream, and dumps the levels of each column in row group 0
        /// to the console.
        /// </summary>
        /// <returns>A task that completes once all columns have been printed.</returns>
        public async Task Build()
        {
            Table table = CreateTable(CreateSchema());

            // Remove any previous output before writing the new file.
            File.Delete("file.parquet");
            await table.WriteAsync("file.parquet");

            // read back
            using var memoryStream = new MemoryStream();
            await table.WriteAsync(memoryStream);

            using ParquetReader reader = await ParquetReader.CreateAsync(memoryStream);
            using ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0);

            foreach (var field in reader.Schema.GetDataFields())
            {
                Console.WriteLine($"Reading field {field.Name}...");
                Console.WriteLine($"MaxDefinitionLevel {field.MaxDefinitionLevel}");
                Console.WriteLine($"MaxRepetitionLevel {field.MaxRepetitionLevel}");

                var column = await rowGroupReader.ReadColumnAsync(field);

                // The header is always printed; the values only when the column has levels.
                Console.WriteLine("Column definition levels:");
                if (column.DefinitionLevels is { } definitionLevels)
                {
                    foreach (var level in definitionLevels)
                    {
                        Console.WriteLine(level);
                    }
                }

                Console.WriteLine("Column repetition levels:");
                if (column.RepetitionLevels is { } repetitionLevels)
                {
                    foreach (var level in repetitionLevels)
                    {
                        Console.WriteLine(level);
                    }
                }

                Console.WriteLine();
            }
        }

        /// <summary>
        /// Builds the schema: an int "id" column plus an "animals" list of structs,
        /// where each animal's "characteristics" struct holds a nested "likes"
        /// list of structs.
        /// </summary>
        private static ParquetSchema CreateSchema()
        {
            var favouriteField = new StructField("favourite",
                new DataField<string>("favourite_name"));
            var likeField = new StructField("like",
                new DataField<string>("like_name"),
                favouriteField);
            var characteristicsField = new StructField("characteristics",
                new DataField<string>("overview"),
                new ListField("likes", likeField));
            var animalField = new StructField("animal",
                new DataField<string>("animal_name"),
                characteristicsField);

            return new ParquetSchema(
                new DataField<int>("id"),
                new ListField("animals", animalField));
        }

        /// <summary>
        /// Creates a table with a single row: id 1 and one animal ("Dog") whose
        /// nested "likes" list has two elements.
        /// </summary>
        /// <param name="schema">Schema the table is created with.</param>
        private static Table CreateTable(ParquetSchema schema)
        {
            var likes = new[]
            {
                new Row("food", new Row("meat")),
                new Row("play", new Row("fetch")) // delete this line and the parquet is readable in Athena
            };
            var dog = new Row("Dog", new Row("4 legs, energetic", likes));

            var table = new Table(schema);
            table.Add(1, new[] { dog });
            return table;
        }
    }
}
Library Version
4.24.0
OS
Windows, Ubuntu Linux
OS Architecture
64 bit
How to reproduce?
Parquet generated with complex structure is not readable by AWS Athena.
Create file with this code:
Create Athena table and upload generated parquet to S3 location specified in CREATE TABLE statement
Running `SELECT * FROM greg_test` gives `HIVE_CURSOR_ERROR: Failed to read Parquet file`.
If I remove the `play` row from the nested list, the data is readable in Athena.
If I read the definition and repetition levels, I can see that the repetition level is greater than the MaxRepetitionLevel for the fields `like_name` and `favourite_name`.
Failing test