When using arrow::json::TableReader::Make with an explicit schema in which one of the fields is a struct containing a REQUIRED (non-nullable) field, the nullability is not propagated to the parsed column type. See the test below:
int main() {
auto schema = arrow::schema({
arrow::field("nullable_field", arrow::int32(), true),
arrow::field("non_nullable_field", arrow::int32(), false),
arrow::field("struct_1", arrow::struct_({
arrow::field("non_nullable_struct_field", arrow::int32(), false) //**This nullability is ignored
}))
});
std::string json_data = R"(
{"nullable_field": 1, "non_nullable_field": 2, "struct_1": {"non_nullable_struct_field": 7}}
)";
auto buffer = std::make_shared<arrow::Buffer>(json_data);
arrow::json::ParseOptions parse_options = arrow::json::ParseOptions::Defaults();
arrow::json::ReadOptions read_options = arrow::json::ReadOptions::Defaults();
read_options.use_threads = false;
read_options.block_size = json_data.size();
parse_options.explicit_schema = schema;
auto input = std::make_shared<arrow::io::BufferReader>(buffer);
auto json_reader_result = arrow::json::TableReader::Make(arrow::default_memory_pool(), input, read_options, parse_options);
if (!json_reader_result.ok()) {
throw std::runtime_error("Failed to create JSON TableReader: " + json_reader_result.status().ToString());
}
auto json_reader = *json_reader_result;
auto read_table_result = json_reader->Read();
if (!read_table_result.ok()) {
throw std::runtime_error("Failed to read table from JSON: " + read_table_result.status().ToString());
}
std::shared_ptr<arrow::Table> table = *read_table_result;
auto typed_table = arrow::Table::Make(schema, table->columns());
std::shared_ptr<arrow::io::BufferOutputStream> out_stream;
PARQUET_ASSIGN_OR_THROW(out_stream, arrow::io::BufferOutputStream::Create());
auto [properties, arrow_properties] = CreateProperties();
// The fail is in this function it calls "RETURN_NOT_OK(table.Validate());" and fails due to diff in schemas: the "columns" schema is missing the "Nullability" of the inner struct field (uses OPTIONAL as default) but the explicit schema saves it as REQUIRED
PARQUET_THROW_NOT_OK(
parquet::arrow::WriteTable(*typed_table, arrow::default_memory_pool(), out_stream, std::size_t(parquet::DEFAULT_MAX_ROW_GROUP_LENGTH), properties,
arrow_properties)
);
return 0;
}
It fails with the following error:
terminate called after throwing an instance of 'parquet::ParquetStatusException'
what(): Invalid: Column data for field 2 with type struct is inconsistent with schema struct
Aborted
When using arrow::json::TableReader::Make with an explicit schema in which one of the fields is a struct containing a REQUIRED (non-nullable) field, the nullability is not propagated to the parsed column type. See the test:
It fails with the following error: terminate called after throwing an instance of 'parquet::ParquetStatusException' what(): Invalid: Column data for field 2 with type struct is inconsistent with schema struct
Aborted
Component(s)
C++