apache / arrow

Apache Arrow is a multi-language toolbox for accelerated data interchange and in-memory processing
https://arrow.apache.org/
Apache License 2.0
14.36k stars 3.49k forks source link

[c++] arrow::json::TableReader::Make ignores nullable=False in explicit_schema parse_options #43115

Open amassalha opened 3 months ago

amassalha commented 3 months ago

When using arrow::json::TableReader::Make with an explicit schema in which one of the fields is a struct containing a REQUIRED (non-nullable) field, the nullability is not propagated to the parsed column type. See the test below:


#include <arrow/api.h>
#include <arrow/io/api.h>
#include <arrow/ipc/api.h>
#include <arrow/json/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include "parquet/schema.h"
#include <fstream>
#include <cstdint>
#include <string>
#include <iostream>
#include <memory>
#include <vector>

std::tuple<std::shared_ptr<parquet::WriterProperties>, std::shared_ptr<parquet::ArrowWriterProperties>>
  CreateProperties() {
  parquet::WriterProperties::Builder properties_builder;
  properties_builder.max_row_group_length(std::size_t(parquet::DEFAULT_MAX_ROW_GROUP_LENGTH));
  properties_builder.data_pagesize(std::size_t(parquet::kDefaultDataPageSize));
  properties_builder.write_batch_size(std::size_t(parquet::DEFAULT_WRITE_BATCH_SIZE));
  auto properties = properties_builder.build();
  parquet::ArrowWriterProperties::Builder arrow_properties_builder;
  auto arrow_properties = arrow_properties_builder.build();
  return std::make_tuple(properties, arrow_properties);
}
// Reproduction: parse one JSON row with an explicit schema, re-wrap the parsed
// columns under that same schema, and write the result to an in-memory Parquet
// stream. Failures are reported by throwing; nothing is caught here.
int main() {
    // The struct column's only child is declared non-nullable.
    // **This nullability is ignored by the JSON reader (the subject of this report).
    auto struct_child = arrow::field("non_nullable_struct_field", arrow::int32(), false);
    auto schema = arrow::schema({
        arrow::field("nullable_field", arrow::int32(), true),
        arrow::field("non_nullable_field", arrow::int32(), false),
        arrow::field("struct_1", arrow::struct_({struct_child}))
    });

    // Single-record JSON input matching the schema above.
    std::string json_data = R"(
        {"nullable_field": 1, "non_nullable_field": 2, "struct_1": {"non_nullable_struct_field": 7}}
    )";

    // Reader configuration: single-threaded, one block spanning the whole
    // input, and the explicit schema supplied through the parse options.
    arrow::json::ReadOptions read_options = arrow::json::ReadOptions::Defaults();
    read_options.use_threads = false;
    read_options.block_size = json_data.size();

    arrow::json::ParseOptions parse_options = arrow::json::ParseOptions::Defaults();
    parse_options.explicit_schema = schema;

    auto buffer = std::make_shared<arrow::Buffer>(json_data);
    auto input = std::make_shared<arrow::io::BufferReader>(buffer);

    auto maybe_reader = arrow::json::TableReader::Make(arrow::default_memory_pool(), input, read_options, parse_options);
    if (!maybe_reader.ok()) {
        throw std::runtime_error("Failed to create JSON TableReader: " + maybe_reader.status().ToString());
    }
    auto reader = *maybe_reader;

    auto maybe_table = reader->Read();
    if (!maybe_table.ok()) {
        throw std::runtime_error("Failed to read table from JSON: " + maybe_table.status().ToString());
    }
    std::shared_ptr<arrow::Table> table = *maybe_table;

    // Pair the parsed columns with the explicit schema.
    auto typed_table = arrow::Table::Make(schema, table->columns());

    std::shared_ptr<arrow::io::BufferOutputStream> out_stream;
    PARQUET_ASSIGN_OR_THROW(out_stream, arrow::io::BufferOutputStream::Create());
    auto [writer_props, arrow_writer_props] = CreateProperties();

    // Reported failure point: WriteTable calls "RETURN_NOT_OK(table.Validate());",
    // which fails because the schemas differ. The parsed columns lack the
    // nullability of the inner struct field (OPTIONAL is used as the default),
    // while the explicit schema records that field as REQUIRED.
    PARQUET_THROW_NOT_OK(
        parquet::arrow::WriteTable(*typed_table, arrow::default_memory_pool(), out_stream,
                                   std::size_t(parquet::DEFAULT_MAX_ROW_GROUP_LENGTH), writer_props,
                                   arrow_writer_props)
    );

    return 0;
}

It fails with: terminate called after throwing an instance of 'parquet::ParquetStatusException' what(): Invalid: Column data for field 2 with type struct is inconsistent with schema struct Aborted

Component(s)

C++

amassalha commented 3 months ago

related to https://github.com/apache/arrow/issues/31957