jorgecarleitao / arrow2

Transmute-free Rust library to work with the Arrow format
Apache License 2.0
1.07k stars 221 forks source link

`infer_records_schema` results in incorrect `Schema` when input json is in non-`Chunk` form #1578

Open mnpw opened 9 months ago

mnpw commented 9 months ago

Issue

The `arrow2::io::json::read::infer_records_schema` function reads only the first element of the JSON array and infers the `Schema` from it. This produces an incorrect `Schema` when not every element of the JSON array contains every field. In essence, `infer_records_schema` assumes the input JSON already has the structure of an `arrow2::chunk::Chunk` (every record having the same set of fields).

Potential Solution

Rather than reading just the first element, `infer_records_schema` should read all the elements and coerce their schemas into a single `Schema`. This is how the `infer` function creates the `DataType` (see this).

Issue Example

// json
let json = [ 
  {a: 0, c: "hello"},
  {a: 1, b: false, c: "hello"},
  {a: 2, c: "world", d: 3.14}
]

// schema
let schema = io::json::read::infer_records_schema(&json).unwrap();
println!("{schema:#?}");
// ⚠️ Schema has fields corresponding to only the first value in Json array
//
// Schema {
//     fields: [
//         Field {
//             name: "a",
//             data_type: Int64,
//             is_nullable: true,
//             metadata: {},
//         },
//         Field {
//             name: "c",
//             data_type: Utf8,
//             is_nullable: true,
//             metadata: {},
//         },
//     ],
//     metadata: {},
// }

// datatype
let data_type = io::json::read::infer(&json).unwrap();
println!("{data_type:#?}");
// ⚠️ DataType has fields corresponding to all values in Json array
//
// List(
//     Field {
//         name: "item",
//         data_type: Struct(
//             [
//                 Field {
//                     name: "a",
//                     data_type: Int64,
//                     is_nullable: true,
//                     metadata: {},
//                 },
//                 Field {
//                     name: "c",
//                     data_type: Utf8,
//                     is_nullable: true,
//                     metadata: {},
//                 },
//                 Field {
//                     name: "b",
//                     data_type: Boolean,
//                     is_nullable: true,
//                     metadata: {},
//                 },
//                 Field {
//                     name: "d",
//                     data_type: Float64,
//                     is_nullable: true,
//                     metadata: {},
//                 },
//             ],
//         ),
//         is_nullable: true,
//         metadata: {},
//     },
// )