Open Jonarod opened 10 months ago
I am not quite sure why, but IPC files created by arrow are not readable by arrow2.
arrow
arrow2
Please see reproduction:
[dependencies]
arrow2 = { version = "0.18.0", features = ["io_ipc", "io_ipc_compression"] }
arrow-schema = "50.0.0"
arrow-array = "50.0.0"
arrow-ipc = { version = "50.0.0", features = ["lz4", "zstd"] }
use std::{ sync::Arc, fs::File }; fn check_using_arrow(filepath: &str) { let file = File::open(&filepath).unwrap(); let mut reader = arrow_ipc::reader::FileReader::try_new(&file, None).unwrap(); if let Some(maybe_batch) = reader.next() { match maybe_batch { Ok(_) => { println!("✅ {:?} Recognized by Arrow", filepath); }, Err(e) => { println!("❌ {:?} Not recognized by Arrow: {:?}", filepath, e); } } } } fn check_using_arrow2(filepath: &str) { let file = File::open(&filepath).unwrap(); let metadata = arrow2::io::ipc::read::read_file_metadata(&mut File::open(filepath).unwrap()).unwrap().clone(); let mut reader = arrow2::io::ipc::read::FileReader::new(&file, metadata, None, None); if let Some(maybe_chunk) = reader.next() { match maybe_chunk { Ok(_) => { println!("✅ {:?} Recognized by Arrow2", filepath); }, Err(e) => { println!("❌ {:?} Not recognized by Arrow2: {:?}", filepath, e); } } } } fn create_new_file_using_arrow(file: &str, compression: Option<arrow_ipc::CompressionType>) { let batch = arrow_array::RecordBatch::try_from_iter(vec![ ("col", Arc::new(arrow_array::Float64Array::from(vec![1.0])) as arrow_array::ArrayRef), ]).unwrap(); let mut file = File::create(&file).unwrap(); let options = arrow_ipc::writer::IpcWriteOptions::try_new(8, false, arrow_ipc::MetadataVersion::V5) .unwrap() .try_with_compression(compression) .unwrap(); { let mut writer = arrow_ipc::writer::FileWriter::try_new_with_options(&mut file, &batch.schema(), options).unwrap(); writer.write(&batch).unwrap(); writer.finish().unwrap(); } } fn create_new_file_using_arrow2(file: &str, compression: Option<arrow2::io::ipc::write::Compression>) { let fields = vec![ arrow2::datatypes::Field::new(String::from("col"), arrow2::datatypes::DataType::Float64, false), ]; let schema = arrow2::datatypes::Schema::from(fields); let mut file = File::create(&file).unwrap(); let options = arrow2::io::ipc::write::WriteOptions { compression }; let col = arrow2::array::PrimitiveArray::from_vec(vec![1.0]); let mut writer = 
arrow2::io::ipc::write::FileWriter::try_new(&mut file, schema, None, options).unwrap(); writer.write(&arrow2::chunk::Chunk::new(vec![Box::new(col)]), None).unwrap(); writer.finish().unwrap(); } fn main(){ let filepath_arrow_no_compress = "./created_by_arrow_without_compression.ipc"; create_new_file_using_arrow(filepath_arrow_no_compress, None); check_using_arrow(filepath_arrow_no_compress); check_using_arrow2(filepath_arrow_no_compress); let filepath_arrow_zstd = "./created_by_arrow_with_zstd.ipc"; create_new_file_using_arrow(filepath_arrow_zstd, Some(arrow_ipc::CompressionType::ZSTD)); check_using_arrow(filepath_arrow_zstd); check_using_arrow2(filepath_arrow_zstd); let filepath_arrow_lz4 = "./created_by_arrow_with_lz4.ipc"; create_new_file_using_arrow(filepath_arrow_lz4, Some(arrow_ipc::CompressionType::LZ4_FRAME)); check_using_arrow(filepath_arrow_lz4); check_using_arrow2(filepath_arrow_lz4); }
yields:
✅ "./created_by_arrow_without_compression.ipc" Recognized by Arrow
✅ "./created_by_arrow_without_compression.ipc" Recognized by Arrow2
✅ "./created_by_arrow_with_zstd.ipc" Recognized by Arrow
❌ "./created_by_arrow_with_zstd.ipc" Not recognized by Arrow2: Io(Custom { kind: Other, error: "Unknown frame descriptor" })
✅ "./created_by_arrow_with_lz4.ipc" Recognized by Arrow
❌ "./created_by_arrow_with_lz4.ipc" Not recognized by Arrow2: Io(Custom { kind: Other, error: LZ4Error("ERROR_frameType_unknown") })
As you can see, whenever IPC files are compressed using Arrow, arrow2 does not recognize them.
I am not quite sure why, but IPC files created by arrow are not readable by arrow2.
Please see reproduction:
yields:
As you can see, whenever IPC files are compressed using Arrow, arrow2 does not recognize them.