jorgecarleitao / arrow2

Transmute-free Rust library to work with the Arrow format
Apache License 2.0
1.07k stars 221 forks source link

arrow2 cannot read ipc files compressed by official's arrow crate #1605

Open Jonarod opened 5 months ago

Jonarod commented 5 months ago

I am not quite sure why, but IPC files created by the official `arrow` crate with compression enabled cannot be read by `arrow2`.

Please see reproduction:

[dependencies]
arrow2 = { version = "0.18.0", features = ["io_ipc", "io_ipc_compression"]}
arrow-schema = "50.0.0"
arrow-array = "50.0.0"
arrow-ipc = { version = "50.0.0", features = ["lz4", "zstd"]}
use std::{
  sync::Arc,
  fs::File
};

/// Tries to read the first record batch of `filepath` with the `arrow-ipc`
/// file reader and prints whether that batch could be decoded.
///
/// Only the first batch is inspected; nothing is printed when the reader
/// yields no batch at all.
///
/// # Panics
///
/// Panics if the file cannot be opened or the reader cannot be constructed.
fn check_using_arrow(filepath: &str) {
  let handle = File::open(filepath).unwrap();
  let mut batches = arrow_ipc::reader::FileReader::try_new(handle, None).unwrap();

  match batches.next() {
    Some(Ok(_)) => println!("✅ {:?} Recognized by Arrow", filepath),
    Some(Err(e)) => println!("❌ {:?} Not recognized by Arrow: {:?}", filepath, e),
    // No batch in the file: stay silent.
    None => {}
  }
}

/// Tries to read the first chunk of `filepath` with the `arrow2` IPC file
/// reader and prints whether that chunk could be decoded.
///
/// The same file handle is used to read the metadata and then to read the
/// chunks, so the file is opened exactly once.
///
/// Only the first chunk is inspected; nothing is printed when the reader
/// yields no chunk at all.
///
/// # Panics
///
/// Panics if the file cannot be opened or its IPC metadata cannot be read.
fn check_using_arrow2(filepath: &str) {
  let mut file = File::open(filepath).unwrap();
  // `read_file_metadata` returns an owned value, so no clone is needed.
  let metadata = arrow2::io::ipc::read::read_file_metadata(&mut file).unwrap();
  let mut reader = arrow2::io::ipc::read::FileReader::new(file, metadata, None, None);

  match reader.next() {
    Some(Ok(_)) => println!("✅ {:?} Recognized by Arrow2", filepath),
    Some(Err(e)) => println!("❌ {:?} Not recognized by Arrow2: {:?}", filepath, e),
    // No chunk in the file: stay silent.
    None => {}
  }
}

/// Writes a one-row IPC file at path `file` using the `arrow-ipc` writer.
///
/// The file holds a single `Float64` column named `"col"` with the value
/// `1.0`. `compression` is forwarded to the writer options; `None` writes the
/// buffers uncompressed.
///
/// # Panics
///
/// Panics if the batch or the write options cannot be built, if the file
/// cannot be created, or if writing or finishing the file fails.
fn create_new_file_using_arrow(file: &str, compression: Option<arrow_ipc::CompressionType>) {
  let column = Arc::new(arrow_array::Float64Array::from(vec![1.0])) as arrow_array::ArrayRef;
  let batch = arrow_array::RecordBatch::try_from_iter(vec![("col", column)]).unwrap();

  let base_options =
    arrow_ipc::writer::IpcWriteOptions::try_new(8, false, arrow_ipc::MetadataVersion::V5).unwrap();
  let options = base_options.try_with_compression(compression).unwrap();

  let mut out = File::create(file).unwrap();
  let mut writer =
    arrow_ipc::writer::FileWriter::try_new_with_options(&mut out, &batch.schema(), options).unwrap();
  writer.write(&batch).unwrap();
  writer.finish().unwrap();
}

/// Writes a one-row IPC file at path `file` using the `arrow2` writer.
///
/// The file holds a single non-nullable `Float64` column named `"col"` with
/// the value `1.0`. `compression` is placed into the writer's `WriteOptions`.
///
/// # Panics
///
/// Panics if the file cannot be created, or if starting, writing, or
/// finishing the IPC stream fails.
fn create_new_file_using_arrow2(file: &str, compression: Option<arrow2::io::ipc::write::Compression>) {
  let schema = arrow2::datatypes::Schema::from(vec![arrow2::datatypes::Field::new(
    "col",
    arrow2::datatypes::DataType::Float64,
    false,
  )]);

  let values = arrow2::array::PrimitiveArray::from_vec(vec![1.0_f64]);
  let chunk = arrow2::chunk::Chunk::new(vec![Box::new(values) as Box<dyn arrow2::array::Array>]);

  let options = arrow2::io::ipc::write::WriteOptions { compression };

  let mut out = File::create(file).unwrap();
  let mut writer = arrow2::io::ipc::write::FileWriter::try_new(&mut out, schema, None, options).unwrap();
  writer.write(&chunk, None).unwrap();
  writer.finish().unwrap();
}

/// Runs the reproduction: for each compression setting, writes an IPC file
/// with the `arrow-ipc` writer, then tries to read it back with `arrow-ipc`
/// and with `arrow2`, printing the outcome of each attempt.
fn main() {
  // (output path, compression passed to the arrow-ipc writer)
  let cases = [
    ("./created_by_arrow_without_compression.ipc", None),
    ("./created_by_arrow_with_zstd.ipc", Some(arrow_ipc::CompressionType::ZSTD)),
    ("./created_by_arrow_with_lz4.ipc", Some(arrow_ipc::CompressionType::LZ4_FRAME)),
  ];

  for (path, compression) in cases {
    create_new_file_using_arrow(path, compression);
    check_using_arrow(path);
    check_using_arrow2(path);
  }
}

yields:

✅ "./created_by_arrow_without_compression.ipc" Recognized by Arrow
✅ "./created_by_arrow_without_compression.ipc" Recognized by Arrow2
✅ "./created_by_arrow_with_zstd.ipc" Recognized by Arrow
❌ "./created_by_arrow_with_zstd.ipc" Not recognized by Arrow2: Io(Custom { kind: Other, error: "Unknown frame descriptor" })
✅ "./created_by_arrow_with_lz4.ipc" Recognized by Arrow
❌ "./created_by_arrow_with_lz4.ipc" Not recognized by Arrow2: Io(Custom { kind: Other, error: LZ4Error("ERROR_frameType_unknown") })

As you can see, whenever an IPC file is written with compression by `arrow`, `arrow2` fails to read it, while the uncompressed file is read correctly.