jorgecarleitao / arrow2

Transmute-free Rust library to work with the Arrow format
Apache License 2.0
1.07k stars 223 forks source link

Possible bug in mmap of complex types #1453

Closed andy-thomason closed 1 year ago

andy-thomason commented 1 year ago

I'm using columns with a schema of List(Struct(Dictionary(...))) to represent indexed database tables with compact keys.

I'm getting the error:

Error: OutOfSpec("buffer's length is too small in mmap")

This happens when I use the mmap chunk reader; the same schema works fine with the regular reader.

I've managed to cut my failing example down to the following code, but I'll concede that I may have cut too far!

The code below contains both a failing path (mmap, the `if` branch) and a working path (regular `FileReader`, the `else` branch).

use arrow2::{datatypes::{Schema, DataType, Field, IntegerType}, array::{DictionaryArray, FixedSizeBinaryArray, Int32Array, ListArray, StructArray}, chunk::Chunk, offset::OffsetsBuffer};

type BDE = Box<dyn std::error::Error + Send + Sync + 'static>;

/// Reproduction driver: writes one chunk containing an empty
/// `List(Struct(Dictionary(Int32, FixedSizeBinary(20))))` column to an in-memory
/// IPC file, then reads the first chunk back either through the mmap API or
/// through the regular `FileReader`.
fn main() -> Result<(), BDE> {
    // `true` exercises the mmap path (reported as failing); set to `false` to
    // exercise the regular `FileReader` path (reported as working).
    let use_mmap = true;

    let mut writer = arrow2::io::ipc::write::FileWriter::try_new(
        Vec::new(),
        schema(),
        None,
        Default::default(),
    )?;

    // A dictionary array with zero keys and zero values.
    let empty_keys = Int32Array::new_empty(DataType::Int32);
    let empty_values = FixedSizeBinaryArray::new_empty(DataType::FixedSizeBinary(20));
    let address = DictionaryArray::<i32>::try_from_keys(empty_keys, empty_values.boxed())?;
    let struct_columns = vec![address.boxed()];

    // One list slot (offsets `[0, 0]`) that is empty, wrapping the empty struct.
    let list_type = DataType::List(Box::new(Field::new(
        "list",
        DataType::Struct(accounts_fields()),
        false,
    )));
    let offsets = OffsetsBuffer::<i32>::try_from(vec![0, 0])?;
    let accounts = StructArray::try_new(
        DataType::Struct(accounts_fields()),
        struct_columns,
        None,
    )?;
    let list = ListArray::try_new(list_type, offsets, accounts.boxed(), None)?;
    let chunk = Chunk::try_new(vec![list.boxed()])?;

    writer.write(&chunk, None)?;
    writer.finish()?;
    let file_bytes = writer.into_inner();

    // Both read paths need the file metadata first.
    let mut cursor = std::io::Cursor::new(&file_bytes);
    let metadata = arrow2::io::ipc::read::read_file_metadata(&mut cursor)?;

    if use_mmap {
        let shared = std::sync::Arc::new(file_bytes);

        // SAFETY: `shared` holds the IPC file produced by `FileWriter` above.
        // NOTE(review): presumably that satisfies the validity requirements of
        // the unchecked mmap API -- confirm against the arrow2 documentation.
        let dictionaries = unsafe {
            arrow2::mmap::mmap_dictionaries_unchecked(&metadata, std::sync::Arc::clone(&shared))?
        };

        // SAFETY: same buffer and same assumption as for the dictionaries above.
        let _fails =
            unsafe { arrow2::mmap::mmap_unchecked(&metadata, &dictionaries, shared, 0)? };
    } else {
        let mut file_reader =
            arrow2::io::ipc::read::FileReader::new(cursor, metadata, None, None);
        let _works = file_reader.next().ok_or("no chunks")??;
    }

    Ok(())
}

fn schema() -> Schema {
    use DataType::*;
    Schema::from(vec![
        Field::new(
            "accounts",
            List(Box::new(Field::new(
                "list",
                Struct(accounts_fields()),
                false,
            ))),
            false,
        ),
    ])
}

fn accounts_fields() -> Vec<Field> {
    use DataType::*;
    use IntegerType::Int32;
    vec![
        // Address: 32 bits
        Field::new(
            "address",
            Dictionary(Int32, Box::new(FixedSizeBinary(20)), false),
            false,
        ),
    ]
}
andy-thomason commented 1 year ago

Happy to submit a PR if I get there first.

andy-thomason commented 1 year ago

The problem here may be that FixedSizeBinary with zero elements fails the round trip.