datafusion-contrib / datafusion-orc

Implementation of the Apache ORC file format using the Apache Arrow in-memory format
Apache License 2.0
28 stars 8 forks source link

`unreachable!()` in rle_v2_decode_bit_width is reachable #88

Closed progval closed 1 month ago

progval commented 1 month ago

The `unreachable!()` statement in this function:

https://github.com/datafusion-contrib/datafusion-orc/blob/1ee1df9c7ca4cf198e4a1a8fea0af3f31e2d17cd/src/reader/decode/util.rs#L225-L236

can be reached while reading files created with pyorc. To reproduce:

wget https://softwareheritage.s3.amazonaws.com/graph/2023-09-06/orc/release/release-00d4739e-c723-4843-863e-e4a895c58005.orc

then run this code with ./release-00d4739e-c723-4843-863e-e4a895c58005.orc as parameter:

use std::fs::File;
use std::path::PathBuf;

use anyhow::Result;
use orc_rust::arrow_reader::ArrowReaderBuilder;
use orc_rust::projection::ProjectionMask;

pub fn main() -> Result<()> {
    let file_path = PathBuf::from(std::env::args().skip(1).next().unwrap());
    println!("reading {}", file_path.display());
    let file = File::open(&file_path)?;
    let reader_builder = ArrowReaderBuilder::try_new(file)?;
    let projection = ProjectionMask::named_roots(
        reader_builder.file_metadata().root_data_type(),
        ["date"].as_slice(),
    );
    let reader = reader_builder
        .with_projection(projection)
        .with_batch_size(10)
        .build();
    for (i, _) in reader.enumerate() {
        println!("chunk {}", i);
    }

    Ok(())
}

and this small patch to datafusion-orc:

diff --git a/src/reader/decode/util.rs b/src/reader/decode/util.rs
index 468bd7d..2d5bad6 100644
--- a/src/reader/decode/util.rs
+++ b/src/reader/decode/util.rs
@@ -231,7 +231,7 @@ pub fn rle_v2_decode_bit_width(encoded: u8) -> usize {
         29 => 48,
         30 => 56,
         31 => 64,
-        _ => unreachable!(),
+        _ => unreachable!("rle_v2_decode_bit_width({})", encoded),
     }
 }

which prints:

[...]
chunk 1715
chunk 1716
chunk 1717
chunk 1718
chunk 1719
chunk 1720
chunk 1721
chunk 1722
chunk 1723
chunk 1724
thread 'main' panicked at /home/vlorentz/datafusion-orc/src/reader/decode/util.rs:234:14:
internal error: entered unreachable code: rle_v2_decode_bit_width(26)
stack backtrace:
   0: rust_begin_unwind
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:645:5
   1: core::panicking::panic_fmt
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/panicking.rs:72:14
   2: orc_rust::reader::decode::util::rle_v2_decode_bit_width
             at /home/vlorentz/datafusion-orc/src/reader/decode/util.rs:234:14
   3: orc_rust::reader::decode::rle_v2::patched_base::<impl orc_rust::reader::decode::rle_v2::RleReaderV2<N,R>>::read_patched_base
             at /home/vlorentz/datafusion-orc/src/reader/decode/rle_v2/patched_base.rs:32:31
   4: orc_rust::reader::decode::rle_v2::RleReaderV2<N,R>::decode_batch
             at /home/vlorentz/datafusion-orc/src/reader/decode/rle_v2/mod.rs:38:42
   5: <orc_rust::reader::decode::rle_v2::RleReaderV2<N,R> as core::iter::traits::iterator::Iterator>::next
             at /home/vlorentz/datafusion-orc/src/reader/decode/rle_v2/mod.rs:53:19
   6: <alloc::boxed::Box<I,A> as core::iter::traits::iterator::Iterator>::next
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/boxed.rs:1949:9
   7: <&mut I as core::iter::traits::iterator::Iterator>::next
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/iter/traits/iterator.rs:4169:9
   8: <core::iter::adapters::zip::Zip<A,B> as core::iter::adapters::zip::ZipImpl<A,B>>::next
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/iter/adapters/zip.rs:166:21
   9: <core::iter::adapters::zip::Zip<A,B> as core::iter::traits::iterator::Iterator>::next
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/iter/adapters/zip.rs:85:9
  10: <orc_rust::arrow_reader::column::timestamp::TimestampIterator as core::iter::traits::iterator::Iterator>::next
             at /home/vlorentz/datafusion-orc/src/arrow_reader/column/timestamp.rs:31:13
  11: <alloc::boxed::Box<I,A> as core::iter::traits::iterator::Iterator>::next
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/boxed.rs:1949:9
  12: orc_rust::arrow_reader::decoder::PrimitiveArrayDecoder<T>::next_primitive_batch
             at /home/vlorentz/datafusion-orc/src/arrow_reader/decoder/mod.rs:65:35
  13: <orc_rust::arrow_reader::decoder::timestamp::TimestampOffsetArrayDecoder as orc_rust::arrow_reader::decoder::ArrayBatchDecoder>::next_batch
             at /home/vlorentz/datafusion-orc/src/arrow_reader/decoder/timestamp.rs:109:21
  14: orc_rust::arrow_reader::decoder::NaiveStripeDecoder::inner_decode_next_batch
             at /home/vlorentz/datafusion-orc/src/arrow_reader/decoder/mod.rs:408:25
  15: orc_rust::arrow_reader::decoder::NaiveStripeDecoder::decode_next_batch
             at /home/vlorentz/datafusion-orc/src/arrow_reader/decoder/mod.rs:420:22
  16: <orc_rust::arrow_reader::decoder::NaiveStripeDecoder as core::iter::traits::iterator::Iterator>::next
             at /home/vlorentz/datafusion-orc/src/arrow_reader/decoder/mod.rs:288:26
  17: <alloc::boxed::Box<I,A> as core::iter::traits::iterator::Iterator>::next
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/boxed.rs:1949:9
  18: <orc_rust::arrow_reader::ArrowReader<R> as core::iter::traits::iterator::Iterator>::next
             at /home/vlorentz/datafusion-orc/src/arrow_reader/mod.rs:159:23
  19: <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::next
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/iter/adapters/enumerate.rs:47:17
  20: repro::main
             at ./rust/src/bin/repro.rs:22:19
  21: core::ops::function::FnOnce::call_once
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ops/function.rs:250:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
Jefffrey commented 1 month ago

Thanks for finding this, I will take a look; should be a quick fix 🙏

Jefffrey commented 1 month ago

Resolved by https://github.com/datafusion-contrib/datafusion-orc/commit/49318525356a926f57447f2a95c64d228015f8aa