datafusion-contrib / datafusion-orc

Implementation of the Apache ORC file format using the Apache Arrow in-memory format
Apache License 2.0
28 stars 8 forks source link

Out of spec, message: combined patch width and value width cannot exceed the size of the integer type being decoded #97

Open progval opened 1 week ago

progval commented 1 week ago

this check:

https://github.com/datafusion-contrib/datafusion-orc/blob/16b57041e821bebeee205f09e684e616416db767/src/reader/decode/rle_v2/patched_base.rs#L55-L60

can fail while reading files created with pyorc. To reproduce:

wget https://softwareheritage.s3.amazonaws.com/graph/2024-05-16/orc/revision/revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc

(sorry, it's 4GB. I don't have a smaller example on hand)

then check out 28e911bdf815bdcd8cc1225a4afc0627902400ef (a commit from https://github.com/datafusion-contrib/datafusion-orc/pull/96, which is the only way to avoid hitting an overflow crash before reaching this bug), and apply this patch:

diff --git a/Cargo.toml b/Cargo.toml
index 2af67e1..ecec249 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -69,6 +70,10 @@ required-features = ["datafusion"]
 # Some issue when publishing and path isn't specified, so adding here
 path = "./examples/datafusion_integration.rs"

+[[example]]
+name = "repro"
+required-features = ["cli"]
+
 [[bin]]
 name = "orc-metadata"
 required-features = ["cli"]
diff --git a/src/reader/decode/rle_v2/patched_base.rs b/src/reader/decode/rle_v2/patched_base.rs
index c33815b..c149ea4 100644
--- a/src/reader/decode/rle_v2/patched_base.rs
+++ b/src/reader/decode/rle_v2/patched_base.rs
@@ -53,6 +53,7 @@ impl<N: NInt, R: Read> RleReaderV2<N, R> {
             .fail();
         }
         if (patch_bit_width + value_bit_width) > (N::BYTE_SIZE * 8) {
+            eprintln!("patch_bit_width= {} value_bit_width= {} N::BYTE_SIZE= {}", patch_bit_width, value_bit_width, N::BYTE_SIZE);
             return OutOfSpecSnafu {
                 msg: "combined patch width and value width cannot exceed the size of the integer type being decoded",
             }

then run this code with ./revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc as parameter:

use std::fs::File;
use std::path::PathBuf;
use std::sync::Arc;

use anyhow::{Context, Result};
use arrow::datatypes::{DataType, Decimal128Type, DecimalType, Schema};
use orc_rust::arrow_reader::ArrowReaderBuilder;
use orc_rust::projection::ProjectionMask;
//use rayon::prelude::*;

/// Returns a copy of `schema` in which every `Timestamp` field is re-typed as
/// `Decimal128` with the maximum supported precision and a scale of 9.
///
/// All other fields are copied unchanged, and field order is preserved.
///
/// NOTE(review): scale 9 presumably keeps nanosecond resolution when the
/// reader decodes timestamps as decimals — confirm against the reader's
/// timestamp-to-decimal conversion.
fn transform_schema(schema: &Schema) -> Arc<Schema> {
    let fields: Vec<_> = schema
        .fields()
        .iter()
        .map(|field| match field.data_type() {
            // `Decimal128` takes (precision, scale), so the first argument
            // must come from `MAX_PRECISION` (already a `u8`), not from
            // `MAX_SCALE` cast to `u8`.
            DataType::Timestamp(_, _) => field
                .as_ref()
                .clone()
                .with_data_type(DataType::Decimal128(Decimal128Type::MAX_PRECISION, 9)),
            _ => field.as_ref().clone(),
        })
        .collect();
    Arc::new(Schema::new(fields))
}

/// Reproduction driver: for each path given on the command line, opens the
/// ORC file, projects only the `date` root column, rewrites the schema with
/// `transform_schema`, and decodes every batch.
///
/// # Errors
///
/// Stops at the first failure: an I/O error opening a file, an error building
/// the reader, or a batch that fails to decode (reported with the chunk index
/// and the file path as context).
pub fn main() -> Result<()> {
    // Arguments are gathered up front, as in the original iterator chain,
    // before any file is opened.
    let args: Vec<String> = std::env::args().skip(1).collect();

    for arg in args {
        let file_path = PathBuf::from(arg);
        println!("reading {}", file_path.display());

        let file = File::open(&file_path)?;
        let builder = ArrowReaderBuilder::try_new(file)?;

        // Restrict decoding to the single root column named "date".
        let projection = ProjectionMask::named_roots(
            builder.file_metadata().root_data_type(),
            ["date"].as_slice(),
        );
        let builder = builder.with_projection(projection).with_batch_size(1024);

        let schema = transform_schema(&builder.schema());
        let reader = builder.with_schema(schema).build();

        for (i, chunk) in reader.enumerate() {
            // The decoded batch itself is not inspected; only decode errors matter.
            let _batch = chunk.with_context(|| {
                format!("Could not read chunk {} of {}", i, file_path.display())
            })?;
        }
    }

    Ok(())
}

which prints:

reading /srv/softwareheritage/ssd/data/vlorentz/datasets/2024-05-16/orc/revision/revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc
patch_bit_width= 40 value_bit_width= 30 N::BYTE_SIZE= 8
Error: Could not read chunk 29525 of /srv/softwareheritage/ssd/data/vlorentz/datasets/2024-05-16/orc/revision/revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc

Caused by:
    0: External error: Out of spec, message: combined patch width and value width cannot exceed the size of the integer type being decoded
    1: Out of spec, message: combined patch width and value width cannot exceed the size of the integer type being decoded

Stack backtrace:
   0: anyhow::context::<impl anyhow::Context<T,E> for core::result::Result<T,E>>::with_context
   1: repro::main
   2: std::sys_common::backtrace::__rust_begin_short_backtrace
   3: std::rt::lang_start::{{closure}}
   4: core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ops/function.rs:284:13
   5: std::panicking::try::do_call
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:552:40
   6: std::panicking::try
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:516:19
   7: std::panic::catch_unwind
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panic.rs:142:14
   8: std::rt::lang_start_internal::{{closure}}
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/rt.rs:148:48
   9: std::panicking::try::do_call
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:552:40
  10: std::panicking::try
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:516:19
  11: std::panic::catch_unwind
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panic.rs:142:14
  12: std::rt::lang_start_internal
             at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/rt.rs:148:20
  13: main
  14: __libc_start_main
             at ./csu/../csu/libc-start.c:308:16
  15: _start