diff --git a/Cargo.toml b/Cargo.toml
index 2af67e1..ecec249 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -69,6 +70,10 @@ required-features = ["datafusion"]
# Some issue when publishing and path isn't specified, so adding here
path = "./examples/datafusion_integration.rs"
+[[example]]
+name = "repro"
+required-features = ["cli"]
+
[[bin]]
name = "orc-metadata"
required-features = ["cli"]
diff --git a/src/reader/decode/rle_v2/patched_base.rs b/src/reader/decode/rle_v2/patched_base.rs
index c33815b..c149ea4 100644
--- a/src/reader/decode/rle_v2/patched_base.rs
+++ b/src/reader/decode/rle_v2/patched_base.rs
@@ -53,6 +53,7 @@ impl<N: NInt, R: Read> RleReaderV2<N, R> {
.fail();
}
if (patch_bit_width + value_bit_width) > (N::BYTE_SIZE * 8) {
+ eprintln!("patch_bit_width= {} value_bit_width= {} N::BYTE_SIZE= {}", patch_bit_width, value_bit_width, N::BYTE_SIZE);
return OutOfSpecSnafu {
msg: "combined patch width and value width cannot exceed the size of the integer type being decoded",
}
then run this code with ./revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc as parameter:
use std::fs::File;
use std::path::PathBuf;
use std::sync::Arc;
use anyhow::{Context, Result};
use arrow::datatypes::{DataType, Decimal128Type, DecimalType, Schema};
use orc_rust::arrow_reader::ArrowReaderBuilder;
use orc_rust::projection::ProjectionMask;
//use rayon::prelude::*;
/// Returns a new schema derived from `schema` in which every field whose data
/// type is `DataType::Timestamp` is re-typed as
/// `DataType::Decimal128(Decimal128Type::MAX_SCALE as _, 9)`.
///
/// Fields of any other type are cloned over unchanged, and field order is kept.
fn transform_schema(schema: &Schema) -> Arc<Schema> {
    let mut fields = Vec::with_capacity(schema.fields().len());
    for field_ref in schema.fields().iter() {
        // Borrow the underlying field rather than bumping the `Arc` refcount first.
        let field = &**field_ref;
        let rewritten = if matches!(field.data_type(), DataType::Timestamp(_, _)) {
            field
                .clone()
                .with_data_type(DataType::Decimal128(Decimal128Type::MAX_SCALE as _, 9))
        } else {
            field.clone()
        };
        fields.push(rewritten);
    }
    Arc::new(Schema::new(fields))
}
/// Reproduction driver: treats every command-line argument as the path of an
/// ORC file and reads it from start to finish.
///
/// For each file, a reader is built with a projection on the root named
/// `date`, a batch size of 1024, and the schema produced by
/// `transform_schema`. Every batch is decoded and then discarded.
///
/// # Errors
///
/// Stops at the first failure: opening the file, building the reader, or
/// decoding a chunk. Chunk failures carry the chunk index and file path as
/// context.
pub fn main() -> Result<()> {
    // Gather all arguments before touching any file, so argument decoding
    // happens up front rather than between files.
    let args: Vec<String> = std::env::args().skip(1).collect();

    for arg in args {
        let file_path = PathBuf::from(arg);
        println!("reading {}", file_path.display());

        let builder = ArrowReaderBuilder::try_new(File::open(&file_path)?)?;
        let projection = ProjectionMask::named_roots(
            builder.file_metadata().root_data_type(),
            ["date"].as_slice(),
        );
        let builder = builder.with_projection(projection).with_batch_size(1024);
        let schema = transform_schema(&builder.schema());
        let reader = builder.with_schema(schema).build();

        // Only the success or failure of each chunk matters; the decoded
        // batch itself is dropped immediately.
        reader.enumerate().try_for_each(|(i, chunk)| -> Result<()> {
            chunk.with_context(|| {
                format!("Could not read chunk {} of {}", i, file_path.display())
            })?;
            Ok(())
        })?;
    }

    Ok(())
}
which prints:
reading /srv/softwareheritage/ssd/data/vlorentz/datasets/2024-05-16/orc/revision/revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc
patch_bit_width= 40 value_bit_width= 30 N::BYTE_SIZE= 8
Error: Could not read chunk 29525 of /srv/softwareheritage/ssd/data/vlorentz/datasets/2024-05-16/orc/revision/revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc
Caused by:
0: External error: Out of spec, message: combined patch width and value width cannot exceed the size of the integer type being decoded
1: Out of spec, message: combined patch width and value width cannot exceed the size of the integer type being decoded
Stack backtrace:
0: anyhow::context::<impl anyhow::Context<T,E> for core::result::Result<T,E>>::with_context
1: repro::main
2: std::sys_common::backtrace::__rust_begin_short_backtrace
3: std::rt::lang_start::{{closure}}
4: core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once
at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ops/function.rs:284:13
5: std::panicking::try::do_call
at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:552:40
6: std::panicking::try
at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:516:19
7: std::panic::catch_unwind
at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panic.rs:142:14
8: std::rt::lang_start_internal::{{closure}}
at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/rt.rs:148:48
9: std::panicking::try::do_call
at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:552:40
10: std::panicking::try
at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panicking.rs:516:19
11: std::panic::catch_unwind
at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/panic.rs:142:14
12: std::rt::lang_start_internal
at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/std/src/rt.rs:148:20
13: main
14: __libc_start_main
at ./csu/../csu/libc-start.c:308:16
15: _start
This check:
https://github.com/datafusion-contrib/datafusion-orc/blob/16b57041e821bebeee205f09e684e616416db767/src/reader/decode/rle_v2/patched_base.rs#L55-L60
can fail while reading files created with pyorc.
To reproduce: (sorry, it's 4GB. I don't have a smaller example on hand)
Then check out 28e911bdf815bdcd8cc1225a4afc0627902400ef (a commit from https://github.com/datafusion-contrib/datafusion-orc/pull/96, because it's the only way not to hit an overflow crash before this bug), and apply this patch:
Then run this code with ./revision-0c45576a-59f7-48d1-a9a8-2e5c64098905.orc as parameter, which prints: