datafusion-contrib / orc-rust

Rust implementation of Apache ORC
Apache License 2.0
11 stars 5 forks source link

Incorrect Timestamp decoding into Decimal128 since v0.4.0 #14

Open progval opened 4 weeks ago

progval commented 4 weeks ago

repro code (add it to tests/basic/main.rs):

fn integration_path(path: &str) -> String {
    let dir = env!("CARGO_MANIFEST_DIR");
    format!("{}/tests/integration/data/{}", dir, path)
}

#[test]
pub fn decimal128_timestamps_1900_test() {
    let path = integration_path("TestOrcFile.testDate1900.orc");
    let f = File::open(path).expect("no file found");
    let mut reader = ArrowReaderBuilder::try_new(f)
        .unwrap()
        .with_batch_size(10) // it's a big file, we don't want to test more than that
        .with_schema(Arc::new(Schema::new(vec![
            Field::new(
                "time",
                DataType::Decimal128(38, 9),
                true,
            ),
            Field::new(
                "date",
                DataType::Date32,
                true,
            ),
        ])))
        .build();
    let batch = reader.next().unwrap().unwrap();
    println!("{:?}", batch.column(0));
    let expected = [
        "+-----------------------+------------+",
        "| time                  | date       |",
        "+-----------------------+------------+",
        "| -2198229903.900000000 | 1900-12-25 |",
        "| -2198229903.899900000 | 1900-12-25 |",
        "| -2198229903.899800000 | 1900-12-25 |",
        "| -2198229903.899700000 | 1900-12-25 |",
        "| -2198229903.899600000 | 1900-12-25 |",
        "| -2198229903.899500000 | 1900-12-25 |",
        "| -2198229903.899400000 | 1900-12-25 |",
        "| -2198229903.899300000 | 1900-12-25 |",
        "| -2198229903.899200000 | 1900-12-25 |",
        "| -2198229903.899100000 | 1900-12-25 |",
        "+-----------------------+------------+",
    ];
    assert_batches_eq(&[batch], &expected);
}

errors with:

expected:

[
    "+-----------------------+------------+",
    "| time                  | date       |",
    "+-----------------------+------------+",
    "| -2198229903.900000000 | 1900-12-25 |",
    "| -2198229903.899900000 | 1900-12-25 |",
    "| -2198229903.899800000 | 1900-12-25 |",
    "| -2198229903.899700000 | 1900-12-25 |",
    "| -2198229903.899600000 | 1900-12-25 |",
    "| -2198229903.899500000 | 1900-12-25 |",
    "| -2198229903.899400000 | 1900-12-25 |",
    "| -2198229903.899300000 | 1900-12-25 |",
    "| -2198229903.899200000 | 1900-12-25 |",
    "| -2198229903.899100000 | 1900-12-25 |",
    "+-----------------------+------------+",
]
actual:

[
    "+-----------------------+------------+",
    "| time                  | date       |",
    "+-----------------------+------------+",
    "| 107616705.213693951   | 1900-12-25 |",
    "| -2198229899.000000000 | 1900-12-25 |",
    "| -2198229902.999499000 | 1900-12-25 |",
    "| -2198229898.990000000 | 1900-12-25 |",
    "| -2198229902.999498000 | 1900-12-25 |",
    "| -2198229898.980000000 | 1900-12-25 |",
    "| -2198229902.999497000 | 1900-12-25 |",
    "| -2198229898.970000000 | 1900-12-25 |",
    "| -2198229902.999496000 | 1900-12-25 |",
    "| -2198229898.960000000 | 1900-12-25 |",
    "+-----------------------+------------+",
]

by replacing .with_batch_size(10) with .with_batch_size(20), we even get an overflow crash:

[src/reader/decode/timestamp.rs:67:5] &base = 1420099200
[src/reader/decode/timestamp.rs:67:5] &seconds_since_orc_base = Ok(
    -3618300303,
)
[src/reader/decode/timestamp.rs:67:5] &nanoseconds = Ok(
    -407,
)
thread 'decimal128_timestamps_1900_test' panicked at src/reader/decode/timestamp.rs:76:9:
attempt to multiply with overflow

Both are caused by https://github.com/datafusion-contrib/datafusion-orc/commit/9dd640c8e93644e73fa39c78ea88e92b4166f2e5

cc @Jefffrey

Xuanwo commented 4 weeks ago

Should this issue be moved to orc-rust?

progval commented 4 weeks ago

Yes