quickwit-oss / bitpacking

SIMD algorithms for integer compression via bitpacking. This crate is a port of a C library called simdcomp.
MIT License
272 stars 30 forks source link

BitPacker4x incorrectly encodes bit width 32 sorted blocks #17

Closed danburkert closed 5 years ago

danburkert commented 5 years ago

Easiest way to explain this is through a repro:

#[test]
fn test_delta_bit_width_32() {
    use bitpacking::BitPacker4x;
    let values = vec![i32::max_value() as u32 + 1; BitPacker4x::BLOCK_LEN];
    let bit_packer = BitPacker4x::new();
    let bit_width = bit_packer.num_bits_sorted(0, &values);
    assert_eq!(bit_width, 32);

    let mut block = vec![0u8; BitPacker4x::compressed_block_size(bit_width)];
    bit_packer.compress_sorted(0, &values, &mut block, bit_width);

    let mut decoded_values = vec![0x10101010; BitPacker4x::BLOCK_LEN];
    bit_packer.decompress_sorted(0, &block, &mut decoded_values, bit_width);

    assert_eq!(values, decoded_values);
}

fails with:

thread 'page::bitpacked::tests::test_delta_bit_width_32' panicked at 'assertion failed: `(left == right)`
  left: `[2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648]`,
 right: `[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]`', data/src/page/bitpacked.rs:296:9
fulmicoton commented 5 years ago

Thanks for the great bug report! i was able to reproduce rapidly.

danburkert commented 5 years ago

There is a similar failure for non-sorted blocks when the block contains a small value:

#[test]
fn test_bit_width_32() {
    use bitpacking::BitPacker4x;
    let mut values = vec![i32::max_value() as u32 + 1; BitPacker4x::BLOCK_LEN];
    values[0] = 0;
    let bit_packer = BitPacker4x::new();
    let bit_width = bit_packer.num_bits(&values);
    assert_eq!(bit_width, 32);

    let mut block = vec![0u8; BitPacker4x::compressed_block_size(bit_width)];
    bit_packer.compress(&values, &mut block, bit_width);

    let mut decoded_values = vec![0x10101010; BitPacker4x::BLOCK_LEN];
    bit_packer.decompress(&block, &mut decoded_values, bit_width);

    assert_eq!(values, decoded_values);
}

fails with:

thread 'page::bitpacked::tests::test_bit_width_32' panicked at 'assertion failed: `(left == right)`
  left: `[0, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648]`,
 right: `[2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648, 2147483648]`', data/src/page/bitpacked.rs:308:9
fulmicoton commented 5 years ago

Pure curiosity, how did you discover that?

danburkert commented 5 years ago

Proptest

fulmicoton commented 5 years ago

Sweet! If you have time could you contribute your code?

danburkert commented 5 years ago

The tests that found these are over a bigger data structure which uses bitpacking as a building block, but it's not too difficult to create block specific tests:

proptest! {
    #[test]
    fn check_block(
        values in prop::collection::vec(u32::arbitrary(), BitPacker4x::BLOCK_LEN),
    ) {
        let bit_packer = BitPacker4x::new();
        let bit_width = bit_packer.num_bits(&*values);

        let mut block = vec![0u8; BitPacker4x::compressed_block_size(bit_width)];
        bit_packer.compress(&values, &mut block, bit_width);

        let mut decoded_values = vec![0x10101010; BitPacker4x::BLOCK_LEN];
        bit_packer.decompress(&block, &mut decoded_values, bit_width);

        prop_assert_eq!(values, decoded_values);
    }

    #[test]
    fn check_sorted_block(
        (mut values, init_value) in prop::collection::vec(u32::arbitrary(), BitPacker4x::BLOCK_LEN).prop_flat_map(|mut values| {
            values.sort();
            let min_value = values[0];
            (Just(values), (0..=min_value))
        }),
    ) {
        let bit_packer = BitPacker4x::new();
        let bit_width = bit_packer.num_bits_sorted(init_value, &*values);

        let mut block = vec![0u8; BitPacker4x::compressed_block_size(bit_width)];
        bit_packer.compress_sorted(init_value, &values, &mut block, bit_width);

        let mut decoded_values = vec![0x10101010; BitPacker4x::BLOCK_LEN];
        bit_packer.decompress_sorted(init_value, &block, &mut decoded_values, bit_width);

        prop_assert_eq!(values, decoded_values);
    }
}

The non-sorted check repros the second issue; I haven't gotten the sorted check to repro the original issue, I'm guessing because the sort space is too large.

danburkert commented 5 years ago

(feel free to use/adapt that for bitpacking)

fulmicoton commented 5 years ago

Thanks a lot!

fulmicoton commented 5 years ago

Fixed in 0.8.1. Yanked 0.8.

This bug does not affect tantivy as DocId are limited to 31 bits.

danburkert commented 5 years ago

Thanks for the quick turnaround!