facebook / rocksdb

A library that provides an embeddable, persistent key-value store for fast storage.
http://rocksdb.org
GNU General Public License v2.0
28.59k stars 6.32k forks source link

rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash returns incorrect value with timestamp #12100

Open peroket opened 11 months ago

peroket commented 11 months ago

If using timestamp and rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash, the returned value in Get() is not the latest value.

Expected behavior

Enabling the index should not affect the returned value.

Actual behavior

The returned value is not the most recent one.

Steps to reproduce the behavior

Tested with rocksdb 8.8.1 and catch2.

// Taken from rocksdb
rocksdb::Slice StripTimestampFromUserKey(const rocksdb::Slice& user_key, const size_t ts_sz) noexcept {
    rocksdb::Slice ret = user_key;
    ret.remove_suffix(ts_sz);
    return ret;
}

// Taken from rocksdb
rocksdb::Slice ExtractTimestampFromUserKey(const rocksdb::Slice& user_key, const size_t ts_sz) noexcept {
    assert(user_key.size() >= ts_sz);
    return {user_key.data() + user_key.size() - ts_sz, ts_sz};
}

// Taken from rocksdb
void EncodeFixed32(char* buf, uint32_t value) noexcept {
    buf[0] = static_cast<char>(value & 0xff);
    buf[1] = static_cast<char>((value >> 8) & 0xff);
    buf[2] = static_cast<char>((value >> 16) & 0xff);
    buf[3] = static_cast<char>((value >> 24) & 0xff);
}

// Taken from rocksdb
uint32_t DecodeFixed32(const char* ptr) noexcept {
    return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0]))) |
            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8) |
            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16) |
            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
}

// Taken from rocksdb https://github.com/facebook/rocksdb/blob/main/util/comparator.cc#L237
class ComparatorWithTimestamp : public rocksdb::Comparator {
  public:
    explicit ComparatorWithTimestamp() noexcept : Comparator(/*ts_sz=*/sizeof(uint32_t)) {
        assert(cmp_without_ts_->timestamp_size() == 0);
    }

    // The comparator that compares the user key without timestamp part is treated
    // as the root comparator.
    [[nodiscard]] const Comparator* GetRootComparator() const override { return cmp_without_ts_; }

    void FindShortSuccessor(std::string* /*key*/) const override {}
    void FindShortestSeparator(std::string* /*start*/, const rocksdb::Slice& /*limit*/) const override {}
    [[nodiscard]] int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const override {
        const int ret = CompareWithoutTimestamp(a, b);
        const size_t ts_sz = timestamp_size();
        if (ret != 0) {
            return ret;
        }
        // Compare timestamp.
        // For the same user key with different timestamps, larger (newer) timestamp
        // comes first.
        return -CompareTimestamp(ExtractTimestampFromUserKey(a, ts_sz), ExtractTimestampFromUserKey(b, ts_sz));
    }
    using Comparator::CompareWithoutTimestamp;
    [[nodiscard]] int CompareWithoutTimestamp(const rocksdb::Slice& a, bool a_has_ts, const rocksdb::Slice& b,
                                              bool b_has_ts) const override {
        const size_t ts_sz = timestamp_size();
        assert(!a_has_ts || a.size() >= ts_sz);
        assert(!b_has_ts || b.size() >= ts_sz);
        const rocksdb::Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, ts_sz) : a;
        const rocksdb::Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, ts_sz) : b;
        return cmp_without_ts_->Compare(lhs, rhs);
    }
    [[nodiscard]] int CompareTimestamp(const rocksdb::Slice& ts1, const rocksdb::Slice& ts2) const override {
        assert(ts1.size() == sizeof(uint32_t));
        assert(ts2.size() == sizeof(uint32_t));
        const auto lhs = DecodeFixed32(ts1.data());
        const auto rhs = DecodeFixed32(ts2.data());
        if (lhs < rhs) {
            return -1;
        } else if (lhs > rhs) {
            return 1;
        } else {
            return 0;
        }
    }
    [[nodiscard]] bool CanKeysWithDifferentByteContentsBeEqual() const override { return false; }

    [[nodiscard]] const char* Name() const override { return "name"; }

  private:
    const rocksdb::Comparator* cmp_without_ts_ = rocksdb::BytewiseComparator();
};

TEST_CASE("Timestamp bug") {
    const auto* path = ...; // set your path

    rocksdb::DB* db_ptr{};
    rocksdb::Options options;
    ComparatorWithTimestamp cmp;
    options.create_if_missing = true;
    options.comparator = &cmp;

    rocksdb::BlockBasedTableOptions table_options;
    // Comment this line to make the test pass
    table_options.data_block_index_type = rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
    options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));

    REQUIRE(rocksdb::DB::Open(options, path, &db_ptr).ok());
    const std::unique_ptr<rocksdb::DB> db(db_ptr);

    constexpr size_t count = 10'000;
    std::string timestamp(4, '\0');
    std::string key(32, '\0');

    for (uint32_t i = 0; i < count; ++i) {
        EncodeFixed32(timestamp.data(), i);
        db->Put(rocksdb::WriteOptions(), key, timestamp, std::to_string(i));
    }
    db->Flush(rocksdb::FlushOptions());

    std::string result;
    rocksdb::ReadOptions read_options;
    const rocksdb::Slice slice(timestamp);
    EncodeFixed32(timestamp.data(), count);
    read_options.timestamp = &slice;
    const auto status = db->Get(read_options, key, &result);
    REQUIRE(status.ok());
    REQUIRE(result == std::to_string(count - 1)); // This fails and return 9823 instead of 9999

}
jowlyzhang commented 11 months ago

Thank you for reporting this issue. This feature combination: user-defined timestamp and kDataBlockBinaryAndHash is indeed not compatible. We internally disable this feature combination by making the Comparator:: CanKeysWithDifferentByteContentsBeEqual() return true.

dorlevi commented 5 months ago

@jowlyzhang : Do you have any idea when/if this will get prioritized?

jowlyzhang commented 5 months ago

This work is not currently planned.

brennan913 commented 5 days ago

I'm interesting in working on this! As a first step the plan would be something like the following:

The idea is to ensure correct/expected results, although without getting the CPU/throughput improvements from the hash index.

From there, I'd like to experiment with strategies for using/modifying the hash index in a timestamp-aware way to recoup at least some of these benefits in the presence of user-defined timestamps.

jowlyzhang commented 1 day ago

@brennan913 thank you for offering to add improvements! Just want to provide a bit more context in order to dive in what specific areas we want to improve.

We are using this API as the source of truth for whether data block hash index is supported: https://github.com/facebook/rocksdb/blob/24045549a61d755418d5e9e6b208b2bfd77093b8/include/rocksdb/comparator.h#L111-L115

Data block's hash index is supported if the comparator defines this function to return false, and specify table_options.data_block_index_type to be kDataBlockBinaryAndHash. This logic is here: https://github.com/facebook/rocksdb/blob/24045549a61d755418d5e9e6b208b2bfd77093b8/table/block_based/block_based_table_builder.cc#L467-L470

RocksDB's built in timestamp aware comparator doesn't support data block hash index, so its implementation of CanKeysWithDifferentByteContentsBeEqual explicitly return true. So for the built in comparator, we don't need to do more things to make sure if timestamp is provided, bypass the hash index even if it's enabled.

If you mean to implement this for other customized comparators, it's better to continue to use the same API.

As for "if no timestamp is provided, check the hash index if it's enabled", no specific things need to be done here. If hash index is used to build a data block, it's encoded into the block and will be used when reading the block. For example, see code snippet here: https://github.com/facebook/rocksdb/blob/24045549a61d755418d5e9e6b208b2bfd77093b8/table/block_based/block.cc#L1083-L1094