gvinciguerra / PGM-index

🏅State-of-the-art learned data structure that enables fast lookup, predecessor, range searches and updates in arrays of billions of items using orders of magnitude less space than traditional indexes
https://pgm.di.unipi.it/
Apache License 2.0
775 stars 91 forks source link

Multiple-level Dynamic PGM-index #28

Closed authwork closed 3 years ago

authwork commented 3 years ago

@gvinciguerra I have done a test among unordered_map, ALEX, and pgm_index. The result is very strange

It costs 932159
Hash table size is 175731056
It costs 8694683
alex size is 1575900636
Killed # pgm test for too large memory consumption

Is there any suggestion for designing Multiple-level Dynamic PGM-index? Many thanks in advance. If you would like to have a try, please use

g++ -std=c++17 -fopenmp -I./PGM-index/include/ -I./ALEX/src/ bench.cpp -march=native -Wall -Wextra
#include <iostream>
#include <sys/random.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <time.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include <unordered_map>
#include <sys/time.h>
#include "pgm/pgm_index.hpp"
#include "pgm/pgm_index_dynamic.hpp"
#include "pgm/pgm_index_variants.hpp"
#include "core/alex.h"

struct KKK {
    uint64_t a;
    uint32_t b, c, d;
};

struct VVV{
    uint64_t a, b, c;
};

typedef pgm::DynamicPGMIndex<uint32_t, VVV> u32P;
typedef pgm::DynamicPGMIndex<uint64_t, u32P*> u64PGM2;
typedef pgm::DynamicPGMIndex<uint64_t, u64PGM2*> u64PGM1;

void pgm_test(std::vector<KKK> key, std::vector<VVV> value){
    u64PGM1 table;
    int number = key.size();
    struct timeval start, end;
    gettimeofday(&start, NULL);

    for (int i = 0; i < number; i++) {
        uint64_t k1 = key[i].a;
        uint64_t k2 = ((uint64_t)key[i].b) << 32 + (uint64_t)key[i].c;
        uint32_t k3 = key[i].d;
        auto iter = table.find(k1);
        if(iter == table.end()){
            table.insert_or_assign(k1, new u64PGM2());
            iter = table.find(k1);
        }
        auto p2 = iter->second;
        auto iter1 = p2->find(k2);
        if(iter1 == p2->end()){
            p2->insert_or_assign(k2, new u32P());
            iter1 = p2->find(k2);
        }
        iter1->second->insert_or_assign(k3, value[i]);
    }
    for (int i = 0; i < number; i++) {
        uint64_t k1 = key[i].a;
        uint64_t k2 = ((uint64_t)key[i].b) << 32 + (uint64_t)key[i].c;
        uint32_t k3 = key[i].d;
        auto iter = table.find(k1)->second->find(k2)->second->find(k3);
        VVV b = iter->second;
        if (b.a != value[i].a || b.b != value[i].b
                || b.c != value[i].c) {
            printf("failed\n");
            return;
        }
    }
    gettimeofday(&end, NULL);
    int time_len = 1000000 * (end.tv_sec - start.tv_sec)
            + (end.tv_usec - start.tv_usec);
    std::cout << "It costs " << time_len << std::endl;
    uint64_t size = table.size_in_bytes();

    for (auto &e : table) {
        size += e.second->size_in_bytes();
        for(auto &e1 : *(e.second)){
            size += e1.second->size_in_bytes();
        }
    }
    printf("pgm size is %lld\n", size);
}

typedef alex::Alex<uint32_t, VVV> u32A;
typedef alex::Alex<uint64_t, u32A*> u64P2;
typedef alex::Alex<uint64_t, u64P2*> u64P1;

void alex_test(std::vector<KKK> key,
        std::vector<VVV> value) {
    u64P1 table;
    int number = key.size();
    struct timeval start, end;
    gettimeofday(&start, NULL);
    for (int i = 0; i < number; i++) {
        uint64_t k1 = key[i].a;
        uint64_t k2 = ((uint64_t)key[i].b) << 32 + (uint64_t)key[i].c;
        uint32_t k3 = key[i].d;
        auto iter = table.find(k1);
        if (iter == table.end()) {
            table.insert(k1, new u64P2());
            iter = table.find(k1);
        }
        auto p2 = iter.payload();
        auto iter1 = p2->find(k2);
        if (iter1 == p2->end()) {
            p2->insert(k2, new u32A());
            iter1 = p2->find(k2);
        }
        iter1.payload()->insert(k3, value[i]);
    }
    for (int i = 0; i < number; i++) {
        uint64_t k1 = key[i].a;
        uint64_t k2 = ((uint64_t)key[i].b) << 32 + (uint64_t)key[i].c;
        uint32_t k3 = key[i].d;
        auto iter = table.find(k1).payload()->find(k2).payload()->find(k3);
        VVV b = iter.payload();
        if (b.a != value[i].a || b.b != value[i].b
                || b.c != value[i].c) {
            printf("failed\n");
            return;
        }
    }
    gettimeofday(&end, NULL);
    int time_len = 1000000 * (end.tv_sec - start.tv_sec)
            + (end.tv_usec - start.tv_usec);
    std::cout << "It costs " << time_len << std::endl;

    uint64_t size = table.data_size() + table.model_size();

    for (auto e = table.begin(); e != table.end(); e++) {
        auto t2 = e.payload();
        size += t2->data_size();
        size += t2->model_size();
        for(auto e1 = t2->begin(); e1 != t2->end(); e1++){
            size += e1.payload()->data_size();
            size += e1.payload()->model_size();
        }
    }
    printf("alex size is %lld\n", size);
}

struct TupleHasher {
    std::size_t
    operator()(const KKK &key) const {
        return key.a;
    }
};

struct TupleEqualer {
    bool operator()(const KKK &lhs, const KKK &rhs) const {
        return (lhs.a == rhs.a) && (lhs.b == rhs.b) && (lhs.c == rhs.c) && (lhs.d == rhs.d);
    }
};

void hashmap_test(std::vector<KKK> key,
        std::vector<VVV> value) {
    std::unordered_map<KKK, VVV, TupleHasher, TupleEqualer> table;
    int number = key.size();
    struct timeval start, end;
    gettimeofday(&start, NULL);
    for (int i = 0; i < number; i++) {
        table[key[i]] = value[i];
    }
    for (int i = 0; i < number; i++) {
        VVV b = table[key[i]];
        if (b.a != value[i].a || b.b != value[i].b
                || b.c != value[i].c) {
            printf("failed\n");
            return;
        }
    }
    gettimeofday(&end, NULL);
    int time_len = 1000000 * (end.tv_sec - start.tv_sec)
            + (end.tv_usec - start.tv_usec);
    std::cout << "It costs " << time_len << std::endl;
    uint64_t size = 0;

    for (unsigned i = 0; i < table.bucket_count(); ++i) {
        size_t bucket_size = table.bucket_size(i);
        if (bucket_size == 0) {
            size += sizeof(void*);
        } else {
            size += bucket_size * (sizeof(KKK) + sizeof(VVV));
            size += (bucket_size - 1) * sizeof(void*);
        }
    }

    printf("Hash table size is %lld\n", size);
}

int main(){
    std::vector<KKK> x_key;
    std::vector<VVV> x_value;
    for(int i=0; i<3199807; i++){
        KKK a;
        VVV b;
               // you can also generate them randomly
        a.a = i >> 96;
        a.b = i >> 64;
        a.c = i >> 32;
        a.d = i;
        b.a = 1 * i;
        b.b = 2 * i;
        b.c = 3 * i;
        x_key.push_back(a);
        x_value.push_back(b);
    }
    hashmap_test(x_key, x_value);
    alex_test(x_key, x_value);
    pgm_test(x_key, x_value);
}
gvinciguerra commented 3 years ago

I commented out the code related to hashmap_test and alex_test, and the program exits with code 0 after printing

It costs 1170674
pgm size is 92796943

So it does not seem related to pgm_test...

Also, please do not open multiple issues for the same question (#26 #27), I try my best to answer when I can.

authwork commented 3 years ago

@gvinciguerra I do as you suggested to comment out the code related to hashmap_test and alex_test. But it still got killed. Very strange:( I try this code with g++9 and g++7 on different ubuntu systems. I am using version 4cdd5de6a941de4653da04e8cecd299e62b47788

#include <iostream>
#include <sys/random.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <time.h>
#include <vector>
#include <iostream>
#include <algorithm>
#include <unordered_map>
#include <sys/time.h>
#include "pgm/pgm_index.hpp"
#include "pgm/pgm_index_dynamic.hpp"
#include "pgm/pgm_index_variants.hpp"

struct KKK {
    uint64_t a;
    uint32_t b, c, d;
};

struct VVV{
    uint64_t a, b, c;
};

typedef pgm::DynamicPGMIndex<uint32_t, VVV> u32P;
typedef pgm::DynamicPGMIndex<uint64_t, u32P*> u64PGM2;
typedef pgm::DynamicPGMIndex<uint64_t, u64PGM2*> u64PGM1;

void pgm_test(std::vector<KKK> key, std::vector<VVV> value){
    u64PGM1 table;
    int number = key.size();
    struct timeval start, end;
    gettimeofday(&start, NULL);

    for (int i = 0; i < number; i++) {
        uint64_t k1 = key[i].a;
        uint64_t k2 = ((uint64_t)key[i].b) << 32 + (uint64_t)key[i].c;
        uint32_t k3 = key[i].d;
        auto iter = table.find(k1);
        if(iter == table.end()){
            table.insert_or_assign(k1, new u64PGM2());
            iter = table.find(k1);
        }
        auto p2 = iter->second;
        auto iter1 = p2->find(k2);
        if(iter1 == p2->end()){
            p2->insert_or_assign(k2, new u32P());
            iter1 = p2->find(k2);
        }
        iter1->second->insert_or_assign(k3, value[i]);
    }
    for (int i = 0; i < number; i++) {
        uint64_t k1 = key[i].a;
        uint64_t k2 = ((uint64_t)key[i].b) << 32 + (uint64_t)key[i].c;
        uint32_t k3 = key[i].d;
        auto iter = table.find(k1)->second->find(k2)->second->find(k3);
        VVV b = iter->second;
        if (b.a != value[i].a || b.b != value[i].b
                || b.c != value[i].c) {
            printf("failed\n");
            return;
        }
    }
    gettimeofday(&end, NULL);
    int time_len = 1000000 * (end.tv_sec - start.tv_sec)
            + (end.tv_usec - start.tv_usec);
    std::cout << "It costs " << time_len << std::endl;
    uint64_t size = table.size_in_bytes();

    for (auto &e : table) {
        size += e.second->size_in_bytes();
        for(auto &e1 : *(e.second)){
            size += e1.second->size_in_bytes();
        }
    }
    printf("pgm size is %lld\n", size);
}

int main(){
    std::vector<KKK> x_key;
    std::vector<VVV> x_value;
    for(int i=0; i<3199807; i++){
        KKK a;
        VVV b;
        a.a = i >> 96;
        a.b = i >> 64;
        a.c = i >> 32;
        a.d = i;
        b.a = 1 * i;
        b.b = 2 * i;
        b.c = 3 * i;
        x_key.push_back(a);
        x_value.push_back(b);
    }
    pgm_test(x_key, x_value);
}
authwork commented 3 years ago

update:

On a machine with around 500G memory, I finish the test with output:

It costs 1648490
Hash table size is 175731056
It costs 15764876
alex size is 1575900636
It costs 75040896
pgm size is 4649320779

With only pgm_test, I got

It costs 69553765
pgm size is 4649320779

@gvinciguerra Why my result is too far away from what you got? T_T

authwork commented 3 years ago

@gvinciguerra update

It costs 1133163
alex size is 24442892
It costs 3632979
pgm size is 16984181

I found a mistake in my code,

        a.a = uint64_t((__uint128_t)i >> 96);
        a.b = uint32_t((__uint128_t)i >> 64);
        a.c = uint32_t((__uint128_t)i >> 32);
        a.d = i;

This will produce the correct result.