radareorg / sdb

Simple and fast string based key-value database with support for arrays and json
https://www.radare.org/
MIT License
218 stars 62 forks source link

SdbHt: reintroduce growing #160

Closed ret2libc closed 5 years ago

radare commented 6 years ago

there are build errors. not gonna review something that is not even building

cd api && make
make[2]: Entering directory `/home/travis/build/radare/sdb/test/api'
/tmp/cclxC58f.o: In function `main':
array.c:(.text+0x12d7): multiple definition of `main'
/tmp/ccrn7qoh.o:array.c:(.text+0x12d7): first defined here
collect2: error: ld returned 1 exit status
make[2]: *** [all] Error 1
make[2]: Leaving directory `/home/travis/build/radare/sdb/test/api'
make[1]: [all] Error 2 (ignored)
make[2]: Entering directory `/home/travis/build/radare/sdb/test/unit'
gcc -I../../src -g -O0 test_array.c -o test_array ../../src/libsdb.a
In file included from test_array.c:1:0:
minunit.h:25:23: fatal error: sdb/types.h: No such file or directory
 #include <sdb/types.h>
                       ^
compilation terminated.
make[2]: *** [test_array] Error 1
make[2]: Leaving directory `/home/travis/build/radare/sdb/test/unit'
make[1]: *** [all] Error 2
make[1]: Leaving directory `/home/travis/build/radare/sdb/test'
make: *** [test] Error 2
ret2libc commented 6 years ago

there are build errors. not gonna review something that is not even building

it is building... as you can see the make exits with 0. It's just because this PR depends on https://github.com/radare/sdb/pull/158 to make tests work.

radare commented 5 years ago

wat nao

ret2libc commented 5 years ago

I will rebase this soon!

ret2libc commented 5 years ago

Performance test:

#include "minunit.h"
#include <stdlib.h>
#include <sdb.h>
#include <time.h>

#define SEED 0xdeadbeef
#define INSERT_SZ   10000
#define FIND_SZ     1000000

#define INSERT_SZ2  10000000
#define FIND_SZ2    100000000

#define MOD_VAL    100000

void free_key(HtKv *kv) {
    free (kv->key);
    free (kv);
}

int main(int argc, char **argv) {
    // on master this defaults to allocating 1024 elements
    // on my PR it starts with 3 elements
    SdbHt *ht = ht_new (NULL, free_key, NULL);
    int i;

    clock_t start, end;
    double cpu_time_used, time_taken;

    // few elements
    srand(SEED);
    start = clock();
    for (i = 0; i < INSERT_SZ; ++i) {
        int val = rand() % MOD_VAL;
        char buf[20];
        snprintf (buf, 20, "%d", val);
        ht_insert (ht, buf, (void *)(ut64)val);
    }
    end = clock();
    time_taken = ((double)end - start)/CLOCKS_PER_SEC; // in seconds 
    printf("insert time: %f seconds\n", time_taken);

    long long sum = 0;
    start = clock();
    for (i = 0; i < FIND_SZ; ++i) {
        int val = rand() % MOD_VAL;
        char buf[20];
        snprintf (buf, 20, "%d", val);
        int res = (int)(ut64)ht_find (ht, buf, NULL);
        sum += res;
    }
    end = clock();
    time_taken = ((double)end - start)/CLOCKS_PER_SEC; // in seconds 
    printf("find time: %f seconds, sum = %lld\n", time_taken, sum);
    ht_free (ht);

    // many elements
    ht = ht_new (NULL, free_key, NULL);
    srand(SEED);
    start = clock();
    for (i = 0; i < INSERT_SZ2; ++i) {
        int val = rand() % MOD_VAL;
        char buf[20];
        snprintf (buf, 20, "%d", val);
        ht_insert (ht, buf, (void *)(ut64)val);
    }
    end = clock();
    time_taken = ((double)end - start)/CLOCKS_PER_SEC; // in seconds 
    printf("insert time (2 time): %f seconds\n", time_taken);

    sum = 0;
    start = clock();
    for (i = 0; i < FIND_SZ2; ++i) {
        int val = rand() % MOD_VAL;
        char buf[20];
        snprintf (buf, 20, "%d", val);
        int res = (int)(ut64)ht_find (ht, buf, NULL);
        sum += res;
    }
    end = clock();
    time_taken = ((double)end - start)/CLOCKS_PER_SEC; // in seconds 
    printf("find time (2 time): %f seconds, sum = %lld\n", time_taken, sum);

    ht_free (ht);
    return 0;
}

Results on master:

BASEDIR=$(pwd)/../../build-master make test_hash_performance ; ./test_hash_performance                                                         
cc -I/home/rschiron/projects/sdb/test/../src -I/home/rschiron/projects/sdb/test/unit/../../build-master  -g -O0 test_hash_performance.c -o test_hash_performance /home/rschiron/projects/sdb/test/unit/../../build-master/libsdb.a
insert time: 0.004317 seconds
find time: 0.413766 seconds, sum = 4738954741
insert time (2 time): 36.084946 seconds
find time (2 time): 348.878303 seconds, sum = 5000114468030

Results on my branch:

BASEDIR=$(pwd)/../../build make test_hash_performance ; ./test_hash_performance                                                                
cc -I/home/rschiron/projects/sdb/test/../src -I/home/rschiron/projects/sdb/test/unit/../../build  -g -O0 test_hash_performance.c -o test_hash_performance /home/rschiron/projects/sdb/test/unit/../../build/libsdb.a
insert time: 0.013456 seconds
find time: 0.175061 seconds, sum = 4738954741
insert time (2 time): 5.590546 seconds
find time (2 time): 43.648521 seconds, sum = 5000114468030

As you can see, this PR makes it just so much better in everything. And this is not even a fair comparison since the ht in my PR will start with 3 elements and not with 1024 as it is in master (which is many time a waste of space, as one of the issue has shown)

radare commented 5 years ago

The only waste is u allocate tons of sdb. Which is not the case. Fixed size is fine for most use cases

On 5 Oct 2018, at 19:00, Riccardo Schirone notifications@github.com wrote:

Performance test:

include "minunit.h"

include

include

include

define SEED 0xdeadbeef

define INSERT_SZ 10000

define FIND_SZ 1000000

define INSERT_SZ2 10000000

define FIND_SZ2 100000000

define MOD_VAL 100000

void free_key(HtKv *kv) { free (kv->key); free (kv); }

int main(int argc, char *argv) { // on master this defaults to allocating 1024 elements // on my PR it starts with 3 elements SdbHt ht = ht_new (NULL, free_key, NULL); int i;

clock_t start, end; double cpu_time_used, time_taken;

// few elements srand(SEED); start = clock(); for (i = 0; i < INSERT_SZ; ++i) { int val = rand() % MOD_VAL; char buf[20]; snprintf (buf, 20, "%d", val); ht_insert (ht, buf, (void *)(ut64)val); } end = clock(); time_taken = ((double)end - start)/CLOCKS_PER_SEC; // in seconds printf("insert time: %f seconds\n", time_taken);

long long sum = 0; start = clock(); for (i = 0; i < FIND_SZ; ++i) { int val = rand() % MOD_VAL; char buf[20]; snprintf (buf, 20, "%d", val); int res = (int)(ut64)ht_find (ht, buf, NULL); sum += res; } end = clock(); time_taken = ((double)end - start)/CLOCKS_PER_SEC; // in seconds printf("find time: %f seconds, sum = %lld\n", time_taken, sum); ht_free (ht);

// many elements ht = ht_new (NULL, free_key, NULL); srand(SEED); start = clock(); for (i = 0; i < INSERT_SZ2; ++i) { int val = rand() % MOD_VAL; char buf[20]; snprintf (buf, 20, "%d", val); ht_insert (ht, buf, (void *)(ut64)val); } end = clock(); time_taken = ((double)end - start)/CLOCKS_PER_SEC; // in seconds printf("insert time (2 time): %f seconds\n", time_taken);

sum = 0; start = clock(); for (i = 0; i < FIND_SZ2; ++i) { int val = rand() % MOD_VAL; char buf[20]; snprintf (buf, 20, "%d", val); int res = (int)(ut64)ht_find (ht, buf, NULL); sum += res; } end = clock(); time_taken = ((double)end - start)/CLOCKS_PER_SEC; // in seconds printf("find time (2 time): %f seconds, sum = %lld\n", time_taken, sum);

ht_free (ht); return 0; } Results on master:

BASEDIR=$(pwd)/../../build-master make test_hash_performance ; ./test_hash_performance
cc -I/home/rschiron/projects/sdb/test/../src -I/home/rschiron/projects/sdb/test/unit/../../build-master -g -O0 test_hash_performance.c -o test_hash_performance /home/rschiron/projects/sdb/test/unit/../../build-master/libsdb.a insert time: 0.004317 seconds find time: 0.413766 seconds, sum = 4738954741 insert time (2 time): 36.084946 seconds find time (2 time): 348.878303 seconds, sum = 5000114468030 Results on my branch:

BASEDIR=$(pwd)/../../build make test_hash_performance ; ./test_hash_performance
cc -I/home/rschiron/projects/sdb/test/../src -I/home/rschiron/projects/sdb/test/unit/../../build -g -O0 test_hash_performance.c -o test_hash_performance /home/rschiron/projects/sdb/test/unit/../../build/libsdb.a insert time: 0.013456 seconds find time: 0.175061 seconds, sum = 4738954741 insert time (2 time): 5.590546 seconds find time (2 time): 43.648521 seconds, sum = 5000114468030 As you can see, this PR makes it just so much better in everything. And this is not even a fair comparison since the ht in my PR will start with 3 elements and not with 1024 as it is in master (which is many time a waste of space, as one of the issue has shown)

— You are receiving this because you commented. Reply to this email directly, view it on GitHub, or mute the thread.

ret2libc commented 5 years ago

The only waste is u allocate tons of sdb. Which is not the case. Fixed size is fine for most use cases

Yes, that's a problem that I'm going to fix in the next PR, where I will use an approach similar to dict. But for now, I think this PR is already good as it is, since it definitely improves over the previous implementation (which already used sdblists)