Theft output suggests many duplicates created on big trial numbers.

ITheft says there are many duplicates when I test with big trial numbers. But the output seems to be misleading?

$ ./ip6dups 4>ips.6 5>hash.6                                         

== PROP 'test_ipv6_dups': 5000000 trials, seed 0x498fcff3b899ce7c
.......................................................................
.............................(PASS x 100)..............................
...................................................................d.. 
(PASS x 10000).ddddddddd(DUP x 10)ddd.dddddd(DUP x 100)d.ddd.ddddd     
(DUP x 1000)d.d.dd.dddd.d(DUP x 10000)d.d.d.dd.dddd(DUP x 100000)d.dddd
dddd(DUP x 1000000)dddd                                                
== PASS 'test_ipv6_dups': pass 136711, fail 0, skip 0, dup 4863289

I interpret the output like this. Theft created 5000000 test inputs. Then it called the test function only 136711 times. The other 4863289 inputs were not tested, because they were duplicates.

Is my understanding correct?

My test inputs are ipv6 addresses. Therefore I find it highly unlikely to get this many duplicates. Either the random number generator is bad, or the hash function used to check for duplicates is bad.

I printed the generated inputs and the hashes:

$ wc -l ips.6 hash.6               
  5000000 ips.6       
  5136711 hash.6 # hash_cb() is more often called then the test function?
$ sort ips.6|uniq |wc -l
5000000 # there are 5000000 different ips -> they're all unique.
$ sort hash.6|uniq |wc -l           
5000000 # there are 5000000 different hashes
$ sort hash.6|uniq -u|wc -l
4863289 # this many hashes appear only once

Why is the hash function called 5136711 times? If there are 5000000 different hashes, then the test should pass that many times, and have no duplicates!

I must be misinterpreting thefts output. And if that's true, the output is quiet misleading, IMHO.

I'm using theft 0.4.5.

Here is the code.

#include <theft.h>
#include <string.h>
#include <unistd.h>

#define FMT_ULONG 40

FILE *hash = 0;
FILE *ips = 0;

unsigned long trials = 5000000;

theft_hash ipv6_hash_cb(const void* instance, void* env);
unsigned int fmt_xlong(register char *s,register unsigned long u);
char tohex(char num);

enum theft_alloc_res
ipv6_unc_str_alloc_cb(struct theft *t, void *env, void **instance) {
    char *s = malloc(8*4+7+1);
    int l = 0;

    if (!s) return THEFT_ALLOC_ERROR;
    // make string without ::
    for (int i = 0; i<8; i++) {
        if (i) s[l++] = ':';

        int rand = theft_random_choice(t, 0x11110);
        int j = 4;
        char hex[4], pad = '0';
        while (rand >= 1 <<(4*j)) {
            s[l++] = '0';
            rand -= 1<<(4*j--);
        }

        if (fmt_xlong(hex, rand)>4) return THEFT_ALLOC_ERROR;
        while (j)
            s[l++] = hex[4-(j--)];
    }

    s[l++] = 0;
    *instance = s;

    if (ips) {
        fputs(s, ips);
        fputc('\n',ips);
    }
    return THEFT_ALLOC_OK;
}

void
print_ipv6(FILE *f, const void *instance, void *env) {
    fputs(instance, f);
}

static struct theft_type_info ipv6_str_info = {
    .alloc = ipv6_unc_str_alloc_cb,
    .free = theft_generic_free_cb,
    .print = print_ipv6,
    .hash = ipv6_hash_cb,
    .autoshrink_config = {
        .enable = true,
    },
};

static enum theft_trial_res
prop_dummy(struct theft *t, void *arg1) {
    return THEFT_TRIAL_PASS;
}

bool test_ipv6_dups(void) {
    /* Get a seed based on the current time */
    theft_seed seed = theft_seed_of_time();

    struct theft_run_config config = {
        .name = __func__,
        .prop1 = prop_dummy,
        .type_info = { &ipv6_str_info },
        .seed = seed,
        .trials = trials,
    };
    /* Run the property test. */
    enum theft_run_res res = theft_run(&config);
    return res == THEFT_RUN_PASS;
};

int main (int argc, char **argv) {

    hash = fopen("/proc/self/fd/5", "wb");
    ips = fopen("/proc/self/fd/4", "wb");

    int ret = !test_ipv6_dups();
    return ret;
}

theft_hash
ipv6_hash_cb(const void* instance, void* env) {
    size_t len = strlen(instance);
    struct theft_hasher h;
    theft_hash_init(&h);
    theft_hash_sink(&h, instance, len);
    theft_hash ret = theft_hash_done(&h);

    char buf[FMT_ULONG+1];
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
    BUILD_BUG_ON(sizeof(theft_hash) != sizeof(long));

    if (hash) {
        fwrite(buf, fmt_xlong(buf, ret), 1, hash);
        fputc('\n', hash);
    }
    return ret;
}

unsigned int fmt_xlong(register char *s,register unsigned long u)
{
    register unsigned int len; register unsigned long q;
    len = 1; q = u;
    while (q > 15) { ++len; q /= 16; }
    if (s) {
        s += len;
        do { *--s = tohex(u % 16); u /= 16; } while(u); /* handles u == 0 */
    }
    return len;
}

char tohex(char num) {
    if (num < 10)
        return num + '0';
    else if (num < 16)
        return num - 10 + 'a';
    else
        return -1;
}

This looks like a problem with the blocked bloom filter configuration. There isn't currently a way to change that exposed in the interface, but try editing src/theft_run.c to provide larger sizes than the default configuration:

@@ -101,7 +101,11 @@ theft_run_init(const struct theft_run_config *cfg, struct theft **output) {
     /* If all arguments are hashable, then attempt to use
      * a bloom filter to avoid redundant checking. */
     if (all_hashable) {
-        t->bloom = theft_bloom_init(NULL);
+        struct theft_bloom_config cfg = {
+            .top_block_bits = 12,  /* 12 to 14 */
+            .min_filter_bits = 12,
+        };
+        t->bloom = theft_bloom_init(&cfg);
     }

With that setting I'm getting more reasonable results:

.top_block_bits == 12:
== PASS 'test_ipv6_dups': pass 4178707, fail 0, skip 0, dup 821293
== PASS 'test_ipv6_dups': pass 4179940, fail 0, skip 0, dup 820060

.top_block_bits == 14:
== PASS 'test_ipv6_dups': pass 4395986, fail 0, skip 0, dup 604014
== PASS 'test_ipv6_dups': pass 4396817, fail 0, skip 0, dup 603183

Why is the hash function called 5136711 times?

Rather than storing the argument hashes it recalculates them on demand, and it uses the hashes both to check if the arguments have already been run AND to mark them as run, in sufficiently different code paths that retaining them does not seem worthwhile. There shouldn't be any expectation that the arguments will be hashed exactly once per trial. It will hash lots of variants of the input while shrinking failures.

Also, with autoshrink enabled, you don't need to provide a hash function at all -- it will hash the bits consumed by theft_random_choice, which should work well as long as the autoshrink invariants are met. Commenting out .hash = ipv6_hash_cb, gets a very similar result.

It should probably check the trial count in the configuration, and use larger values than the default if the trial count is higher than a certain threshold -- 5 million trials is enough to saturate the bloom filter even with its automatic growing strategy, its settings were tuned with around 10k - 100k trials in mind. I can't promise I'll get to it soon, but I will probably cut a new patch release with that, because the current behavior is pretty broken with very large trial counts.

Thanks for the detailed report, it made reproducing this easy.

silentbicycle / theft

Theft output suggests many duplicates created on big trial numbers. #59