greg7mdp / parallel-hashmap

A family of header-only, very fast and memory-friendly hashmap and btree containers.
https://greg7mdp.github.io/parallel-hashmap/
Apache License 2.0
2.47k stars 234 forks source link

custom hasher in initializer constructor with arguments for parallel_hash_map #246

Closed gf777 closed 2 weeks ago

gf777 commented 1 month ago

Hi @greg7mdp, I was looking at https://github.com/greg7mdp/parallel-hashmap/issues/125 and thought I could use the same syntax for parallel_hash_map, but it doesn't work:

  // Hash functor for k-mers that is insensitive to strand: it hashes the first
  // kPrefixLen positions of the forward sequence and of the reverse complement
  // and returns the smaller of the two values.
  // Relies on seqBuf, kLen and kPrefixLen being declared outside this struct.
  // NOTE(review): assumes kPrefixLen does not change after a hasher has been built,
  // since every instance sizes its table from it — confirm.
  struct KeyHasher {

        // Owned table: pows[i] == 4^i (mod 2^64), one entry per hashed position.
        uint64_t* pows = new uint64_t[kPrefixLen];

        KeyHasher() {
            for(uint8_t p = 0; p<kPrefixLen; ++p) // precomputes the powers of 4
                pows[p] = p < 32 ? uint64_t{1} << (2*p) : 0; // exact integer 4^p; 4^32 and above wrap to 0
        }

        // Deep copy: containers copy their hasher, and every copy must own its own
        // table so the destructor below never frees the same array twice.
        // pows has already been allocated here by its default member initializer.
        KeyHasher(const KeyHasher& other) {
            for(uint8_t p = 0; p<kPrefixLen; ++p)
                pows[p] = other.pows[p];
        }

        // Both tables have the same length, so assignment only copies the entries.
        KeyHasher& operator=(const KeyHasher& other) {
            for(uint8_t p = 0; p<kPrefixLen; ++p)
                pows[p] = other.pows[p];
            return *this;
        }

        ~KeyHasher() { delete[] pows; } // the table was previously never released

        // Returns min(forward hash, reverse-complement hash) of the k-mer that `key` points to.
        std::size_t operator()(const Key& key) const {

            uint8_t *kmerPtr = seqBuf->seq+key.getKmer();
            uint64_t fw = 0, rv = 0; // hashes for both forward and reverse complement sequence

            for(uint8_t c = 0; c<kPrefixLen; ++c) { // for each position up to kPrefixLen
                fw += *(kmerPtr+c) * pows[c]; // base * 4^c
                rv += (3-(*(kmerPtr+kLen-1-c))) * pows[c]; // we walk the kmer backward to compute the rvcp
            }

            // return fw < rv ? 0 : 0; // even if they end up in the same bucket it's fine!
            return fw < rv ? fw : rv;
        }
    };

    struct KeyEqualTo {

        Buf<uint8_t> *seqBuf, *seqBuf2;

        KeyEqualTo(Buf<uint8_t> *seqBuf = NULL, Buf<uint8_t> *seqBuf2 = NULL) : seqBuf(seqBuf), seqBuf2(seqBuf2) {}

        constexpr bool operator()(const Key& key1, const Key& key2) const {

            uint8_t *lhs = seqBuf->seq+key1.getKmer(), *rhs = seqBuf2->seq+key2.getKmer();

            for(uint32_t i = 0; i<kLen; ++i) { // check fw
                if(lhs[i] != rhs[i])
                    break;
                if (i == kLen-1)
                    return true;
            }
            for(uint32_t i = 0; i<kLen; ++i) { // if fw fails, check rv
                if(lhs[i] != 3-rhs[kLen-i-1])
                    return false;
            }
            return true;
        }
    };
`Kmap(UserInput& userInput) : userInput(userInput), k{kPrefixLen}, maps(0, KeyHasher(), KeyEqualTo(seqBuf, seqBuf2)), maps32(0, KeyHasher(), KeyEqualTo(seqBuf, seqBuf2))`

What is the right syntax? Thanks!!

gf777 commented 2 weeks ago

Actually, I realize that the syntax is the same:

// Allocates the m-th map on the heap and stores its pointer in maps[m]; the hasher and the
// equality functor are both built from the same (seqBuf[m].seq, prefix, k) arguments.
// NOTE(review): the leading 0 is presumably the initial bucket count — confirm against the ParallelMap type.
maps[m] = new ParallelMap(0, KeyHasher(seqBuf[m].seq, prefix, k), KeyEqualTo(seqBuf[m].seq, prefix, k));

And:

struct KeyHasher {

    uint8_t prefix;
    uint32_t k;
    uint64_t* pows;
    Buf2bit<> *seqBuf;

    KeyHasher() {}

    KeyHasher(Buf2bit<> *seqBuf, uint8_t prefix, uint32_t k) : prefix(prefix), k(k), seqBuf(seqBuf) {
        pows = new uint64_t[prefix];
        for(uint8_t p = 0; p<prefix; ++p) // precomputes the powers of k
            pows[p] = (uint64_t) pow(4,p);
    }
    std::size_t operator()(const Key& key) const {

        uint64_t fw = 0, rv = 0, offset = key.getKmer(); // hashes for both forward and reverse complement sequence

        for(uint8_t c = 0; c<prefix; ++c) { // for each position up to prefix len

            fw += seqBuf->at(offset+c) * pows[c]; // base * 2^N
            rv += (3-seqBuf->at(offset+k-1-c)) * pows[c]; // we walk the kmer backward to compute the rvcp
        }
        // return fw < rv ? 0 : 0; // even if they end up in the same bucket it's fine!
        return fw < rv ? fw : rv;
    }
};
greg7mdp commented 2 weeks ago

@gf777 Glad you found your answer. Sorry I wasn't more helpful; I'm very busy these days.