# then normalize by the number of unique k-mers (to get the containment index)
# In essence, this is the containment index, restricted to unique k-mers. This effectively increases the specificity,
# but also increases the variance/confidence interval, since this decreases the size of the sketch.
#
# FIXME: the choice of denominator is still open. Two candidates have been tried:
#   (a) len(unique_kmers), where
#           unique_kmers = {kmer[:k_range[k_size_loc]] for kmer in CEs[hash_loc]._kmers}
#       i.e. the number of distinct length-k prefixes of the stored k-mers
#       (this doesn't seem like the right way to normalize, but apparently it is!)
#   (b) num_unique[hash_loc, k_size_loc]
#       (in small tests, this seems to give better results)
# (b) is the one in effect. The prefix set needed only by (a) used to be rebuilt here for every
# (genome, k-size) pair and then discarded; it is no longer computed. Restore it from the
# expression in (a) if that denominator is reinstated. To be revisited.
# NOTE(review): num_unique is presumably a precomputed per-(genome, k-size) unique k-mer count -- confirm.
for k_size_loc in range(len(k_range)):
    # find the genomes with non-zero containment at this k-mer size
    for hash_loc in np.where(containment_indices[:, k_size_loc])[0]:
        containment_indices[hash_loc, k_size_loc] /= float(num_unique[hash_loc, k_size_loc])
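Question: which denominator is correct for normalizing the containment index?
Specifically:
should containment_indices[hash_loc, k_size_loc] be divided by len(unique_kmers) (the number of distinct length-k_size prefixes of CEs[hash_loc]._kmers) or by num_unique[hash_loc, k_size_loc]?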