Scalable MinHash computation
npm i @5app/lsh
const Lsh = require('@5app/lsh')
// Default number of bands, used when the constructor receives no `bands` argument.
const B = 10;
// Default band height, used when the constructor receives no `height` argument.
const R = 5;

/**
 * Skeleton of a project-specific LSH class built on top of `Lsh`.
 *
 * Every method body below is a placeholder describing what the implementation
 * is expected to return; fill each one in for your own data source.
 * NOTE(review): these methods are presumably invoked by the base class while
 * `run()` executes — confirm against the `@5app/lsh` documentation.
 */
class MyDataLsh extends Lsh {
  /**
   * @param {number} [bands=B] - number of bands
   * @param {number} [height=R] - height of each band
   */
  constructor (bands = B, height = R) {
    // set default permutation params
    super(bands, height)
  }

  /**
   * @param {object} params
   * @param {*} params.cursorId - id to start the slice from
   * @param {number} params.size - number of ids to return
   * @returns {Promise<*>} a slice of column ids
   */
  async getColumnIdSlice ({ cursorId, size, ...custom }) {
    // return a number {size} of ids from cursorId
  }

  /**
   * @param {object} params
   * @param {*} params.cursorId - id to start the slice from
   * @param {number} params.size - number of ids to return
   * @returns {Promise<*>} a slice of row ids
   */
  async getRowIdSlice ({ cursorId, size, ...custom }) {
    // return a number {size} of ids from cursorId
  }

  /**
   * @param {object} custom - custom parameters
   * @returns {Promise<*>} the total number of rows
   */
  async getRowCount ({ ...custom }) {
    // return total numbers of rows
  }

  /**
   * @param {object} params
   * @param {*} params.columnIds - columns to read
   * @param {*} params.rowIds - rows to read
   * @returns {Promise<*>} the shingles for the specified columns and rows
   */
  async getShingles ({ columnIds, rowIds, ...custom }) {
    // return Shingles for specified columns and rows
  }

  /**
   * Stores one batch of minhashes and bucket info.
   *
   * @param {object} params
   * @param {*} params.index
   * @param {*} params.buckets
   * @param {*} params.data - object available for in-memory storage
   */
  async store ({ index, buckets, data, ...custom }) {
    // store a batch of minhashes and bucket info
    // use data object to store in memory
  }

  /**
   * Finalises the LSH storage.
   *
   * @returns {Promise<*>} a report object
   */
  async finalise ({ blocks, columns, rows, stamp, data }) {
    // ... finalise info lsh storage
    // return report object
  }

  /** @returns {*} the permutation limit */
  static get limit () {
    // return permutation limit
  }

  /** @returns {*} the stringified value */
  static signature (value, index) {
    // return stringified value
  }

  /** @returns {*} whether this bucket is null */
  static ignore (bucketId) {
    // return whether this bucket is null
  }

  /** @returns {*} the formatted bucketId to append to the minhash */
  static format (bucketId, index) {
    // return formatted bucketId to append to minhash
  }
}

module.exports = MyDataLsh
2. Compute and compare your minhashes
```javascript
const MyDataLsh = require('./myDataLsh')
const { compare, getItemMinHash } = require('./myMethods')

// Instantiate with explicit (bands, height) instead of the class defaults.
const lsh = new MyDataLsh(10, 10)

// ...

// Compute and store the minhash of your items, one block at a time.
const blockSize = 25 // size of blocks to be computed
const report = await lsh.run(custom, blockSize)

// ...

// Fetch the minhash of both items concurrently, then compare them.
const pendingA = getItemMinHash(itemA.id)
const pendingB = getItemMinHash(itemB.id)
const [ minHashA, minHashB ] = await Promise.all([ pendingA, pendingB ])
const similarity = compare(minHashA, minHashB)

// ...
```

```sh
npm test
```
We use SemVer for versioning. For the versions available, see the tags on this repository.