ubilabs / kd-tree-javascript

JavaScript k-d Tree Implementation
MIT License
642 stars 109 forks source link

Distance metrics such as cosine similarity don't work #33

Open TheColorman opened 1 year ago

TheColorman commented 1 year ago

When using a metric such as Cosine similarity, the returned vector is almost never the one with the smallest distance value. Example:

Dataset ```javascript var points = [ { 'x': 61, 'y': 54, 'id': 'A' }, { 'x': 7, 'y': 54, 'id': 'B' }, { 'x': 62, 'y': 58, 'id': 'C' }, { 'x': 75, 'y': 48, 'id': 'D' }, { 'x': 76, 'y': 52, 'id': 'E' }, { 'x': 41, 'y': 16, 'id': 'F' }, { 'x': 35, 'y': 35, 'id': 'G' }, { 'x': 34, 'y': 63, 'id': 'H' }, { 'x': 22, 'y': 32, 'id': 'I' }, { 'x': 0, 'y': 53, 'id': 'J' }, { 'x': 13, 'y': 91, 'id': 'K' }, { 'x': 87, 'y': 24, 'id': 'L' }, { 'x': 63, 'y': 2, 'id': 'M' }, { 'x': 10, 'y': 76, 'id': 'N' }, { 'x': 85, 'y': 98, 'id': 'O' }, { 'x': 94, 'y': 90, 'id': 'P' }, { 'x': 52, 'y': 47, 'id': 'Q' }, { 'x': 100, 'y': 38, 'id': 'R' }, { 'x': 95, 'y': 47, 'id': 'S' }, { 'x': 62, 'y': 15, 'id': 'T' }, { 'x': 93, 'y': 66, 'id': 'U' }, { 'x': 46, 'y': 54, 'id': 'V' }, { 'x': 85, 'y': 99, 'id': 'W' }, { 'x': 32, 'y': 53, 'id': 'X' }, { 'x': 11, 'y': 37, 'id': 'Y' }, { 'x': 0, 'y': 54, 'id': 'Z' }, { 'x': 90, 'y': 100, 'id': 's' }, { 'x': 84, 'y': 58, 'id': 't' }, { 'x': 97, 'y': 35, 'id': 'u' }, { 'x': 24, 'y': 34, 'id': 'v' }, { 'x': 67, 'y': 70, 'id': 'w' }, { 'x': 16, 'y': 7, 'id': 'x' }, { 'x': 27, 'y': 73, 'id': 'a' }, { 'x': 0, 'y': 35, 'id': 'b' }, { 'x': 97, 'y': 47, 'id': 'c' }, { 'x': 44, 'y': 56, 'id': 'd' }, { 'x': 23, 'y': 11, 'id': 'e' }, { 'x': 3, 'y': 80, 'id': 'f' }, { 'x': 87, 'y': 27, 'id': 'g' }, { 'x': 42, 'y': 29, 'id': 'h' }, { 'x': 77, 'y': 72, 'id': 'i' }, { 'x': 40, 'y': 10, 'id': 'j' }, { 'x': 86, 'y': 91, 'id': 'k' }, { 'x': 43, 'y': 23, 'id': 'l' }, { 'x': 55, 'y': 13, 'id': 'm' }, { 'x': 88, 'y': 14, 'id': 'n' }, { 'x': 67, 'y': 22, 'id': 'o' }, { 'x': 88, 'y': 91, 'id': 'p' }, { 'x': 82, 'y': 33, 'id': 'q' }, { 'x': 97, 'y': 29, 'id': 'r' } ] ```

Cosine similarity (note the negative sign at the end of the function, as it would normally increase with similar vectors):

    const cosineSimilarity = (vec1, vec2) => {
        let dotProduct = 0;
        let magnitude1 = 0;
        let magnitude2 = 0;

        for (const dim of Object.keys(vec1)) {
            if (typeof (vec1[dim]) != 'number') continue;

            dotProduct += vec1[dim] * vec2[dim];
            magnitude1 += Math.pow(vec1[dim], 2);
            magnitude2 += Math.pow(vec2[dim], 2);
        }

        return -dotProduct / (Math.sqrt(magnitude1) * Math.sqrt(magnitude2));
    };

Inefficient function the get the closest vectors from an array.

    const closestVectors = (vec, vectors) => {
        const distances = vectors.map(v => cosineSimilarity(vec, v));
        const min = Math.min(...distances);
        // Check if multiple with same distance
        const minIndices = distances.reduce((a, e, i) => {
            if (e === min) a.push(i);
            return a;
        }, []);
        // Return list of closest vectors
        return { vectors: minIndices.map(i => vectors[i]), distances: minIndices.map(i => distances[i]) };
    };

Testing

    const test = { x: 200, y: 123, label: "TEST" }
    var tree = new kdTree(points, cosineSimilarity, ['x', 'y']);

    const nearest = tree.nearest(test, 1);
    const nearest2 = closestVectors(test, points);

    console.log("K-D Tree")
    console.log(nearest)
    console.log("Custom")
    console.log(nearest2)

  // > K-D Tree
  // > [ [ { x: 76, y: 52, id: 'E' }, -0.998815642613221 ] ]
  // > Custom
  // > {
  //     vectors: [ { x: 75, y: 48, id: 'D' } ],
  //     distances: [ -0.999839132267399 ]
  //   }

Clearly the tree returns a vector with a slightly larger distance (0.998 vs 0.999).