HashtagSell / posting-api

API for storage and retrieval of posting details for Hashtagsell
0 stars 0 forks source link

De-dupe search results #31

Open that1guy opened 9 years ago

that1guy commented 9 years ago

I have code that will de-dupe records returned from mongo.find. This should probably be included somewhere in the waterfall block before returning results. Give me your suggestion on how to implement it, and I will do this on a separate branch and send a pull request.

that1guy commented 9 years ago

FYI, I can do this on client-side if that's helpful. Just say the word so we're not duplicating efforts.

that1guy commented 9 years ago

Cleaning up code on my end. Before I delete my de-dupe code, here it is:

var HashTable = require('hashtable');

exports.dedupe = function(result, promise){

    var userLat = result.location.latitude;

    var userLong = result.location.longitude;

    var response = result.external;

    var deDupeExternalID = new HashTable();

    var deDupeHeading = new HashTable();

    var duplicates = [];

    var originals = [];

    deDupeExternalID.put(response.postings[0].external_url, 0);
    //console.log("ID is unique: " + response.postings[0].external_id);
    deDupeHeading.put(response.postings[0].heading, 0);
    //console.log("Heading is unique: " + response.postings[0].heading);

    for (var i = 1; i < response.postings.length; i++) {

        result = response.postings[i];

        if(typeof deDupeExternalID.get(result.external_url) === 'undefined'){
            //console.log("URL is unique: " + result.external_url);
            deDupeExternalID.put(result.external_url, i);

            if(typeof deDupeHeading.get(result.heading) === 'undefined'){
                //console.log("Heading is unique: "+ result.heading);
                deDupeHeading.put(result.heading, i);

//              TODO: Clean up HTML
                originals.push(convertToHTSObjStructure(result, userLat, userLong));

            } else {

                duplicates.push(result);
                console.log("Duplicate Heading: "+ result.heading);
            }
        } else {
            duplicates.push(result);
            console.log("Duplicate URL: "+result.external_url);
        }
    }

    console.log("!!!!!!!!!~~~~ DONE WITH DEDUPE ~~~~!!!!!!!!!");
    console.log(duplicates.length + " Duplicates");
    console.log(originals.length + " Originals");
    console.log("!!!!!!!!!~~~~ DONE WITH DEDUPE ~~~~!!!!!!!!!");

    promise(null, originals);

};
that1guy commented 9 years ago

This is becoming the only weak spot in search that I can see. Knock this out and I think we're golden.

https://staging-posting-api.hashtagsell.com/v1/postings/?start=0&count=35&filters[mandatory][contains][heading]=htc&filters[optional][exact][categoryCode]=SELE,SAPL&geo[lookup]=true&geo[min]=0&geo[max]=12890000