ninya-io / ninya.io

Find StackOverflow users near you by tags and reputation
MIT License
60 stars 13 forks source link

Try ElasticSearch #24

Closed cburgdorf closed 10 years ago

cburgdorf commented 10 years ago

We want to build more sophisticated search / sorting feature and I feel that this might be challenging with postgres.

This looks promising:

http://docs.fullscale.co/elasticjs/

The thing is, we can keep the current synchronization. This works rock solid and I'm not willing to invest time into yet another rewrite. We can just keep syncing the way we do it now and then just update ElasticSearch with our ready to use data. This should be straight forward. I'm sure @kyjan can help here :)

cburgdorf commented 10 years ago

@kyjan here is the challenge. This is how a user looks like:

{
    "user_id": 134633,
    "user_type": "registered",
    "creation_date": 1247015785,
    "display_name": "caf",
    "reputation": 107684,
    "email_hash": "dc6be45ddcfd038ba5e1e5fca319c7fa",
    "last_access_date": 1392553979,
    "website_url": "",
    "location": "Australia",
    "about_me": "",
    "question_count": 3,
    "answer_count": 2619,
    "view_count": 4496,
    "up_vote_count": 5100,
    "down_vote_count": 70,
    "user_questions_url": "/users/134633/questions",
    "user_answers_url": "/users/134633/answers",
    "user_favorites_url": "/users/134633/favorites",
    "user_tags_url": "/users/134633/tags",
    "user_badges_url": "/users/134633/badges",
    "user_timeline_url": "/users/134633/timeline",
    "user_mentioned_url": "/users/134633/mentioned",
    "user_comments_url": "/users/134633/comments",
    "user_reputation_url": "/users/134633/reputation",
    "badge_counts": {
        "gold": 8,
        "silver": 105,
        "bronze": 230
    },
    "top_tags": [{
        "tag_name": "c",
        "question_score": 19,
        "question_count": 1,
        "answer_score": 6054,
        "answer_count": 1735
    }, {
        "tag_name": "linux",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 2246,
        "answer_count": 619
    }, {
        "tag_name": "c++",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 1870,
        "answer_count": 317
    }, {
        "tag_name": "linux-kernel",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 496,
        "answer_count": 191
    }, {
        "tag_name": "java",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 493,
        "answer_count": 52
    }, {
        "tag_name": "performance",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 490,
        "answer_count": 38
    }, {
        "tag_name": "optimization",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 482,
        "answer_count": 33
    }, {
        "tag_name": "sockets",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 455,
        "answer_count": 161
    }, {
        "tag_name": "pthreads",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 433,
        "answer_count": 151
    }, {
        "tag_name": "pointers",
        "question_score": 19,
        "question_count": 1,
        "answer_score": 431,
        "answer_count": 110
    }, {
        "tag_name": "gcc",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 414,
        "answer_count": 88
    }, {
        "tag_name": "branch-prediction",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 383,
        "answer_count": 1
    }, {
        "tag_name": "openssl",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 327,
        "answer_count": 100
    }, {
        "tag_name": "encryption",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 322,
        "answer_count": 110
    }, {
        "tag_name": "algorithm",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 283,
        "answer_count": 35
    }, {
        "tag_name": "arrays",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 278,
        "answer_count": 69
    }, {
        "tag_name": "networking",
        "question_score": 6,
        "question_count": 1,
        "answer_score": 278,
        "answer_count": 106
    }, {
        "tag_name": "unix",
        "question_score": 2,
        "question_count": 1,
        "answer_score": 263,
        "answer_count": 74
    }, {
        "tag_name": "multithreading",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 246,
        "answer_count": 89
    }, {
        "tag_name": "tcp",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 191,
        "answer_count": 94
    }, {
        "tag_name": "memory",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 182,
        "answer_count": 41
    }, {
        "tag_name": "security",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 177,
        "answer_count": 41
    }, {
        "tag_name": "memory-management",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 176,
        "answer_count": 43
    }, {
        "tag_name": "posix",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 158,
        "answer_count": 53
    }, {
        "tag_name": "assembly",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 155,
        "answer_count": 41
    }, {
        "tag_name": "signals",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 152,
        "answer_count": 37
    }, {
        "tag_name": "string",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 148,
        "answer_count": 49
    }, {
        "tag_name": "cryptography",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 133,
        "answer_count": 55
    }, {
        "tag_name": "php",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 125,
        "answer_count": 31
    }, {
        "tag_name": "x86",
        "question_score": 0,
        "question_count": 0,
        "answer_score": 123,
        "answer_count": 19
    }]
}

We want to perform different kind of searches:

  1. Only match against location (with wildcard e.g Han* for Hannover)
  2. Match against location and against the existence of a tag_name that matches a given string (with wildcard e.g. Ang* for AngularJS)
  3. For the queries above, I would like to be able to either rank by reputation or
  4. rank by the answer_score of the matched tag (thats the difficulty right now)

Are you familiar with such kind of queries?

janpantel commented 10 years ago

@cburgdorf this is an easy lift for elasticsearch :) I think we can archive this goal realy fast

cburgdorf commented 10 years ago

@kyjan I booked at found.no and started playing with it. Would love to pair with you on that. Are you in the office tomorrow or on Monday? I plan to work from home on Friday.

janpantel commented 10 years ago

@cburgdorf i will join you in the office in a bunch of minutes since my prof is just reading the script for us ;)

cburgdorf commented 10 years ago

Paired with @kyjan on this today and we came up with this epic query.

{
    "sort": [{
        "score": {
            "order": "desc",
            "mode": "sum",
            "nested_path": "tags",
            "nested_filter": {
                "bool": {
                    "should": [{
                        "prefix": {
                            "tag_name": "node"
                        }
                    }, {
                        "prefix": {
                            "tag_name": "ang"
                        }
                    }]
                }
            }
        }
    }],
    "query": {
        "bool": {
            "must": [{
                "bool": {
                    "should": [{
                        "wildcard": {
                            "location": "han*"
                        }
                    }, {
                        "wildcard": {
                            "location": "ber*"
                        }
                    }],
                    "minimum_should_match": 1
                }
            }, {
                "nested": {
                    "path": "tags",
                    "query": {
                        "bool": {
                            "should": [{
                                "prefix": {
                                    "tags.tag_name": "ph"
                                }
                            }, {
                                "prefix": {
                                    "tags.tag_name": "ang"
                                }
                            }]
                        }
                    }
                }
            }]
        }
    }
}

This is the mapping needed for that to work:

{
    "user": {
        "properties": {
            "location": {
                "type": "string"
            },
            "tags": {
                "type": "nested",
                "properties": {
                    "tag_name": {
                        "type": "string"
                    },
                    "score": {
                        "type": "integer"
                    }
                }
            }
        }
    }
}
cburgdorf commented 10 years ago

I checked the stats yesterday. The postgres DB is roughly at 2 GB already. But that's not an issue since it is only restricted to row numbers and we still have plenty of room until we hit 10 millions of records :)

I roughly calculated the expected elasticsearch size and we would also hit 2 GB quite early (surprise, surprise). So for now, I guess we should keep syncing with postgres and then update the elasticsearch index from their.

cburgdorf commented 10 years ago

sync in progress :)

cburgdorf commented 10 years ago

all users synced! And the queries seem to run quite fast! So looking forward to this change!