Closed cburgdorf closed 10 years ago
@kyjan here is the challenge. This is how a user looks like:
{
"user_id": 134633,
"user_type": "registered",
"creation_date": 1247015785,
"display_name": "caf",
"reputation": 107684,
"email_hash": "dc6be45ddcfd038ba5e1e5fca319c7fa",
"last_access_date": 1392553979,
"website_url": "",
"location": "Australia",
"about_me": "",
"question_count": 3,
"answer_count": 2619,
"view_count": 4496,
"up_vote_count": 5100,
"down_vote_count": 70,
"user_questions_url": "/users/134633/questions",
"user_answers_url": "/users/134633/answers",
"user_favorites_url": "/users/134633/favorites",
"user_tags_url": "/users/134633/tags",
"user_badges_url": "/users/134633/badges",
"user_timeline_url": "/users/134633/timeline",
"user_mentioned_url": "/users/134633/mentioned",
"user_comments_url": "/users/134633/comments",
"user_reputation_url": "/users/134633/reputation",
"badge_counts": {
"gold": 8,
"silver": 105,
"bronze": 230
},
"top_tags": [{
"tag_name": "c",
"question_score": 19,
"question_count": 1,
"answer_score": 6054,
"answer_count": 1735
}, {
"tag_name": "linux",
"question_score": 0,
"question_count": 0,
"answer_score": 2246,
"answer_count": 619
}, {
"tag_name": "c++",
"question_score": 0,
"question_count": 0,
"answer_score": 1870,
"answer_count": 317
}, {
"tag_name": "linux-kernel",
"question_score": 0,
"question_count": 0,
"answer_score": 496,
"answer_count": 191
}, {
"tag_name": "java",
"question_score": 0,
"question_count": 0,
"answer_score": 493,
"answer_count": 52
}, {
"tag_name": "performance",
"question_score": 0,
"question_count": 0,
"answer_score": 490,
"answer_count": 38
}, {
"tag_name": "optimization",
"question_score": 0,
"question_count": 0,
"answer_score": 482,
"answer_count": 33
}, {
"tag_name": "sockets",
"question_score": 0,
"question_count": 0,
"answer_score": 455,
"answer_count": 161
}, {
"tag_name": "pthreads",
"question_score": 0,
"question_count": 0,
"answer_score": 433,
"answer_count": 151
}, {
"tag_name": "pointers",
"question_score": 19,
"question_count": 1,
"answer_score": 431,
"answer_count": 110
}, {
"tag_name": "gcc",
"question_score": 0,
"question_count": 0,
"answer_score": 414,
"answer_count": 88
}, {
"tag_name": "branch-prediction",
"question_score": 0,
"question_count": 0,
"answer_score": 383,
"answer_count": 1
}, {
"tag_name": "openssl",
"question_score": 0,
"question_count": 0,
"answer_score": 327,
"answer_count": 100
}, {
"tag_name": "encryption",
"question_score": 0,
"question_count": 0,
"answer_score": 322,
"answer_count": 110
}, {
"tag_name": "algorithm",
"question_score": 0,
"question_count": 0,
"answer_score": 283,
"answer_count": 35
}, {
"tag_name": "arrays",
"question_score": 0,
"question_count": 0,
"answer_score": 278,
"answer_count": 69
}, {
"tag_name": "networking",
"question_score": 6,
"question_count": 1,
"answer_score": 278,
"answer_count": 106
}, {
"tag_name": "unix",
"question_score": 2,
"question_count": 1,
"answer_score": 263,
"answer_count": 74
}, {
"tag_name": "multithreading",
"question_score": 0,
"question_count": 0,
"answer_score": 246,
"answer_count": 89
}, {
"tag_name": "tcp",
"question_score": 0,
"question_count": 0,
"answer_score": 191,
"answer_count": 94
}, {
"tag_name": "memory",
"question_score": 0,
"question_count": 0,
"answer_score": 182,
"answer_count": 41
}, {
"tag_name": "security",
"question_score": 0,
"question_count": 0,
"answer_score": 177,
"answer_count": 41
}, {
"tag_name": "memory-management",
"question_score": 0,
"question_count": 0,
"answer_score": 176,
"answer_count": 43
}, {
"tag_name": "posix",
"question_score": 0,
"question_count": 0,
"answer_score": 158,
"answer_count": 53
}, {
"tag_name": "assembly",
"question_score": 0,
"question_count": 0,
"answer_score": 155,
"answer_count": 41
}, {
"tag_name": "signals",
"question_score": 0,
"question_count": 0,
"answer_score": 152,
"answer_count": 37
}, {
"tag_name": "string",
"question_score": 0,
"question_count": 0,
"answer_score": 148,
"answer_count": 49
}, {
"tag_name": "cryptography",
"question_score": 0,
"question_count": 0,
"answer_score": 133,
"answer_count": 55
}, {
"tag_name": "php",
"question_score": 0,
"question_count": 0,
"answer_score": 125,
"answer_count": 31
}, {
"tag_name": "x86",
"question_score": 0,
"question_count": 0,
"answer_score": 123,
"answer_count": 19
}]
}
We want to perform different kind of searches:
location
(with wildcard e.g Han* for Hannover)location
and against the existence of a tag_name
that matches a given string (with wildcard e.g. Ang* for AngularJS)reputation
oranswer_score
of the matched tag (thats the difficulty right now)Are you familiar with such kind of queries?
@cburgdorf this is an easy lift for elasticsearch :) I think we can archive this goal realy fast
@kyjan I booked at found.no and started playing with it. Would love to pair with you on that. Are you in the office tomorrow or on Monday? I plan to work from home on Friday.
@cburgdorf i will join you in the office in a bunch of minutes since my prof is just reading the script for us ;)
Paired with @kyjan on this today and we came up with this epic query.
{
"sort": [{
"score": {
"order": "desc",
"mode": "sum",
"nested_path": "tags",
"nested_filter": {
"bool": {
"should": [{
"prefix": {
"tag_name": "node"
}
}, {
"prefix": {
"tag_name": "ang"
}
}]
}
}
}
}],
"query": {
"bool": {
"must": [{
"bool": {
"should": [{
"wildcard": {
"location": "han*"
}
}, {
"wildcard": {
"location": "ber*"
}
}],
"minimum_should_match": 1
}
}, {
"nested": {
"path": "tags",
"query": {
"bool": {
"should": [{
"prefix": {
"tags.tag_name": "ph"
}
}, {
"prefix": {
"tags.tag_name": "ang"
}
}]
}
}
}
}]
}
}
}
This is the mapping needed for that to work:
{
"user": {
"properties": {
"location": {
"type": "string"
},
"tags": {
"type": "nested",
"properties": {
"tag_name": {
"type": "string"
},
"score": {
"type": "integer"
}
}
}
}
}
}
I checked the stats yesterday. The postgres DB is roughly at 2 GB already. But that's not an issue since it is only restricted to row numbers and we still have plenty of room until we hit 10 millions of records :)
I roughly calculated the expected elasticsearch size and we would also hit 2 GB quite early (surprise, surprise). So for now, I guess we should keep syncing with postgres and then update the elasticsearch index from their.
sync in progress :)
all users synced! And the queries seem to run quite fast! So looking forward to this change!
We want to build more sophisticated search / sorting feature and I feel that this might be challenging with postgres.
This looks promising:
http://docs.fullscale.co/elasticjs/
The thing is, we can keep the current synchronization. This works rock solid and I'm not willing to invest time into yet another rewrite. We can just keep syncing the way we do it now and then just update ElasticSearch with our ready to use data. This should be straight forward. I'm sure @kyjan can help here :)