The following Jaccard similarity-check query is intended to find pairs of
records whose unordered-list fields (referred-topics) are identical, by using a
threshold of 1.0f, but the returned results are not correct.
------------------------------
// Start from a clean slate: drop the "test" dataverse if it already exists,
// recreate it, and make it the current dataverse for the statements that follow.
drop dataverse test if exists;
create dataverse test;
use dataverse test;
// Closed record type describing a Twitter user; it is embedded in each tweet
// through TweetMessageType's "user" field.
create type TwitterUserType as closed {
screen-name: string,
lang: string,
friends-count: int32,
statuses-count: int32,
name: string,
followers-count: int32
}
// Closed record type for the tweets stored in the TweetMessages dataset.
create type TweetMessageType as closed {
tweetid: int64,
user: TwitterUserType,
sender-location: point,
send-time: datetime,
// Unordered list ({{ }}) of strings; this is the field the reproduction
// query compares with similarity-jaccard-check.
referred-topics: {{ string }},
// Ordered list ([ ]) of strings.
interests: [string],
message-text: string,
countA: int32,
countB: int32
}
// Dataset of TweetMessageType records, keyed on tweetid.
// NOTE(review): no load/insert statement is shown in this report; the data
// presumably comes from the attachment — confirm before re-running.
create dataset TweetMessages(TweetMessageType)
primary key tweetid;
use dataverse test;
// Self-join: pair every tweet $t1 with every tweet $t2 from TweetMessages.
for $t1 in dataset('TweetMessages')
for $t2 in dataset('TweetMessages')
// Jaccard check on the two referred-topics lists with threshold 1.0f, i.e.
// only pairs whose topic sets are identical are expected to pass.
let $sim := similarity-jaccard-check($t1.referred-topics, $t2.referred-topics,
1.0f)
// $sim[0] is the first item of the check result, used here as the pass/fail
// flag (presumably the boolean "similarity >= threshold" — confirm against
// the AsterixDB function docs). Self-pairs are excluded, and $t1 is limited
// to tweetids below 10.
where $sim[0] and $t1.tweetid != $t2.tweetid and $t1.tweetid < int64("10")
// Emit the two message texts of each matching pair.
return { "msg1":$t1.message-text, "msg2":$t2.message-text}
Actual result (incorrect):
{ "msg1": " dislike samsung the speed is OMG", "msg2": " hate samsung the speed
is horrible:(" }
{ "msg1": " love sprint its voice-clarity is amazing", "msg2": " can't stand
sprint the voice-clarity is bad" }
{ "msg1": " hate samsung the 3G is horrible", "msg2": " like samsung the 3G is
mind-blowing:)" }
{ "msg1": " hate motorola its 3G is OMG:(", "msg2": " can't stand motorola the
3G is OMG:(" }
{ "msg1": " hate sprint the wireless is terrible:(", "msg2": " like sprint its
wireless is mind-blowing:)" }
{ "msg1": " love t-mobile the speed is mind-blowing:)", "msg2": " love t-mobile
the speed is mind-blowing:)" }
{ "msg1": " hate iphone the shortcut-menu is bad", "msg2": " love iphone the
shortcut-menu is mind-blowing" }
{ "msg1": " dislike samsung the speed is OMG", "msg2": " hate samsung its speed
is horrible" }
{ "msg1": " like motorola its voice-command is amazing", "msg2": " hate
motorola its voice-command is OMG" }
{ "msg1": " love sprint its voice-clarity is amazing", "msg2": " like sprint
the voice-clarity is good:)" }
{ "msg1": " hate samsung the 3G is horrible", "msg2": " dislike samsung its 3G
is bad:(" }
{ "msg1": " dislike samsung the speed is OMG", "msg2": " like samsung the speed
is mind-blowing" }
{ "msg1": " love sprint its voice-clarity is amazing", "msg2": " can't stand
sprint the voice-clarity is terrible" }
{ "msg1": " hate iphone the shortcut-menu is bad", "msg2": " like iphone the
shortcut-menu is mind-blowing" }
{ "msg1": " hate iphone the shortcut-menu is bad", "msg2": " like iphone its
shortcut-menu is amazing:)" }
{ "msg1": " hate verizon the 3G is OMG:(", "msg2": " love verizon its 3G is
mind-blowing:)" }
{ "msg1": " hate samsung the 3G is horrible", "msg2": " dislike samsung its 3G
is bad" }
{ "msg1": " hate motorola its 3G is OMG:(", "msg2": " dislike motorola its 3G
is horrible:(" }
{ "msg1": " hate sprint the wireless is terrible:(", "msg2": " dislike sprint
the wireless is OMG:(" }
{ "msg1": " love t-mobile the speed is mind-blowing:)", "msg2": " love t-mobile
its speed is good:)" }
{ "msg1": " hate iphone the shortcut-menu is bad", "msg2": " like iphone its
shortcut-menu is good" }
{ "msg1": " hate motorola its 3G is OMG:(", "msg2": " hate motorola the 3G is
OMG:(" }
{ "msg1": " hate motorola its 3G is OMG:(", "msg2": " like motorola its 3G is
good:)" }
{ "msg1": " dislike samsung the speed is OMG", "msg2": " can't stand samsung
the speed is terrible:(" }
Original issue reported on code.google.com by kiss...@gmail.com on 18 May 2014 at 2:58
Original issue reported on code.google.com by
kiss...@gmail.com
on 18 May 2014 at 2:58

Attachments: