How to use item_exclude

Hi!

Thanks for a really impressive package.

In my world there are two common scenarios when building recommendation systems. You either want to recommend products that a customer has never liked (or bought) from your whole catalogue, or you want to recommend products from a subset of the catalogue, e.g. products that are discounted. Most implementations of collaborative filtering focus on the first scenario. My question is how to use the items_exclude argument to tackle the second scenario. This is somewhat related to a previous issue.

For instance, say that there are 60 artists in the lastfm dataset whose albums are on sale, and these are the only artists we want to recommend.

Example code from: http://dsnotes.com/post/2017-06-28-matrix-factorization-for-recommender-systems-part-2/

set.seed(1)
library(data.table)

# Read the raw play counts; the column names are supplied directly to fread().
raw_data <- fread(
  "lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv",
  showProgress = FALSE,
  encoding = "UTF-8",
  quote = "",
  col.names = c("user_id", "artist_id", "artist_name", "number_plays")
)

# One row per user: uid is the group counter in key (user_id) order.
user_encoding <- raw_data[, .(uid = .GRP), keyby = user_id]

# One row per artist: iid is the group counter, together with the first
# artist_name seen for that artist_id.
item_encoding <- raw_data[
  ,
  .(iid = .GRP, artist_name = artist_name[1L]),
  keyby = artist_id
]

Here I'll sample 60 artists "on sale" and create a table of items to exclude from the predictions.

# Draw 60 artist names at random to play the role of the "on sale" subset.
on_sale <- sample(item_encoding$artist_name, 60)
# Keep the item_encoding rows of every artist whose name is NOT in that subset.
items_exclude <- item_encoding[!(artist_name %in% on_sale)]
on_sale
 [1] "the bridge"                          "snippet"                            
 [3] "v.o.s."                              "the ullulators"                     
 [5] "藤井フミヤ"                          "erika jo"                           
 [7] "gore"                                "amaral"                             
 [9] "ceili rain"                          "schwarze puppen"                    
[11] "dan wheeler"                         "yuki suzuki"                        
[13] "krymplings"                          "olivia ruiz"                        
[15] "edgewater"                           "karl johan"                         
[17] "pamela z"                            "global spirit"                      
[19] "damien youth"                        "fires of babylon"                   
[21] "comic relief"                        "emmanuel horvilleur"                
[23] "sandra stephens"                     "cyclopede"                          
[25] "Михаил Боярский"                     "the great eastern"                  
[27] "radwimps"                            "papa austin with the great peso"    
[29] "phasen"                              "mari menari"                        
[31] "Холодне Сонце"                       "laura story"                        
[33] "mugwart"                             "errand boy"                         
[35] "erlend krauser"                      "göran fristorp"                     
[37] "mousse t & emma lanford"             "dj vlad & dirty harry"              
[39] "denim"                               "thomas leer & robert rental"        
[41] "the underdog project vs the sunclub" "sense club"                         
[43] "mary kiani"                          "ladies night"                       
[45] "tresk"                               "the peddlers"                       
[47] "quatuor ysaÿe"                       "brandhärd"                          
[49] "bittor aiape"                        "prince francis"                     
[51] "alex klaasen & martine sandifort"    "peppermint petty"                   
[53] "dave ramsey"                         "müşfik kenter"                      
[55] "shima & shikou duo"                  "jimmy j & cru-l-t"                  
[57] "ankarali yasemin"                    "marian opania"                      
[59] "madita"                              "zoltar"

Below is some data manipulation to put the data into a sparse matrix.

library(Matrix)

# The artist name is kept in item_encoding, so drop it from the raw table.
raw_data[, artist_name := NULL]

# Swap the string identifiers for their integer uid / iid codes via two joins.
dt <- user_encoding[raw_data, .(artist_id, uid, number_plays), on = "user_id"]
dt <- item_encoding[dt, .(iid, uid, number_plays), on = "artist_id"]
rm(raw_data)

# Rows are indexed by uid, columns by iid, values are number_plays; the
# dimnames carry the original user ids and artist names.
X <- sparseMatrix(
  i = dt$uid,
  j = dt$iid,
  x = dt$number_plays,
  dimnames = list(user_encoding$user_id, item_encoding$artist_name)
)

# Hold out N_CV randomly chosen users (rows) for validation.
N_CV <- 1000L
cv_uid <- sample(nrow(user_encoding), N_CV)

X_train <- X[-cv_uid, ]
X_cv <- X[cv_uid, ]
rm(X)

Here we fit the model.

# Turn raw interaction values into confidence weights.
#
# x:     an object inheriting from "sparseMatrix"; only its @x slot (the
#        stored values) is transformed.
# alpha: multiplier applied to the stored values.
#
# Returns a copy of `x` whose stored values are 1 + alpha * (original value).
# Stops with an error when `x` does not inherit from "sparseMatrix".
make_confidence <- function(x, alpha) {
  stopifnot(inherits(x, "sparseMatrix"))
  # R copies on modify, so the caller's matrix is left untouched.
  x@x <- 1 + alpha * x@x
  x
}
library(rsparse)

# X_cv_history and X_cv_future are used further down but were never created in
# this snippet. Rebuild them by randomly assigning every observed interaction
# of a validation user either to "history" (used to infer the user's
# embedding) or to "future" (held out).
cv_triplets <- as(X_cv, "TsparseMatrix")
# Slots @i and @j of a TsparseMatrix are 0-based, hence index1 = FALSE below.
cv_triplets <- data.table(
  i = cv_triplets@i,
  j = cv_triplets@j,
  x = cv_triplets@x
)
cv_triplets[, history := sample(c(TRUE, FALSE), .N, replace = TRUE), by = i]

# Rebuild a sparse matrix with the same shape and dimnames as X_cv.
triplets_to_sparse <- function(triplets) {
  sparseMatrix(
    i = triplets$i,
    j = triplets$j,
    x = triplets$x,
    dims = dim(X_cv),
    dimnames = dimnames(X_cv),
    index1 = FALSE
  )
}
X_cv_history <- triplets_to_sparse(cv_triplets[(history)])
X_cv_future <- triplets_to_sparse(cv_triplets[!(history)])
rm(cv_triplets)

# `x_train` (lower case) is not defined anywhere; the training matrix is
# `X_train`.
model = WRMF$new(x_train = X_train, x_cv = X_cv, rank = 8, feedback = "implicit")

# Seed again right before fitting so the model fit is reproducible.
set.seed(1)
alpha = 0.01
X_train_conf = make_confidence(X_train, alpha)
X_cv_history_conf = make_confidence(X_cv_history, alpha)

# Fit on the training users, then embed the validation users from their
# history only.
user_embeddings = model$fit_transform(X_train_conf, n_iter = 10L, n_threads = 8)
new_user_embeddings = model$transform(X_cv_history_conf)

Now, I want to recommend only the artists that are on sale, so I pass the excluded artists to the items_exclude argument.

# First validation user as a one-row matrix (drop = FALSE keeps the matrix shape).
new_user_1 = X_cv[1:1, , drop = FALSE]
# Ask for 60 recommendations, passing the artist *names* of all non-sale items
# as items_exclude.
new_user_predictions = model$predict(new_user_1, k = 60, items_exclude = items_exclude$artist_name)

# Show the first few entries of the "ids" attribute of the prediction result.
head(data.frame(segmentid = t(attr(new_user_predictions, "ids"))))
  e9dc15dfabe0bdac615143623e1fe83ba4e2daa5
1                                    björk
2                   einstürzende neubauten
3                                     isis
4                          frédéric chopin
5                                sigur rós
6                                     동방신기

However, these recommendations are not among the artists that are on sale.

I suppose this would be clearer to me with a vignette, which I can see is on its way. In the meantime, how should one use the items_exclude argument?

Furthermore, say we want to return as many recommendations as possible here, i.e. set k = 60. Would that also work for multiple users at once?

So I figured this out. It is pretty straightforward. Since we use the encoded item variable iid from the item_encoding data.frame when modelling, the items_exclude vector needs to contain iid values.

For example I wanted only these artists to be recommended:

# The 20 artists that are "on sale".
on_sale <- c(
  "the killers", "red hot chili peppers", "bloc party", "nofx",
  "the smiths", "dean martin", "madonna", "belle and sebastian",
  "britney spears", "spiritualized", "coldplay", "u2",
  "in flames", "the smashing pumpkins", "radiohead", "morrissey",
  "rush", "kylie minogue", "kate bush", "lady gaga"
)

I create the items_exclude from item_encoding:

# iid values of every artist that is not on sale, selected in a single
# data.table query.
items_exclude <- item_encoding[!(artist_name %in% on_sale), iid]

Then we can predict artists for a user and save it to a data.frame:

 # Predict for the second validation user only, excluding the iids collected in
 # items_exclude and asking for k = 20 items.
 new_user_predictions <- model$predict(X_cv_future[2:2, , drop = FALSE],
                                       not_recommend = NULL,
                                       items_exclude = items_exclude,
                                       k = 20)

# The first dimnames element of the result supplies the user id(s).
user_id <- attr(new_user_predictions, "dimnames")
# The "scores" attribute is turned into a data.frame, tagged with the user id
# and reshaped to long form (one row per score).
# NOTE(review): melt() is called on a data.frame here and on a matrix below;
# presumably reshape2 was attached in the author's session -- TODO confirm,
# since no library(reshape2) call is visible in this snippet.
scores <- as.data.frame(attr(new_user_predictions, "scores"))
scores$user_id<- user_id[[1]]
scores <- as.data.table(melt(scores))[, .(user_id, score = value)]

# Same reshape for the "ids" attribute, giving one row per recommended item;
# Var1 is taken as the user id and value as the artist name.
artist <- as.data.table(melt(attr(new_user_predictions, "ids")))[
  , .(user_id = Var1, artist_name = value)]
artist <- artist[, artist_name := as.character(artist_name)]

# Bind the two long tables column-wise (this relies on both being in the same
# row order), then sort by user and descending score.
export <- cbind(scores, artist)[
  order(user_id, -score), .(user_id, artist_name, score)]

export 
                                     user_id            artist_name    score
 1: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc            bloc party 1.0156563
 2: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc             radiohead 0.9866832
 3: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc   belle and sebastian 0.9815670
 4: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc            the smiths 0.9721361
 5: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc           the killers 0.9537831
 6: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc              coldplay 0.9488346
 7: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc the smashing pumpkins 0.9475210
 8: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc             morrissey 0.9432581
 9: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc red hot chili peppers 0.8897018
10: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc                    u2 0.8870862
11: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc               madonna 0.7219695
12: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc                  nofx 0.7113644
13: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc        britney spears 0.6474762
14: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc         spiritualized 0.6349998
15: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc             kate bush 0.6251492
16: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc             lady gaga 0.5709979
17: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc         kylie minogue 0.5597064
18: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc                  rush 0.4292514
19: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc             in flames 0.4156308
20: 4b5ffa7d5485294b81d3c965efaa613f6925c6cc           dean martin 0.2178573

dselivanov / rsparse

How to use item_exclude #23