tensorflow / recommenders

TensorFlow Recommenders is a library for building recommender system models using TensorFlow.
Apache License 2.0
1.82k stars 273 forks source link

How to make predictions by retrieval models with multiple features? #616

Open zeroonesfas opened 1 year ago

zeroonesfas commented 1 year ago

class UserModel(tf.keras.Model):
  """Query tower: embeds the user id, both preferences and optional context.

  Depends on names defined elsewhere in the script: `unique_user_ids`,
  `unique_Preference1`, `unique_Preference2`, `user`, and, when the
  corresponding flag is set, `timestamp_buckets`/`timestamps` and
  `Distance_buckets`/`Distance`.

  Args:
    use_timestamps: when true, build and apply the "Time" feature layers.
    use_distance: when true, build and apply the "Distance" feature layers.
  """

  def __init__(self, use_timestamps, use_distance):
    super().__init__()

    self._use_timestamps = use_timestamps
    self._use_distance = use_distance

    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
    ])

    max_tokens = 10_000

    self.Preference1_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_Preference1, mask_token=None),
        tf.keras.layers.Embedding(len(unique_Preference1) + 1, 32),
    ])

    self.Preference1_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)

    self.Preference1_text_embedding = tf.keras.Sequential([
        self.Preference1_vectorizer,
        tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
        tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.Preference1_vectorizer.adapt(user.map(lambda x: x["Preference1"]))

    self.Preference2_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_Preference2, mask_token=None),
        tf.keras.layers.Embedding(len(unique_Preference2) + 1, 32),
    ])

    self.Preference2_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)

    self.Preference2_text_embedding = tf.keras.Sequential([
        self.Preference2_vectorizer,
        tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
        tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.Preference2_vectorizer.adapt(user.map(lambda x: x["Preference2"]))

    if use_timestamps:
      self.timestamp_embedding = tf.keras.Sequential([
          tf.keras.layers.Discretization(timestamp_buckets.tolist()),
          tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
      ])
      self.normalized_timestamp = tf.keras.layers.Normalization(axis=None)
      self.normalized_timestamp.adapt(timestamps)

    if use_distance:
      self.distance_embedding = tf.keras.Sequential([
          tf.keras.layers.Discretization(Distance_buckets.tolist()),
          tf.keras.layers.Embedding(len(Distance_buckets) + 1, 32),
      ])
      self.normalized_distance = tf.keras.layers.Normalization(axis=None)
      # Adapt the distance normalizer itself. Adapting `normalized_timestamp`
      # here would overwrite the timestamp statistics and leave
      # `normalized_distance` unadapted.
      self.normalized_distance.adapt(Distance)

  def call(self, inputs):
    """Concatenates every enabled feature embedding along axis 1.

    Args:
      inputs: mapping with keys "UserID", "Preference1", "Preference2", plus
        "Time" when timestamps are enabled and "Distance" when distance is
        enabled.

    Returns:
      The result of `tf.concat(..., axis=1)` over the enabled features.
    """
    features = [
        self.user_embedding(inputs["UserID"]),
        self.Preference1_embedding(inputs["Preference1"]),
        self.Preference1_text_embedding(inputs["Preference1"]),
        self.Preference2_embedding(inputs["Preference2"]),
        self.Preference2_text_embedding(inputs["Preference2"]),
    ]

    # Context layers only exist when their flag was set in __init__, so each
    # group is guarded by its own flag.
    if self._use_timestamps:
      features.append(self.timestamp_embedding(inputs["Time"]))
      features.append(
          tf.reshape(self.normalized_timestamp(inputs["Time"]), (-1, 1)))

    if self._use_distance:
      features.append(self.distance_embedding(inputs["Distance"]))
      features.append(
          tf.reshape(self.normalized_distance(inputs["Distance"]), (-1, 1)))

    return tf.concat(features, axis=1)

class LocationModel(tf.keras.Model):
  """Candidate tower: embeds a location's name, tag and description.

  Depends on names defined elsewhere in the script: `unique_location_ids`,
  `unique_locationTag`, `unique_Description` and the `location` dataset.
  """

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    self.location_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_location_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_location_ids) + 1, 32),
    ])

    self.locationTag_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_locationTag, mask_token=None),
        tf.keras.layers.Embedding(len(unique_locationTag) + 1, 32),
    ])

    self.locationTag_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)

    self.locationTag_text_embedding = tf.keras.Sequential([
        self.locationTag_vectorizer,
        tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
        tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.locationTag_vectorizer.adapt(
        location.map(lambda x: x["Tag of Location visited"]))

    self.Description_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_Description, mask_token=None),
        tf.keras.layers.Embedding(len(unique_Description) + 1, 32),
    ])

    self.Description_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)

    self.Description_text_embedding = tf.keras.Sequential([
        self.Description_vectorizer,
        tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
        tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.Description_vectorizer.adapt(
        location.map(lambda x: x["Description of Location visited"]))

  def call(self, inputs):
    """Concatenates the five location feature embeddings along axis 1."""
    return tf.concat([
        self.location_embedding(inputs['Name of Location']),
        self.locationTag_embedding(inputs['Tag of Location visited']),
        self.locationTag_text_embedding(inputs['Tag of Location visited']),
        self.Description_embedding(inputs['Description of Location visited']),
        self.Description_text_embedding(
            inputs['Description of Location visited']),
    ], axis=1)


class TravelModel(tfrs.models.Model):
  """Two-tower retrieval model pairing `UserModel` with `LocationModel`.

  Args:
    use_timestamps: forwarded to `UserModel`.
    use_distance: forwarded to `UserModel`.
  """

  def __init__(self, use_timestamps, use_distance):
    super().__init__()

    # Each tower ends in Dense(32) so that query and candidate outputs have
    # the same width regardless of how many features were concatenated.
    self.query_model = tf.keras.Sequential([
        UserModel(use_timestamps, use_distance),
        tf.keras.layers.Dense(32),
    ])

    self.candidate_model = tf.keras.Sequential([
        LocationModel(),
        tf.keras.layers.Dense(32),
    ])

    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=location.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    """Runs both towers on `features` and returns the retrieval task output.

    Args:
      features: mapping holding the user keys ("UserID", "Time", "Distance",
        "Preference1", "Preference2") and the location keys
        ("Name of Location", "Tag of Location visited",
        "Description of Location visited").
      training: accepted for the Keras/TFRS interface; not used here.
    """
    query_embeddings = self.query_model({
        "UserID": features["UserID"],
        "Time": features["Time"],
        "Distance": features["Distance"],
        "Preference1": features["Preference1"],
        "Preference2": features["Preference2"],
    })

    location_embeddings = self.candidate_model({
        "Name of Location": features["Name of Location"],
        "Tag of Location visited": features["Tag of Location visited"],
        "Description of Location visited":
            features["Description of Location visited"],
    })

    return self.task(query_embeddings, location_embeddings)

# Fix the seed and disable per-iteration reshuffling so that the take/skip
# split below selects the same examples on every pass over `shuffled`.
tf.random.set_seed(42)
shuffled = data.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# First 80% of the shuffled data for training, the remainder for testing.
train = shuffled.take(int(len(data) * 0.8))
test = shuffled.skip(int(len(data) * 0.8)).take(int(len(data) * 0.2) + 1)

cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

model = TravelModel(use_timestamps=True, use_distance=True)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(cached_train, epochs=30)

train_accuracy = model.evaluate(
    cached_train, return_dict=True
)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
    cached_test, return_dict=True
)["factorized_top_k/top_100_categorical_accuracy"]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

`

zeroonesfas commented 1 year ago

Anyone can help me? Thank you very much.

patrickorlando commented 1 year ago

Hi @zeroonesfas,

A tip for formatting your issue. Your code is hard to read. I'd suggest you include your code in a code block with syntax highlighting, using the following format.

```python
YOUR CODE HERE
```

I'd also suggest that when posting an issue, you create a minimal code example that can be run on its own. This will make it far more likely that someone will be able to help you.

Before going any further I'd encourage you to follow the official tutorials. This section here describes how to make predictions with a retrieval model.

Try this first and if you're still having trouble, try to be specific about the issue you are facing.

Thanks!

zeroonesfas commented 1 year ago

Thanks for your reply, @patrickorlando. Below is my retrieval model with multiple features (i.e., UserID, Preference1, LocationID and location tags). How can I make recommendations based on a new UserID and their Preference1?

class UserModel(tf.keras.Model):
  """Query tower: embeds the user id and the Preference1 category.

  Depends on names defined elsewhere in the script: `unique_user_ids`,
  `unique_Preference1` and the `user` dataset.
  """

  def __init__(self):
    super().__init__()

    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32)
    ])

    max_tokens = 10_000

    self.Preference1_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_Preference1, mask_token=None),
      tf.keras.layers.Embedding(len(unique_Preference1) + 1, 32)
    ])

    self.Preference1_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_tokens)

    # NOTE: this text embedding is built and its vectorizer adapted, but
    # `call` below does not include it in the output.
    self.Preference1_text_embedding = tf.keras.Sequential([
      self.Preference1_vectorizer,
      tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.Preference1_vectorizer.adapt(user.map(lambda x: x["Preference1"]))

  def call(self, inputs):
    """Concatenates the user-id and Preference1 embeddings along axis 1.

    Args:
      inputs: mapping with keys "UserID" and "Preference1".
    """
    # The list passed to tf.concat must be closed before the `axis` keyword;
    # the missing `]` made the whole class a SyntaxError.
    return tf.concat([
        self.user_embedding(inputs["UserID"]),
        self.Preference1_embedding(inputs["Preference1"]),
    ], axis=1)

class LocationModel(tf.keras.Model):
  """Candidate tower: embeds a location's name and its tag.

  Uses the script-level `unique_location_ids`, `unique_locationTag` and
  `location` names. A tag text embedding is also built and its vectorizer
  adapted, but `call` does not include it in the output.
  """

  def __init__(self):
    super().__init__()

    max_tokens = 10_000
    embedding_dim = 32

    # Categorical embedding of the location name.
    name_lookup = tf.keras.layers.StringLookup(
        vocabulary=unique_location_ids, mask_token=None)
    name_table = tf.keras.layers.Embedding(
        len(unique_location_ids) + 1, embedding_dim)
    self.location_embedding = tf.keras.Sequential([name_lookup, name_table])

    # Categorical embedding of the location tag.
    tag_lookup = tf.keras.layers.StringLookup(
        vocabulary=unique_locationTag, mask_token=None)
    tag_table = tf.keras.layers.Embedding(
        len(unique_locationTag) + 1, embedding_dim)
    self.locationTag_embedding = tf.keras.Sequential([tag_lookup, tag_table])

    # Token-level embedding of the tag text, averaged over tokens.
    self.locationTag_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)
    token_table = tf.keras.layers.Embedding(
        max_tokens, embedding_dim, mask_zero=True)
    token_pooling = tf.keras.layers.GlobalAveragePooling1D()
    self.locationTag_text_embedding = tf.keras.Sequential(
        [self.locationTag_vectorizer, token_table, token_pooling])

    tag_texts = location.map(lambda row: row["Tag of Location visited"])
    self.locationTag_vectorizer.adapt(tag_texts)

  def call(self, inputs):
    """Returns the name and tag embeddings concatenated along axis 1."""
    name_vectors = self.location_embedding(inputs['Name of Location'])
    tag_vectors = self.locationTag_embedding(inputs['Tag of Location visited'])
    return tf.concat([name_vectors, tag_vectors], axis=1)

class TravelModel(tfrs.models.Model):
  """Two-tower retrieval model pairing `UserModel` with `LocationModel`.

  Args:
    use_timestamps: accepted for compatibility with existing callers; the
      `UserModel` defined with this model takes no arguments, so the flag is
      ignored.
    use_distance: accepted for compatibility with existing callers; ignored
      for the same reason.
  """

  def __init__(self, use_timestamps=False, use_distance=False):
    super().__init__()

    # This UserModel's __init__ takes no arguments; passing the two flags
    # through raised a TypeError.
    self.query_model = tf.keras.Sequential([
      UserModel(),
      tf.keras.layers.Dense(32)
    ])

    self.candidate_model = tf.keras.Sequential([
      LocationModel(),
      tf.keras.layers.Dense(32)
    ])

    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=location.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    """Runs both towers on `features` and returns the retrieval task output.

    Args:
      features: mapping with keys "UserID", "Preference1",
        "Name of Location" and "Tag of Location visited".
      training: accepted for the Keras/TFRS interface; not used here.
    """
    query_embeddings = self.query_model({
        "UserID": features["UserID"],
        "Preference1": features["Preference1"],
    })

    location_embeddings = self.candidate_model({
        "Name of Location": features["Name of Location"],
        "Tag of Location visited": features["Tag of Location visited"],
    })

    return self.task(query_embeddings, location_embeddings)

`

zeroonesfas commented 1 year ago

When I make a prediction, the following error occurs: AttributeError: 'dict' object has no attribute 'shape'. Note: "Name of Location" is the LocationID.

# Keep only the two candidate features that are passed on below.
location = tf_location_dict.map(lambda x: {
"Name of Location": x["Name of Location"],
"Tag of Location visited": x["Tag of Location visited"],
})

index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
# NOTE(review): each zipped element is (batch of feature dicts, batch of
# candidate_model outputs), so the first tuple member is a dict. The poster
# reports "AttributeError: 'dict' object has no attribute 'shape'" with this
# setup.
index.index_from_dataset(tf.data.Dataset.zip((location.batch(100), location.batch(100).map(model.candidate_model))))
patrickorlando commented 1 year ago

Great @zeroonesfas! This is much easier.

It looks like the issue occurs when you're building the index.

If we look at the call signature below

https://github.com/tensorflow/recommenders/blob/main/tensorflow_recommenders/layers/factorized_top_k.py#L174-L195

The function expects a tuple of (candidate_id, candidate_embedding) and in your case, you are passing the full dictionary of features.

So I haven't run this code but the general idea is to ensure you match the expected data structure

# Keep only the candidate features that are passed to the candidate model.
location = tf_location_dict.map(
    lambda row: {
        feature: row[feature]
        for feature in ("Name of Location", "Tag of Location visited")
    }
)

index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
# Each dataset element is an (identifier, embedding) pair: the location name
# batch alongside the candidate model's output for that same batch.
index.index_from_dataset(
    location.batch(100).map(
        lambda batch: (batch["Name of Location"], model.candidate_model(batch))
    )
)

Hope this helps

zeroonesfas commented 1 year ago

It works! @patrickorlando Then, how can we make prediction for a new user in my case? Thanks.

# Query the index with raw user features; `index` was built with
# model.query_model, which is given this dict. The values are placeholders:
# replace them with the user id and preference to get recommendations for.
# NOTE: this rebinds `location`, which earlier held the candidate dataset.
_, location = index({
    "UserID": tf.constant(["new_user_id"]),
    "Preference1": tf.constant(["some preference"]),
})
patrickorlando commented 1 year ago

Since your model depends on the user_id and has no additional user features, you won't be able to make predictions for users who are not in your training set.

Cold-start recommendation is a big topic, and one you'll need to do some research on. It requires thinking about your users, dataset and system design. You might be able to recommend to new users if you have user demographics or self-reported interests, or if you employ a session-based recommender model. This may be handled by a single model, or you might build a different model as part of your broader recommender system architecture.

arthurangelici commented 1 year ago

@patrickorlando thank you for the answer. It helped me a lot!! index.index_from_dataset(movies.apply(tf.data.experimental.dense_to_ragged_batch(100)).map(lambda x: (x["movie_title"], model.candidate_model(x)))) Now I would like to generate the embedding from the CandidateModel. Is that possible? I also have two columns in my CandidateModel, like @zeroonesfas. One of them is a ragged tensor (model_genres).

I tried: movie_01 = model.candidate_model.predict({"movie_title": np.array(["Palookaville (1996)"]),"movie_genres":np.array([1,2])})

But this gives me the following error: AttributeError: 'str' object has no attribute 'shape'