lyst / lightfm

A Python implementation of LightFM, a hybrid recommendation algorithm.

Temporal splitting #663

Open EdwardALockhart opened 1 year ago

EdwardALockhart commented 1 year ago

For those asking questions about splitting data: here is what I believe is a suitable method for splitting time-sorted interactions, held in df as user-item interactions with an implicit strength column plus item and user metadata. This is the most rigorous way to test the algorithm and tune hyperparameters, since it reflects the real challenge of using data from the past to predict the future.

# Split by time (df must already be sorted chronologically)
import numpy as np
import itertools
from sklearn.model_selection import TimeSeriesSplit
# (numpy and itertools are used by the hyperparameter search further down)

num_splits = 4

def tail_split(df, size):
    # Hold out the final `size` fraction of rows as the test set
    test_size = int(len(df) * size)
    train = df.iloc[:-test_size].copy()
    test = df.iloc[-test_size:].copy()
    return train, test

train_all, test = tail_split(df, 0.1)
# Expanding-window cross-validation over the training set, so each
# validation fold is strictly later in time than its training fold
cv_splits = TimeSeriesSplit(n_splits = num_splits).split(train_all)
cv_data = [(train_all.iloc[train_idx], train_all.iloc[val_idx]) for train_idx, val_idx in cv_splits]
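
As a quick sanity check (a sketch, assuming df is already sorted by time), each fold's validation rows should sit strictly after its training rows:

for fold, (train_part, val_part) in enumerate(cv_data):
    # TimeSeriesSplit gives expanding training windows; validation always comes later in time
    print(f"fold {fold}: {len(train_part)} train rows, then {len(val_part)} validation rows")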

# Build knowledge of all user and item possibilities
# Item features: one metadata row per item, as (item id, [feature values])
# (the ids already come from df, so no extra filtering is needed)
item_features = df[['item', 'Category', 'Division']].drop_duplicates().astype(str)
item_features.set_index('item', inplace = True)
item_feature_names = set()
for column in item_features.columns:
    item_feature_names.update(item_features[column])
item_features = list(zip(item_features.index, item_features.values.tolist()))
# User features: one metadata row per user, as (user id, [feature values])
user_features = df[['user', 'Sector', 'SubSector']].drop_duplicates().astype(str)
user_features.set_index('user', inplace = True)
user_feature_names = set()
for column in user_features.columns:
    user_feature_names.update(user_features[column])
user_features = list(zip(user_features.index, user_features.values.tolist()))
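
Each element is now an (id, [feature values]) pair, the format Dataset.build_user_features and build_item_features accept; a quick look at one entry (the printed values are of course data-dependent):

example_id, example_values = item_features[0]
print(example_id, example_values)   # e.g. ('SKU123', ['Footwear', 'Menswear']) - hypothetical values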

# Create numeric labels for users, items and features
from lightfm.data import Dataset
dataset = Dataset()
dataset.fit(users = df['user'],
            items = df['item'],
            user_features = list(user_feature_names),
            item_features = list(item_feature_names))
user_features_matrix = dataset.build_user_features(user_features)
item_features_matrix = dataset.build_item_features(item_features)
# Get mappings from raw ids to internal integer indices
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
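
Each of these maps is a plain dict from raw ids to LightFM's internal integer indices (which predict and the evaluation functions operate on); a minimal sketch of translating in both directions, where 'some_user' is a hypothetical raw id:

internal_uid = user_id_map['some_user']   # raw id -> internal index ('some_user' is hypothetical)
inverse_item_map = {internal: raw for raw, internal in item_id_map.items()}
raw_item = inverse_item_map[0]            # internal index -> raw item id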

# Build interaction matrices for each split (build_interactions also returns
# a sample-weight matrix built from the strength values)
final_cv_data = []
for train, val in cv_data:
    train_matrix, _ = dataset.build_interactions(list(zip(train['user'], train['item'], train['strength'])))
    val_matrix, _ = dataset.build_interactions(list(zip(val['user'], val['item'], val['strength'])))
    final_cv_data.append((train_matrix, val_matrix))

train_all, train_all_weights = dataset.build_interactions(list(zip(train_all['user'], train_all['item'], train_all['strength'])))
test, _ = dataset.build_interactions(list(zip(test['user'], test['item'], test['strength'])))
all_data, all_weights = dataset.build_interactions(list(zip(df['user'], df['item'], df['strength'])))
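
One thing to note: build_interactions returns binary interaction matrices plus a weight matrix built from strength, and the weights are discarded above. If the implicit strength should influence training, the weight matrix can be passed to fit as sample_weight (a sketch; as far as I know sample_weight is supported for every loss except warp-kos):

from lightfm import LightFM

# Sketch: let interaction strength act as a per-interaction training weight
weighted_model = LightFM(loss = "warp")
weighted_model.fit(train_all,
                   sample_weight = train_all_weights,
                   user_features = user_features_matrix,
                   item_features = item_features_matrix,
                   epochs = 10)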

k = 5 # Num recommendations

# Fit the training set and tune on the validation set
from lightfm import LightFM
from lightfm.evaluation import auc_score
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
def sample_hyperparameters():
    # Yield an endless stream of random hyperparameter combinations
    while True:
        yield {"no_components": np.random.randint(10, 100),
               "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
               "loss": np.random.choice(["warp", "bpr"]),
               "learning_rate": np.random.exponential(0.05),
               "item_alpha": np.random.exponential(1e-8),
               "user_alpha": np.random.exponential(1e-8),
               "max_sampled": np.random.randint(5, 15),
               "num_epochs": np.random.randint(5, 150)}
def random_search(cv_data, user_features_matrix, item_features_matrix, k, num_samples, num_threads = 1):
    # Score num_samples random configurations by mean recall@k across the CV folds
    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")
        model = LightFM(**hyperparams)
        scores = []
        for train, val in cv_data:
            # fit (unlike fit_partial) resets the model, so each fold trains from scratch
            model.fit(train,
                      user_features = user_features_matrix,
                      item_features = item_features_matrix,
                      epochs = num_epochs,
                      num_threads = num_threads,
                      verbose = True)
            # Passing train_interactions excludes already-seen pairs from the ranking
            recall = recall_at_k(model, val,
                                 train_interactions = train,
                                 user_features = user_features_matrix,
                                 item_features = item_features_matrix,
                                 k = k,
                                 num_threads = num_threads).mean()
            scores.append(recall)
        hyperparams["num_epochs"] = num_epochs
        yield (sum(scores) / len(scores), hyperparams)
(score, hyperparams) = max(random_search(final_cv_data, user_features_matrix, item_features_matrix, k, num_samples = 20), key = lambda x: x[0])

# Refit on the entire training set with the winning hyperparameters
num_epochs = hyperparams.pop("num_epochs")
model = LightFM(**hyperparams)
model.fit(train_all,
          user_features = user_features_matrix,
          item_features = item_features_matrix,
          epochs = num_epochs,
          verbose = True)
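
The precision_at_k and auc_score imports above otherwise go unused; here is a sketch of scoring the refitted model on the held-out test period, passing train_all as train_interactions so already-seen pairs are excluded from the rankings:

test_recall = recall_at_k(model, test,
                          train_interactions = train_all,
                          user_features = user_features_matrix,
                          item_features = item_features_matrix,
                          k = k).mean()
test_precision = precision_at_k(model, test,
                                train_interactions = train_all,
                                user_features = user_features_matrix,
                                item_features = item_features_matrix,
                                k = k).mean()
test_auc = auc_score(model, test,
                     train_interactions = train_all,
                     user_features = user_features_matrix,
                     item_features = item_features_matrix).mean()
print(f"test recall@{k}: {test_recall:.3f}, precision@{k}: {test_precision:.3f}, AUC: {test_auc:.3f}")

And a sketch of producing top-k recommendations for one known user via model.predict, using the id maps from dataset.mapping() ('some_user' is again a hypothetical raw id):

user_index = user_id_map['some_user']
item_indices = np.arange(len(item_id_map))
scores = model.predict(user_index, item_indices,
                       user_features = user_features_matrix,
                       item_features = item_features_matrix)
inverse_item_map = {internal: raw for raw, internal in item_id_map.items()}
recommendations = [inverse_item_map[i] for i in np.argsort(-scores)[:k]]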
donatoaz commented 1 week ago

At the end there, shouldn't you be fitting on all_data? It seems you are not making use of the test data for the final fit, which is a waste...