Open eddiezhou opened 5 years ago
Hi Eddie, thanks for filing the issue, and sorry for the late reply. I have had a busy schedule recently, but I will look into the problem as soon as possible.
@eddiezhou I had the same problem using this project, but I found out why it happens — at least for the dataset I'm using. When converting the data to the hashmap (dictionary), any duplicate names are silently removed because the name is used as the dictionary key. So when you later try to look up an ID in the hashmap, you won't get the result you expect.
Here is my code; I wrote it to recommend beers based on each beer's taste profile. In the `_prep_data` method you'll find how I remove the duplicates for my dataset. I don't know whether this is exactly the issue you're hitting with the movie dataset, but it fixed it for me — I now get "perfect" recommendations. Hope this helps!
import os
import time
import gc
import argparse
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz
class KnnRecommender:
    """
    Item-based collaborative-filtering recommender for beers.

    Uses sklearn's NearestNeighbors over per-beer tasting profiles
    (malty / sweet / sour / hoppy / bitter / fruity scores) and fuzzy
    string matching to resolve the user's input name.
    """

    def __init__(self, path_beers, path_tastingprofiles):
        """
        Parameters
        ----------
        path_beers: str, path to the beers CSV (columns: beerID, name)
        path_tastingprofiles: str, path to the tasting-profile CSV
            (columns: beerID plus the six taste-score columns)
        """
        self.path_beers = path_beers
        self.path_tastingprofiles = path_tastingprofiles
        self.model = NearestNeighbors()

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        """
        Set model params for sklearn.neighbors.NearestNeighbors.

        Parameters
        ----------
        n_neighbors: int, optional (default = 5)
        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        metric: string or callable, default 'minkowski', or one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
        n_jobs: int or None, optional (default=None)
        """
        # joblib needs a writable temp folder when fitting in parallel
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(**{
            'n_neighbors': n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs': n_jobs})

    def _prep_data(self):
        """
        Prepare data for the recommender.

        Returns
        -------
        mat_tastingprofile_features: scipy.sparse.csr_matrix,
            one row per beer, one column per taste dimension
        hashmap: dict, beer name -> row index in the matrix
        """
        df_beers = pd.read_csv(
            self.path_beers,
            usecols=['beerID', 'name'],
            dtype={'beerID': 'int32', 'name': 'str'})
        df_tastingprofiles = pd.read_csv(
            self.path_tastingprofiles,
            usecols=['beerID', 'malty', 'sweet', 'sour', 'hoppy',
                     'bitter', 'fruity'],
            dtype={'beerID': 'int32', 'malty': 'float32',
                   'sweet': 'float32', 'sour': 'float32',
                   'hoppy': 'float32', 'bitter': 'float32',
                   'fruity': 'float32'})
        # Duplicate names must be dropped BEFORE building the name->index
        # hashmap: dict keys are unique, so duplicate names would silently
        # collapse and the surviving entry could point at the wrong row.
        df_beers_noduplicates = df_beers.drop_duplicates(
            subset='name', keep='first')
        # Inner merge also filters out beers with no tasting profile.
        df_beers_merged = pd.merge(
            df_tastingprofiles, df_beers_noduplicates, on='beerID')
        df_beers = df_beers_merged.drop(
            ['malty', 'sweet', 'sour', 'hoppy', 'bitter', 'fruity'], axis=1)
        df_tastingprofiles = df_beers_merged.drop(['name'], axis=1)
        df_tastingprofile_features = df_tastingprofiles.set_index('beerID')
        # Map beer name -> row index, aligned with the matrix row order.
        hashmap = {
            beer: i for i, beer in
            enumerate(df_beers.set_index('beerID')
                      .loc[df_tastingprofile_features.index]['name'])
        }
        # Convert tasting-profile features to a scipy sparse matrix.
        mat_tastingprofile_features = csr_matrix(
            df_tastingprofile_features.values)
        print("Matrix Shape")
        print(mat_tastingprofile_features.shape)
        # Free the intermediate frames before the model is fitted.
        del df_beers, df_beers_merged
        del df_tastingprofiles, df_tastingprofile_features
        return mat_tastingprofile_features, hashmap

    def _fuzzy_matching(self, hashmap, fav_beer):
        """
        Return the index of the closest name match via fuzzy ratio.

        Parameters
        ----------
        hashmap: dict, map beer name to row index of the beer in data
        fav_beer: str, name of user input beer

        Returns
        -------
        int row index of the closest match, or None when no name scores
        a fuzzy ratio of at least 60
        """
        match_tuple = []
        for name, idx in hashmap.items():
            ratio = fuzz.ratio(name.lower(), fav_beer.lower())
            if ratio >= 60:
                match_tuple.append((name, idx, ratio))
        if not match_tuple:
            print('Oops! No match is found')
            # Explicit None so callers can detect the no-match case
            # instead of crashing on an implicit None index.
            return None
        # Best (highest-ratio) match first.
        match_tuple.sort(key=lambda x: x[2], reverse=True)
        print('Found possible matches in our database: '
              '{0}\n'.format([x[0] for x in match_tuple]))
        return match_tuple[0][1]

    def _inference(self, model, data, hashmap,
                   fav_beer, n_recommendations):
        """
        Return top n similar beer recommendations for the user's input beer.

        Parameters
        ----------
        model: sklearn NearestNeighbors model
        data: scipy sparse beer-tastingprofile matrix
        hashmap: dict, map beer name to row index of the beer in data
        fav_beer: str, name of user input beer
        n_recommendations: int, top n recommendations

        Returns
        -------
        list of (row index, distance) tuples, farthest first; empty list
        when no fuzzy match was found
        """
        model.fit(data)
        print('You have input beer:', fav_beer)
        idx = self._fuzzy_matching(hashmap, fav_beer)
        if idx is None:
            # No catalogue match -- nothing to recommend (previously this
            # fell through and crashed on data[None]).
            return []
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        # Ask for one extra neighbour: the nearest hit is the query itself.
        distances, indices = model.kneighbors(
            data[idx],
            n_neighbors=n_recommendations + 1)
        # Pair indices with distances, sort ascending by distance, then
        # slice [:0:-1] to drop the self-match and return farthest first.
        raw_recommends = sorted(
            zip(indices.squeeze().tolist(), distances.squeeze().tolist()),
            key=lambda x: x[1]
        )[:0:-1]
        print('It took my system {:.2f}s to make inference \n'
              .format(time.time() - t0))
        # (row index, distance) pairs
        return raw_recommends

    def make_recommendations(self, fav_beer, n_recommendations):
        """
        Make and print top n beer recommendations.

        Parameters
        ----------
        fav_beer: str, name of user input beer
        n_recommendations: int, top n recommendations
        """
        mat_tastingprofile_features, hashmap = self._prep_data()
        raw_recommends = self._inference(
            self.model, mat_tastingprofile_features, hashmap,
            fav_beer, n_recommendations)
        if not raw_recommends:
            # No match was found; _fuzzy_matching already told the user.
            return
        # Invert the name->index map so row indices print as names.
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(fav_beer))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}, with distance of {2}'.format(
                i + 1, reverse_hashmap[idx], dist))
Hi @TijmenElseviers, thank you so much! This explains the problem well.
Hi Kevin, thanks for the nice code.
But I found something really weird. When I search for recommendations for 'Day After Tomorrow', it recommends a movie with movie ID 8069, which is not even in the training set movie_user_mat_sparse. This then raises an error when looking up the value for key 8069 in reverse_mapper, which does not contain an 8069 key.