awarebayes / RecNN

Reinforced Recommendation toolkit built around PyTorch 1.7
Apache License 2.0

getting error: Expected sequence or array-like, got <class 'NoneType'> #18

Closed lauracrln closed 3 years ago

lauracrln commented 3 years ago

Hello, I'm using my own dataset.

I keep getting this error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-153-65892292bdab> in <module>()
     10 )
     11 
---> 12 env = recnn.data.env.FrameEnv(dirs, frame_size, batch_size,prepare_dataset= prepare_my_dataset)

4 frames
/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py in _num_samples(x)
    189             x = np.asarray(x)
    190         else:
--> 191             raise TypeError(message)
    192 
    193     if hasattr(x, 'shape') and x.shape is not None:

TypeError: Expected sequence or array-like, got <class 'NoneType'>

Every time I run:

env = recnn.data.env.FrameEnv(dirs, frame_size, batch_size, prepare_dataset=prepare_my_dataset)

This is my prepare_dataset function:

import numpy as np
import datetime
import random
import time


def string_time_to_unix(s):
    return int(time.mktime(datetime.datetime.strptime(s, "%H:%M:%S").timetuple()))


def prepare_my_dataset(args_mut, kwargs):
    frame_size = kwargs.get('frame_size')
    key_to_id = args_mut.base.key_to_id
    df = args_mut.df

    # rescale ratings so 2.5 maps to 0, convert timestamps to unix time,
    # and remap vendor ids through key_to_id
    df['vendor_rating'] = df['vendor_rating'].apply(lambda i: 2 * (i - 2.5))
    df['timestamp'] = df['timestamp'].apply(string_time_to_unix)
    df['vendor_id'] = df['vendor_id'].apply(key_to_id.get)

    # keep only customers with more than frame_size interactions
    customer = df[['customer_id', 'vendor_id']].groupby(['customer_id']).size()
    customer = customer[customer > frame_size].sort_values(ascending=False).index

    # group each customer's interactions, ordered by timestamp
    ratings = df.sort_values(by='timestamp').set_index('customer_id').drop('timestamp', axis=1).groupby('customer_id')

    cust_dict = {}

    def app(x):
        customer_id = x.index[0]
        cust_dict[int(customer_id)] = {}
        cust_dict[int(customer_id)]['items'] = x['vendor_id'].values
        cust_dict[int(customer_id)]['ratings'] = x['vendor_rating'].values

    ratings.apply(app)

    args_mut.cust_dict = cust_dict
    args_mut.customer = customer

    return args_mut, kwargs

This is what my customer index looks like:

Int64Index([199, 62, 72, 71, 70, 69, 68, 67, 66, 65, ... 135, 134, 133, 132, 131, 130, 129, 128, 127, 0], dtype='int64', name='customer_id', length=200)

And this is a small part of my cust_dict:

`{0: {'items': array([221, 225, 237, 250, 259, 265, 271, 274, 288, 289, 294, 295, 298, 299, 300, 304, 356, 386, 391, 398, 401, 419, 459, 537, 547, 573, 575, 216, 577, 207, 201, 90, 92, 104, 105, 106, 110, 113, 115, 134, 145, 148, 149, 154, 157, 159, 160, 161, 176, 180, 188, 189, 191, 192, 193, 195, 197, 199, 203, 86, 85, 83, 84, 4, 13, 20, 23, 28, 33, 43, 44, 55, 66, 67, 75, 76, 78, 79, 81, 82]), 'ratings': array([3.4, 3.4, 4.2, 4. , 3.6, 3.6, 4. , 2.4, 4.2, 4. , 3.8, 4.4, 4.4, 3.4, 3.8, 3. , 3.4, 4. , 3.4, 3.4, 4. , 3.4, 3.4, 3.8, 3.8, 4.2, 3.2, 4.4, 4. , 3.2, 3. , 3.8, 4.2, 4. , 4. , 4. , 4.2, 4.4, 4.6,

  1. , 1.4, 3.2, 3.4, 4. , 3.6, 4. , 3.6, 3.4, 3.6, 2.6, 4.2, 3.6,
  2. , 3.6, 3.6, 3.4, 3.8, 4. , 3. , 4. , 4.2, 3.4, 3.6, 3.8, 4.4,
  3. , 4. , 3.8, 4.2, 3.6, 3.6, 4. , 3. , 3.6, 4.2, 4.2, 3.8, 4.4, 2.6, 3.8])}, 1: {'items': array([115, 134, 145, 148, 149, 154, 157, 159, 160, 161, 176, 180, 188, 189, 191, 192, 193, 195, 197, 199, 201, 203, 207, 216, 221, 225, 237, 113, 250, 110, 105, 4, 13, 20, 23, 28, 33, 43, 44, 55, 66, 67, 75, 76, 78, 79, 81, 82, 83, 84, 85, 86, 90, 92, 104, 106, 259, 265, 271, 274, 288, 289, 294, 295, 298, 299, 300, 304, 356, 386, 391, 398, 401, 459, 537, 547, 573, 575, 577, 419]), 'ratings': array([4.6, 4. , 1.4, 3.2, 3.4, 4. , 3.6, 4. , 3.6, 3.4, 3.6, 2.6, 4.2, 3.6, 4. , 3.6, 3.6, 3.4, 3.8, 4. , 3. , 3. , 3.2, 4.4, 3.4, 3.4, 4.2, 4.4, 4. , 4.2, 4. , 3.8, 4.4, 4. , 4. , 3.8, 4.2, 3.6, 3.6,
  4. , 3. , 3.6, 4.2, 4.2, 3.8, 4.4, 2.6, 3.8, 3.4, 3.6, 4.2, 4. , 3.8, 4.2, 4. , 4. , 3.6, 3.6, 4. , 2.4, 4.2, 4. , 3.8, 4.4, 4.4, 3.4, 3.8, 3. , 3.4, 4. , 3.4, 3.4, 4. , 3.4, 3.8, 3.8, 4.2, 3.2,
  5. , 3.4])}`

Can you help me?

awarebayes commented 3 years ago

Can you please paste the full call stack? I can't see where the error is happening.

lauracrln commented 3 years ago

Do you mean like this?

fields = ['customer_id', 'vendor_id', 'vendor_rating']
ratings = pd.read_csv(root_path + "train_200_800_final.csv", usecols=fields)

# add a constant placeholder timestamp
ratings['timestamp'] = '00:00:00'

from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
ratings['customer_id'] = label_encoder.fit_transform(ratings['customer_id'])

df = pd.DataFrame(data=ratings)
df.to_csv('ratings_rest.csv', index=False)

restaurant_data = pd.read_csv(root_path + "train_200_800_final.csv")
restaurant_data = pd.DataFrame(data=restaurant_data)

from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
restaurant_data['customer_id'] = label_encoder.fit_transform(restaurant_data['customer_id'])

import torch

tensor_data = torch.tensor(restaurant_data.values)

tensor_data.shape

# one float vector per row, keyed by the dataframe index
rest_list = dict([(k, tensor_data[i].float()) for k, i in zip(restaurant_data.index, range(tensor_data.shape[0]))])

import pickle
pickle.dump(rest_list, open('rest_dataset.pkl', 'wb'))
import numpy as np
import datetime
import random
import time


def string_time_to_unix(s):
    return int(time.mktime(datetime.datetime.strptime(s, "%H:%M:%S").timetuple()))


def prepare_my_dataset(args_mut, kwargs):
    frame_size = kwargs.get('frame_size')
    key_to_id = args_mut.base.key_to_id
    df = args_mut.df

    df['vendor_rating'] = df['vendor_rating'].apply(lambda i: 2 * (i - 2.5))
    df['timestamp'] = df['timestamp'].apply(string_time_to_unix)
    df['vendor_id'] = df['vendor_id'].apply(key_to_id.get)

    customer = df[['customer_id', 'vendor_id']].groupby(['customer_id']).size()
    customer = customer[customer > frame_size].sort_values(ascending=False).index

    ratings = df.sort_values(by='timestamp').set_index('customer_id').drop('timestamp', axis=1).groupby('customer_id')

    cust_dict = {}

    def app(x):
        customer_id = x.index[0]
        cust_dict[int(customer_id)] = {}
        cust_dict[int(customer_id)]['items'] = x['vendor_id'].values
        cust_dict[int(customer_id)]['ratings'] = x['vendor_rating'].values

    ratings.apply(app)

    print(customer)
    args_mut.cust_dict = cust_dict
    args_mut.customer = customer

    return args_mut, kwargs

frame_size = 10
batch_size = 25

dirs = recnn.data.env.DataPath(
    base="/content/",
    embeddings="rest_dataset.pkl",
    ratings='ratings_rest.csv',
    cache="cached_frame_env.pkl",  # the cache is generated after the first run
    use_cache=True
)

env = recnn.data.env.FrameEnv(dirs, frame_size, batch_size, prepare_dataset=prepare_my_dataset)

awarebayes commented 3 years ago

No, the error message shows which functions were called and where the exception happened. Please paste the call stack.
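
If Colab is collapsing the middle frames, something like the following should print all of them. This is just a sketch using the standard-library traceback module around the same FrameEnv call from your snippet:

import traceback

try:
    env = recnn.data.env.FrameEnv(dirs, frame_size, batch_size, prepare_dataset=prepare_my_dataset)
except Exception:
    # prints every frame, including the ones Colab hides behind the "4 frames" link
    traceback.print_exc()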

lauracrln commented 3 years ago
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-166-65892292bdab> in <module>()
     10 )
     11 
---> 12 env = recnn.data.env.FrameEnv(dirs, frame_size, batch_size,prepare_dataset= prepare_my_dataset)

4 frames
/usr/local/lib/python3.6/dist-packages/recnn/data/env.py in __init__(self, path, frame_size, batch_size, num_workers, *args, **kwargs)
    216         kwargs["frame_size"] = frame_size
    217         super(FrameEnv, self).__init__(
--> 218             path, min_seq_size=frame_size + 1, *args, **kwargs
    219         )
    220 

/usr/local/lib/python3.6/dist-packages/recnn/data/env.py in __init__(self, path, prepare_dataset, embed_batch, **kwargs)
    135             self.load_env(path.cache)
    136         else:
--> 137             self.process_env(path)
    138             if path.use_cache:
    139                 self.save_env(path.cache)

/usr/local/lib/python3.6/dist-packages/recnn/data/env.py in process_env(self, path, **kwargs)
    175         user_dict = process_args_mut.user_dict
    176 
--> 177         train_users, test_users = train_test_split(users, test_size=test_size)
    178         train_users = utils.sort_users_itemwise(user_dict, train_users)[2:]
    179         test_users = utils.sort_users_itemwise(user_dict, test_users)

/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_split.py in train_test_split(*arrays, **options)
   2127     arrays = indexable(*arrays)
   2128 
-> 2129     n_samples = _num_samples(arrays[0])
   2130     n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
   2131                                               default_test_size=0.25)

/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py in _num_samples(x)
    189             x = np.asarray(x)
    190         else:
--> 191             raise TypeError(message)
    192 
    193     if hasattr(x, 'shape') and x.shape is not None:

TypeError: Expected sequence or array-like, got <class 'NoneType'>
lauracrln commented 3 years ago

I already found the problem, sorry to bother you. It turns out I used the wrong variable names in my prepare_dataset function.
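
In case anyone else hits this: the traceback shows process_env reading process_args_mut.user_dict and passing users to train_test_split, so the prepare function has to assign to those attribute names instead of my own cust_dict/customer. A minimal sketch of the change that fixed it for me, assuming the second field is called users to match the variable in process_env:

# At the end of prepare_my_dataset: write to the attributes the environment
# actually reads; with my old names, `users` stayed None and train_test_split
# raised "Expected sequence or array-like, got <class 'NoneType'>".
args_mut.user_dict = cust_dict   # was: args_mut.cust_dict = cust_dict
args_mut.users = customer        # was: args_mut.customer = customer

return args_mut, kwargs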

Thank you so much!

awarebayes commented 3 years ago

No problem!