gsbDBI / torch-choice

Choice modeling with PyTorch: logit model and nested logit model
MIT License
40 stars 9 forks source link

Issue with pandas upgrade #46

Closed TianyuDu closed 8 months ago

TianyuDu commented 8 months ago

Description of the Issue

While running the following code from the main branch,

import warnings
warnings.filterwarnings("ignore")  # suppress every warning category for the rest of this script

import random
from time import time
import numpy as np
import pandas as pd
import torch
import torch_choice
from torch_choice import run
from tqdm import tqdm
from torch_choice.data import ChoiceDataset, JointDataset, utils, load_mode_canada_dataset, load_house_cooling_dataset_v1
from torch_choice.model import ConditionalLogitModel, NestedLogitModel

# Seed the Python, NumPy and PyTorch RNGs and request deterministic torch algorithms for reproducibility.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.use_deterministic_algorithms(True)

car_choice = pd.read_csv("./tutorials/public_datasets/car_choice.csv")  # relative path; presumably run from the repository root — TODO confirm
car_choice.head()  # return value is not assigned; kept from the original notebook cell

user_observable_columns=["gender", "income"]  # NOTE(review): not referenced below; the call passes the same list literal directly
from torch_choice.utils.easy_data_wrapper import EasyDatasetWrapper
data_wrapper_from_columns = EasyDatasetWrapper(  # the TypeError reported below is raised inside this constructor call
    main_data=car_choice,
    purchase_record_column='record_id',
    choice_column='purchase',
    item_name_column='car',
    user_index_column='consumer_id',
    session_index_column='session_id',
    user_observable_columns=['gender', 'income'],
    item_observable_columns=['speed'],
    session_observable_columns=['discount'],
    itemsession_observable_columns=['price'])

data_wrapper_from_columns.summary()
dataset = data_wrapper_from_columns.choice_dataset
# ChoiceDataset(label=[], item_index=[885], provided_num_items=[], user_index=[885], session_index=[885], item_availability=[885, 4], item_speed=[4, 1], user_gender=[885, 1], user_income=[885, 1], session_discount=[885, 1], itemsession_price=[885, 4, 1], device=cpu)

Depending on the pandas version, one may encounter a pandas error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[1], line 27
     25 user_observable_columns=["gender", "income"]
     26 from torch_choice.utils.easy_data_wrapper import EasyDatasetWrapper
---> 27 data_wrapper_from_columns = EasyDatasetWrapper(
     28     main_data=car_choice,
     29     purchase_record_column='record_id',
     30     choice_column='purchase',
     31     item_name_column='car',
     32     user_index_column='consumer_id',
     33     session_index_column='session_id',
     34     user_observable_columns=['gender', 'income'],
     35     item_observable_columns=['speed'],
     36     session_observable_columns=['discount'],
     37     itemsession_observable_columns=['price'])
     39 data_wrapper_from_columns.summary()
     40 dataset = data_wrapper_from_columns.choice_dataset

File ~/Development/torch-choice/torch_choice/utils/easy_data_wrapper.py:142, in EasyDatasetWrapper.__init__(self, main_data, purchase_record_column, item_name_column, choice_column, user_index_column, session_index_column, user_observable_data, item_observable_data, useritem_observable_data, session_observable_data, price_observable_data, itemsession_observable_data, useritemsession_observable_data, user_observable_columns, item_observable_columns, useritem_observable_columns, session_observable_columns, price_observable_columns, itemsession_observable_columns, useritemsession_observable_columns, device)
    135 self.derive_observable_from_main_data(item_observable_columns,
    136                                       user_observable_columns,
    137                                       session_observable_columns,
    138                                       price_observable_columns)
    140 self.observable_data_to_observable_tensors()
--> 142 self.create_choice_dataset()
    143 print('Finished Creating Choice Dataset.')

File ~/Development/torch-choice/torch_choice/utils/easy_data_wrapper.py:303, in EasyDatasetWrapper.create_choice_dataset(self)
    301 if len(np.unique(choice_set_size)) > 1:
    302     print(f'Note: choice sets of different sizes found in different purchase records: {rep}')
--> 303     self.item_availability = self.get_item_availability_tensor()
    304 else:
    305     # None means all items are available.
    306     self.item_availability = None

File ~/Development/torch-choice/torch_choice/utils/easy_data_wrapper.py:349, in EasyDatasetWrapper.get_item_availability_tensor(self)
    347 if self.session_index_column is None:
    348     raise ValueError(f'Item availability cannot be constructed without session index column.')
--> 349 A = self.main_data.pivot(self.session_index_column, self.item_name_column, self.choice_column)
    350 return torch.BoolTensor(~np.isnan(A.values))

TypeError: pivot() takes 1 positional argument but 4 were given

Cause of the Issue

After the pandas 2.0.0 upgrade, the behavior of the pivot method changed (see this issue): positional arguments to pivot are no longer accepted. Previously we could simply write df.pivot("A", "B", "C"), but now we need to pass the arguments by keyword: df.pivot(index="A", columns="B", values="C").