worldbank / REaLTabFormer

A suite of auto-regressive and Seq2Seq (sequence-to-sequence) transformer models for tabular and relational synthetic data generation.
https://worldbank.github.io/REaLTabFormer/
MIT License
212 stars 24 forks source link

Crash using model.predict() to predict a column #81

Open I-Mamalikidis opened 4 months ago

I-Mamalikidis commented 4 months ago

I'm using Jupyter Notebook 6.5.7 with REaLTabFormer 0.1.7, pandas 2.2.2, and numpy 1.26.3 on Windows 11 23H2.

I have a table which is the result of fully joining a SQL schema.

import pandas as pd

# Identifier columns of the joined table; they are dropped before training.
primary_keys = ["customer.id", "contract.id", "invoice.id", "payment.id"]

# Ten-row toy example of the fully joined schema: one row per payment, with
# the customer/contract/invoice values repeated on every row they apply to.
data = {
    "customer.id": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    "customer.income": [110, 110, 110, 110, 110, 110, 110, 110, 110, 110],
    "contract.id": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    "contract.income": [20, 20, 20, 20, 20, 40, 40, 40, 40, 40],
    "invoice.id": [1, 1, 1, 2, 2, 3, 3, 3, 3, 4],
    "invoice.greater_than_contract.income": [111, 111, 111, 70, 70, 70, 35, 35, 35, 10],
    "payment.id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "payment.amount": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    "payment.random": [10, 3432, 564, 34, 5432, 564564, 34, 432, 65, 7564345]
}
source_join_df = pd.DataFrame(data)
# Bare expression: renders the frame when run as the last line of a notebook cell.
source_join_df

I train a tabular model and generate some synthetic data:

import torch
import shutil
from realtabformer import REaLTabFormer
def _get_device():
    """Return 'cuda' if torch reports at least one CUDA device, otherwise 'cpu'."""
    if torch.cuda.device_count() > 0:
        return 'cuda'
    return 'cpu'
def _sample_parent_model(model, n_samples, gen_batch, device):
    """Forward the sampling arguments to ``model.sample`` and return its result."""
    sampling_kwargs = {
        "n_samples": n_samples,
        "gen_batch": gen_batch,
        "device": device,
    }
    return model.sample(**sampling_kwargs)

# Training hyper-parameters, keyed by pipeline stage.
training_execution_params = {
   "table_training": {
      "n_epochs": 3,
      "batch_size": 8,
      "n_gradient_accumulation_steps": 1,
      "train_size": 0.8,
      "early_stopping_patience": 5,
      "early_stopping_threshold": 0.0
   }
}

table_training_params = training_execution_params['table_training']

# Tabular REaLTabFormer model built from the hyper-parameters in this block;
# logging, evaluation and checkpoint saving all run once per epoch.
parent_model = REaLTabFormer(model_type="tabular",
                             batch_size=table_training_params['batch_size'],
                             epochs=table_training_params['n_epochs'],
                             gradient_accumulation_steps=table_training_params['n_gradient_accumulation_steps'],
                             logging_strategy="epoch",
                             evaluation_strategy="epoch",
                             save_strategy="epoch",
                             train_size=table_training_params['train_size'],
                             early_stopping_patience=table_training_params['early_stopping_patience'],
                             early_stopping_threshold=table_training_params['early_stopping_threshold'],
                             checkpoints_dir = f't0_checkpoints')

# Fit on the joined table with the id columns removed.
# NOTE(review): n_critic=0 presumably disables the critic rounds during
# fitting -- confirm against the REaLTabFormer.fit documentation.
trainer = parent_model.fit(df=source_join_df.drop(columns = primary_keys),
                           n_critic=0,
                           device=_get_device())

# Remove any previously saved 'green_model' directory before saving.
# ignore_errors=True keeps the cleanup best-effort (e.g. when the directory
# does not exist yet) without a bare `except:` that would also swallow
# KeyboardInterrupt and SystemExit.
shutil.rmtree("green_model", ignore_errors=True)
parent_model.save("green_model")

# Generation settings: sample as many synthetic rows as the source table has.
generation_execution_params = {
   "table_generation": {
      "batch_size": 8,
      "main_entity_table_n_samples": len(source_join_df)
   }
}

table_generation_params = generation_execution_params['table_generation']
# Draw the synthetic rows from the trained model via model.sample().
lr_synth_data = _sample_parent_model(model = parent_model,
                                      n_samples=table_generation_params['main_entity_table_n_samples'],
                                      gen_batch=table_generation_params['batch_size'],
                                      device=_get_device())

I then use the synthetic data generated from the fully joined table to patch values in an original table; the resulting dataframe is patch_v1. For simplicity, assume that every value is patched, so patch_v1 is equal to the synthetic data:

# Work on a copy so the sampled frame itself is left untouched.
patch_v1 = lr_synth_data.copy()#.astype(str)
# patch_v1["customer.income"] = SpecialTokens.UNK
# patch_v1["contract.income"] = SpecialTokens.UNK
# Show the frame and its column dtypes as passed to predict().
display(patch_v1)
display(patch_v1.dtypes)

Ideally, I want to be able to keep multiple columns frozen whilst predicting values for multiple other columns, but for simplicity let's say I want to predict 1 column and so I use the target_col argument of model.predict().

# Failing call: predict a single column ("customer.income") for every row.
# This raises "AttributeError: Can only use .str accessor with string values!".
patch_v1 = parent_model.predict(patch_v1, target_col = "customer.income", batch = table_generation_params['batch_size'], device = _get_device())
display(patch_v1)

I'm getting AttributeError: Can only use .str accessor with string values! from process_data in realtabformer.data_utils.


AttributeError Traceback (most recent call last) Cell In[10], line 1 ----> 1 patch_v1 = parent_model.predict(patch_v1, target_col = "customer.income", batch = table_generation_params['batch_size'], device = _get_device()) 2 display(patch_v1)

File C:\A3\envs\k2view\lib\site-packages\realtabformer\realtabformer.py:1376, in REaLTabFormer.predict(self, data, target_col, target_pos_val, batch, obs_sample, fillunk, device, disable_progress_bar, generate_kwargs) 1357 tabular_sampler = TabularSampler.sampler_from_model(self, device=device) 1359 # TabularSampler( 1360 # model_type=self.model_type, 1361 # model=self.model, (...) 1373 # device=device, 1374 # ) -> 1376 return tabular_sampler.predict( 1377 data=data, 1378 target_col=target_col, 1379 target_pos_val=target_pos_val, 1380 batch=batch, 1381 obs_sample=obs_sample, 1382 fillunk=fillunk, 1383 device=device, 1384 disable_progress_bar=disable_progress_bar, 1385 generate_kwargs, 1386 )

File C:\A3\envs\k2view\lib\site-packages\realtabformer\rtf_sampler.py:722, in TabularSampler.predict(self, data, target_col, target_pos_val, batch, obs_sample, fillunk, device, disable_progress_bar, **generate_kwargs) 719 datasets.utils.disable_progress_bar() 721 for i in range(0, len(data), batch): --> 722 seed_data = self._process_seed_input(data.iloc[i : i + batch]) 723 if fillunk: 724 mode = seed_data.mode(dim=0).values

File C:\A3\envs\k2view\lib\site-packages\realtabformer\rtf_sampler.py:584, in TabularSampler._process_seed_input(self, seed_input) 580 seed_input = pd.DataFrame.from_dict({0: seed_input}, orient="index") 582 seed_input = seed_input[valid_cols] --> 584 seed_data, _ = process_data( 585 df=seed_input, col_transform_data=self.col_transform_data 586 ) 587 seed_data = make_dataset(seed_data, self.vocab, mask_rate=0, affix_eos=False) 589 generated = torch.tensor(seed_data["input_ids"])

File C:\A3\envs\k2view\lib\site-packages\realtabformer\data_utils.py:538, in process_data(df, numeric_max_len, numeric_precision, numeric_nparts, first_col_type, col_transform_data, target_col) 523 processed_df = pd.concat( 524 [ 525 processed_df, (...) 533 axis=1, 534 ) 536 # Get the different sets of column types 537 cat_cols = processed_df.columns[ --> 538 processed_df.columns.str.contains(ColDataType.CATEGORICAL) 539 ] 540 numeric_cols = processed_df.columns[ 541 ~processed_df.columns.str.contains(ColDataType.CATEGORICAL) 542 ] 544 if first_col_type == ColDataType.CATEGORICAL:

File C:\A3\envs\k2view\lib\site-packages\pandas\core\accessor.py:224, in CachedAccessor.__get__(self, obj, cls) 221 if obj is None: 222 # we're accessing the attribute of the class, i.e., Dataset.geo 223 return self._accessor --> 224 accessor_obj = self._accessor(obj) 225 # Replace the property with the accessor object. Inspired by: 226 # https://www.pydanny.com/cached-property.html 227 # We need to use object.__setattr__ because we overwrite __setattr__ on 228 # NDFrame 229 object.__setattr__(obj, self._name, accessor_obj)

File C:\A3\envs\k2view\lib\site-packages\pandas\core\strings\accessor.py:191, in StringMethods.__init__(self, data) 188 def __init__(self, data) -> None: 189 from pandas.core.arrays.string_ import StringDtype --> 191 self._inferred_dtype = self._validate(data) 192 self._is_categorical = isinstance(data.dtype, CategoricalDtype) 193 self._is_string = isinstance(data.dtype, StringDtype)

File C:\A3\envs\k2view\lib\site-packages\pandas\core\strings\accessor.py:245, in StringMethods._validate(data) 242 inferred_dtype = lib.infer_dtype(values, skipna=True) 244 if inferred_dtype not in allowed_types: --> 245 raise AttributeError("Can only use .str accessor with string values!") 246 return inferred_dtype

AttributeError: Can only use .str accessor with string values!

Given a trained model and a dataframe with the same schema as the training data, what is the correct way to predict one or more columns (simultaneously)?