Open davidfstein opened 2 weeks ago
Hey @davidfstein!
I think you found a bug. I will check in more detail later this week or next week.
In the meantime, if you comment this assertion in your installed version: https://github.com/jrzaurin/pytorch-widedeep/blob/220eb3fa82af0f64e30753ff57d3621e50b9351e/pytorch_widedeep/load_from_folder/wd_dataset_from_folder.py#L70
you should be good to go. Next week I will publish a patch. Thanks for opening the issue!
@davidfstein here you have running code using ONLY a tabular component with "load from folder" functionalities (with the assertion commented out).
Commented out assertion (only relevant lines):
if reference is not None:
assert (
img_from_folder is None and text_from_folder is None
), "If reference is not None, 'img_from_folder' and 'text_from_folder' left as None"
self.text_from_folder, self.img_from_folder = self._get_from_reference(
reference
)
else:
# assert (
# text_from_folder is not None and img_from_folder is not None
# ), "If reference is None, 'img_from_folder' and 'text_from_folder' must be not None"
self.text_from_folder = text_from_folder
self.img_from_folder = img_from_folder
Running code:
import numpy as np
import torch
import pandas as pd
from torch.utils.data import DataLoader
from pytorch_widedeep.models import TabMlp, WideDeep
from pytorch_widedeep.training import TrainerFromFolder
from pytorch_widedeep.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_widedeep.preprocessing import ChunkTabPreprocessor
from pytorch_widedeep.load_from_folder import TabFromFolder, WideDeepDatasetFromFolder
# NOTE(review): unused below — kept for parity with the original example script.
use_cuda = torch.cuda.is_available()

if __name__ == "__main__":
    # ---- split sizes and chunked-read configuration ----------------------
    train_size = 800
    eval_size = 100
    test_size = 101
    chunksize = 100
    # number of chunks required to cover the whole training file
    n_chunks = int(np.ceil(train_size / chunksize))

    # ---- file locations and target -------------------------------------
    data_path = "../tmp_data/airbnb/"
    train_fname = "airbnb_sample_train.csv"
    eval_fname = "airbnb_sample_eval.csv"
    test_fname = "airbnb_sample_test.csv"
    target_col = "yield"

    # Categorical columns that will be embedded by the tabular model
    cat_embed_cols = [
        "host_listings_count",
        "neighbourhood_cleansed",
        "is_location_exact",
        "property_type",
        "room_type",
        "accommodates",
        "bathrooms",
        "bedrooms",
        "beds",
        "guests_included",
        "minimum_nights",
        "instant_bookable",
        "cancellation_policy",
        "has_house_rules",
        "host_gender",
        "accommodates_catg",
        "guests_included_catg",
        "minimum_nights_catg",
        "host_listings_count_catg",
        "bathrooms_catg",
        "bedrooms_catg",
        "beds_catg",
        "security_deposit",
        "extra_people",
    ]
    # Continuous (numeric) columns passed through as-is
    cont_cols = ["latitude", "longitude"]

    # Fit the tabular preprocessor incrementally, one chunk at a time,
    # so the full training file never has to fit in memory.
    tab_preprocessor = ChunkTabPreprocessor(
        embed_cols=cat_embed_cols,
        continuous_cols=cont_cols,
        n_chunks=n_chunks,
        default_embed_dim=8,
        verbose=0,
    )
    for i, chunk in enumerate(
        pd.read_csv("/".join([data_path, train_fname]), chunksize=chunksize)
    ):
        print(f"chunk in loop: {i}")
        tab_preprocessor.fit(chunk)

    # "From folder" accessors: eval/test reuse the train accessor's
    # configuration via `reference` instead of repeating it.
    train_tab_folder = TabFromFolder(
        fname=train_fname,
        directory=data_path,
        target_col=target_col,
        preprocessor=tab_preprocessor,
    )
    eval_tab_folder = TabFromFolder(fname=eval_fname, reference=train_tab_folder)
    test_tab_folder = TabFromFolder(
        fname=test_fname, reference=train_tab_folder, ignore_target=True
    )

    # Tabular-only datasets (no image/text components). The eval/test
    # datasets reference the train dataset for shared state.
    train_dataset_folder = WideDeepDatasetFromFolder(
        n_samples=train_size,
        tab_from_folder=train_tab_folder,
    )
    eval_dataset_folder = WideDeepDatasetFromFolder(
        n_samples=eval_size,
        tab_from_folder=eval_tab_folder,
        reference=train_dataset_folder,
    )
    test_dataset_folder = WideDeepDatasetFromFolder(
        n_samples=test_size,
        tab_from_folder=test_tab_folder,
        reference=train_dataset_folder,
    )
    train_loader = DataLoader(train_dataset_folder, batch_size=16, num_workers=1)
    eval_loader = DataLoader(eval_dataset_folder, batch_size=16, num_workers=1)
    test_loader = DataLoader(test_dataset_folder, batch_size=16, num_workers=1)

    # Model: a deep tabular component only, wrapped in WideDeep
    deepdense = TabMlp(
        mlp_hidden_dims=[64, 32],
        column_idx=tab_preprocessor.column_idx,
        cat_embed_input=tab_preprocessor.cat_embed_input,
        continuous_cols=cont_cols,
    )
    model = WideDeep(deeptabular=deepdense)

    # NOTE(review): EarlyStopping is passed as a class while ModelCheckpoint
    # is an instance — pytorch-widedeep appears to accept both; confirm.
    callbacks = [EarlyStopping, ModelCheckpoint(filepath="model_weights/wd_out.pt")]
    trainer = TrainerFromFolder(
        model,
        objective="regression",
        callbacks=callbacks,
    )
    trainer.fit(
        train_loader=train_loader,
        eval_loader=eval_loader,
        finetune=True,
        finetune_epochs=1,
    )
    preds = trainer.predict(test_loader=test_loader)
Perfect thank you! With the assertion removed, I am able to successfully begin training
I am attempting to adapt the example from https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples/scripts/airbnb_load_from_folder_regr.py to my dataset, which includes tabular and text data. It seems, however, that WideDeepDatasetFromFolder requires both img and text data in addition to tabular data. When I run the adapted script,
I receive this error:
Can you let me know how to use the load from folder functionality in the case where I only have tab and text data or even only have tab data?
I tried passing the TabFromFolder object directly to the DataLoader, but I don't think this is the correct usage.