NVIDIA-Merlin / Transformers4Rec

Transformers4Rec is a flexible and efficient library for sequential and session-based recommendation that works with PyTorch.
https://nvidia-merlin.github.io/Transformers4Rec/main
Apache License 2.0

set schema for movielens dataset #735

Closed NamartaVij closed 10 months ago

NamartaVij commented 1 year ago

Could you please help me resolve this error? I am trying to train a model with Transformers4Rec, and this is the error I get.

Part of the code:

trainer = tr.Trainer(
    model=model,
    args=training_args,
    schema=schema,
    compute_metrics=True,
)

Using amp fp16 backend

%%time
start_time_window_index = 1
final_time_window_index = 4
for time_index in range(start_time_window_index, final_time_window_index):
    # Set the data for this time window
    time_index_train = time_index
    time_index_eval = time_index + 1
    # train_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_train}/train.parquet"))
    # eval_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_eval}/valid.parquet"))
    # Train on the day related to time_index
    print('*' * 20)
    print("Launch training for day %s are:" % time_index)
    print('*' * 20 + '\n')
    trainer.train_dataset_or_path = train_transformed
    trainer.reset_lr_scheduler()
    trainer.train()
    trainer.state.global_step += 1
    # Evaluate on the following day
    trainer.eval_dataset_or_path = valid_transformed
    train_metrics = trainer.evaluate(metric_key_prefix='eval')
    print('*' * 20)
    print("Eval results for day %s are:\t" % time_index_eval)
    print('\n' + '*' * 20 + '\n')
    for key in sorted(train_metrics.keys()):
        print(" %s = %s" % (key, str(train_metrics[key])))
    wipe_memory()

Output:
Running training
  Num examples = 600192
  Num Epochs = 10
  Instantaneous batch size per device = 384
  Total train batch size (w. parallel, distributed & accumulation) = 384
  Gradient Accumulation steps = 1
  Total optimization steps = 15630


Launch training for day 1 are:


Output Schema ->
[{'name': 'userId', 'tags': {<Tags.USER: 'user'>, <Tags.ID: 'id'>, <Tags.CATEGORICAL: 'categorical'>}, 'properties': {'num_buckets': None, 'freq_threshold': 0, 'max_size': 0, 'cat_path': './/categories/unique.userId.parquet', 'domain': {'min': 0, 'max': 6042, 'name': 'userId'}, 'embedding_sizes': {'cardinality': 6043, 'dimension': 210}}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False},
 {'name': 'movieId', 'tags': {<Tags.ITEM: 'item'>, <Tags.LIST: 'list'>, <Tags.ID: 'id'>, <Tags.CATEGORICAL: 'categorical'>}, 'properties': {'num_buckets': None, 'freq_threshold': 10, 'max_size': 0, 'cat_path': './/categories/unique.movieId.parquet', 'domain': {'min': 0, 'max': 3103, 'name': 'movieId'}, 'embedding_sizes': {'cardinality': 3104, 'dimension': 144}}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False},
 {'name': 'genres', 'tags': {<Tags.LIST: 'list'>, <Tags.CATEGORICAL: 'categorical'>}, 'properties': {'num_buckets': None, 'freq_threshold': 10, 'max_size': 0, 'cat_path': './/categories/unique.genres.parquet', 'domain': {'min': 0, 'max': 20, 'name': 'genres'}, 'embedding_sizes': {'cardinality': 21, 'dimension': 16}, 'value_count': {'min': 0, 'max': None}}, 'dtype': DType(name='int64', element_type=<ElementType.Int: 'int'>, element_size=64, element_unit=None, signed=True, shape=Shape(dims=(Dimension(min=0, max=None), Dimension(min=0, max=None)))), 'is_list': True, 'is_ragged': True},
 {'name': 'binary_rating', 'tags': {<Tags.BINARY_CLASSIFICATION: 'binary_classification'>, <Tags.TARGET: 'target'>}, 'properties': {}, 'dtype': DType(name='bool', element_type=<ElementType.Bool: 'bool'>, element_size=None, element_unit=None, signed=None, shape=Shape(dims=(Dimension(min=0, max=None),))), 'is_list': False, 'is_ragged': False}]
Sparse Feats -> ['movieId', 'genres']
Padding Lengths {'movieId': 20, 'genres': 20}
Item IDS -> torch.Size([384])

AssertionError                            Traceback (most recent call last)
File :15

File /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1316, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
--> 1316    tr_loss_step = self.training_step(model, inputs)

File /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1847, in Trainer.training_step(self, model, inputs)
--> 1847    loss = self.compute_loss(model, inputs)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/torch/trainer.py:323, in Trainer.compute_loss(self, model, inputs, return_outputs)
--> 323     outputs = model(inputs, targets=targets, training=True)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1533, in Module._call_impl(self, *args, **kwargs)
--> 1533    return forward_call(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/torch/model/base.py:560, in Model.forward(self, inputs, targets, training, testing, **kwargs)
--> 560     head_output = head(inputs, call_body=True, targets=targets, training=training, testing=testing, **kwargs)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1533, in Module._call_impl(self, *args, **kwargs)
--> 1533    return forward_call(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/torch/model/base.py:382, in Head.forward(self, body_outputs, training, testing, targets, call_body, top_k, **kwargs)
--> 382     body_outputs = self.body(body_outputs, training=training, testing=testing, **kwargs)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/config/schema.py:50, in SchemaMixin.__call__(self, *args, **kwargs)
---> 50     return super().__call__(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1533, in Module._call_impl(self, *args, **kwargs)
--> 1533    return forward_call(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/torch/block/base.py:256, in SequentialBlock.forward(self, input, training, testing, **kwargs)
--> 256     input = module(input, training=training, testing=testing)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/config/schema.py:50, in SchemaMixin.__call__(self, *args, **kwargs)
---> 50     return super().__call__(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/torch/tabular/base.py:392, in TabularModule.__call__(self, inputs, pre, post, merge_with, aggregation, *args, **kwargs)
--> 392     outputs = super().__call__(inputs, *args, **kwargs)  # noqa

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1533, in Module._call_impl(self, *args, **kwargs)
--> 1533    return forward_call(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/torch/features/sequence.py:262, in TabularSequenceFeatures.forward(self, inputs, training, testing, **kwargs)
--> 262     outputs = self.masking(outputs, item_ids=self.to_merge["categorical_module"].item_seq, training=training, testing=testing)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/config/schema.py:50, in SchemaMixin.__call__(self, *args, **kwargs)
---> 50     return super().__call__(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1533, in Module._call_impl(self, *args, **kwargs)
--> 1533    return forward_call(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/torch/masking.py:223, in MaskSequence.forward(self, inputs, item_ids, training, testing)
--> 223     _ = self.compute_masked_targets(item_ids=item_ids, training=training, testing=testing)

File /usr/local/lib/python3.8/dist-packages/transformers4rec/torch/masking.py:149, in MaskSequence.compute_masked_targets(self, item_ids, training, testing)
    148     print(f'Item IDS -> {item_ids.shape}')
--> 149     assert item_ids.ndim == 2, "item_ids must have 2 dimensions."

AssertionError: item_ids must have 2 dimensions.

vivpra89 commented 1 year ago

@NamartaVij do you mind posting the input and the architecture as well? I can spend some time looking at the code with the MovieLens data.

NamartaVij commented 1 year ago

Here is the link, please go through it: https://github.com/Rajathbharadwaj/NVTabular-Merlin-T4C-ML/tree/main

It would be great if you could give your feedback soon; I actually have a deadline for this.

rnyak commented 1 year ago

@NamartaVij please share your MovieLens NVTabular script. How do you generate sequential data from MovieLens, and how do you tag the columns? We need to know that first to reproduce your issue.

You cannot use Transformers4Rec without sequential data: you need to generate user sessions with sequential item ids. Please keep that in mind. For a rough idea of what that preprocessing step can look like, see the sketch below.
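
A minimal sketch of that preprocessing in NVTabular (the column names, the per-user grouping on `userId`, and the file paths are assumptions about a MovieLens-style setup, not the poster's actual notebook):

```python
import nvtabular as nvt

# Encode the item id and tag it so Transformers4Rec can find it in the schema.
item_id = ["movieId"] >> nvt.ops.Categorify() >> nvt.ops.TagAsItemID()

# Group the ratings per user, ordered by timestamp, so that movieId becomes a
# list column holding each user's interaction sequence.
groupby_features = (
    item_id + ["userId", "timestamp"]
    >> nvt.ops.Groupby(
        groupby_cols=["userId"],
        sort_cols=["timestamp"],
        aggs={"movieId": ["list", "count"]},
        name_sep="-",
    )
)

# Truncate each sequence to at most MAX_LEN items.
MAX_LEN = 20
features = (
    groupby_features["movieId-list"] >> nvt.ops.ListSlice(0, MAX_LEN)
) + groupby_features["userId", "movieId-count"]

workflow = nvt.Workflow(features)
workflow.fit_transform(nvt.Dataset("ratings.parquet")).to_parquet("processed/")
```

Other per-interaction features can be grouped the same way. The important part is the `Groupby` step: it turns one-row-per-rating data into one-row-per-user (or per-session) data with a list-valued item-id column, which is the shape the masking module asserts on.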

NamartaVij commented 1 year ago

@rnyak yes, here it is:

rnyak commented 1 year ago

@NamartaVij this won't work: your movieId column is not a list column, and you cannot train a sequential model without sequential data, so you need to generate it first. If you are predicting the next movie to watch, movieId is your item id and it has to be a sequence, like this:

session_id    movie_id
1             [1, 2, 3]
2             [2, 5, 7, 8]
3             [1, 5]
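
Once the data looks like that, the model side picks the list columns up from the schema. A rough sketch (assuming the processed output and the movieId-list column name from a workflow like the one above, not the poster's actual code):

```python
from merlin.io import Dataset
from transformers4rec import torch as tr

# Hypothetical path to the NVTabular output.
train = Dataset("processed/part_0.parquet")
schema = train.schema.select_by_name(["movieId-list"])

# Sequential input block; the tagged item-id list column drives the masking.
input_module = tr.TabularSequenceFeatures.from_schema(
    schema,
    max_sequence_length=20,
    aggregation="concat",
    masking="mlm",
    d_output=64,
)

prediction_task = tr.NextItemPredictionTask(weight_tying=True)

transformer_config = tr.XLNetConfig.build(
    d_model=64, n_head=4, n_layer=2, total_seq_length=20
)
model = transformer_config.to_torch_model(input_module, prediction_task)
```

With a list-valued item-id column, the `item_ids` tensor passed to the masking block is (batch, sequence_length), which is exactly what the failing assertion expects.
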
NamartaVij commented 1 year ago

@rnyak yes, thank you, I got it.

Apart from this, may I ask why we always calculate NDCG and not diversity?

rnyak commented 10 months ago

@NamartaVij we implemented the commonly reported evaluation metrics. You can create your own custom metrics as well.
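
For reference, metrics are plugged into the prediction task and custom ones can follow the same torchmetrics-style interface. A rough sketch (ItemCoverageAt is a hypothetical diversity-style metric, and overriding RankingMetric's `_metric` hook is assumed to be the extension point in the installed version):

```python
import torch
from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import NDCGAt, RecallAt, RankingMetric


class ItemCoverageAt(RankingMetric):
    """Hypothetical diversity-style metric: the fraction of the catalog that
    shows up in the top-k recommendations of a batch."""

    def __init__(self, top_ks=None, num_items=1000, labels_onehot=False):
        super().__init__(top_ks=top_ks, labels_onehot=labels_onehot)
        self.num_items = num_items

    def _metric(self, ks, scores, labels):
        # scores: (batch, num_items) prediction logits; labels are unused here.
        coverages = []
        for k in ks:
            topk_items = torch.topk(scores, k=int(k), dim=-1).indices
            coverage = topk_items.unique().numel() / self.num_items
            coverages.append(
                torch.full((scores.size(0),), coverage, device=scores.device)
            )
        # One value per example and per cut-off, matching the built-in metrics.
        return torch.stack(coverages, dim=-1)


prediction_task = tr.NextItemPredictionTask(
    weight_tying=True,
    metrics=[
        NDCGAt(top_ks=[10, 20], labels_onehot=True),
        RecallAt(top_ks=[10, 20], labels_onehot=True),
        ItemCoverageAt(top_ks=[10, 20], num_items=3104),
    ],
)
```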