UKPLab / sentence-transformers

State-of-the-Art Text Embeddings
https://www.sbert.net
Apache License 2.0
15.47k stars 2.5k forks source link

AttributeError: 'NoneType' object has no attribute 'get' #2705

Open karan842 opened 6 months ago

karan842 commented 6 months ago

I am trying to run the code below from @tomaarsen's Hugging Face blog post on Sentence Transformers v3.

Code:


from datasets import load_dataset

# Load the STSB dataset and carve small subsets out of its splits.
data = load_dataset('sentence-transformers/stsb')

# First 100 rows of the train split.
train_data = data['train'].select(range(100))
# NOTE(review): range(100, 30) yields no indices (start > stop), so this
# selects zero validation rows -- presumably range(100, 130) was intended.
val_data = data['validation'].select(range(100, 30))

from torch.utils.data import DataLoader
from sentence_transformers import(
    SentenceTransformer, models,
    losses, util,
    InputExample, evaluation,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments
)

from sentence_transformers.training_args import BatchSamplers
from torch.optim import AdamW
from accelerate import Accelerator

def main():
  """Fine-tune mxbai-embed-large-v1 with AnglELoss on the module-level subsets.

  Reads the module-level ``train_data`` and ``val_data`` datasets, builds the
  model, loss, evaluator and training arguments, and runs the trainer.
  """
  # Assemble the model from a Transformer module followed by a Pooling module
  # sized to the transformer's word embedding dimension.
  word_embedding_model = models.Transformer('mixedbread-ai/mxbai-embed-large-v1')
  pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
  model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

  loss = losses.AnglELoss(model)

  # Similarity evaluator fed from the validation subset's sentence pairs and
  # their scores, using cosine as the main similarity.
  evaluator = evaluation.EmbeddingSimilarityEvaluator(
      sentences1=val_data['sentence1'],
      sentences2=val_data['sentence2'],
      scores=val_data['score'],
      main_similarity=evaluation.SimilarityFunction.COSINE,
      name="sts-dev"
  )

  # Evaluation is requested every 2 steps; the best model (by the configured
  # metric, higher is better) is to be reloaded at the end of training.
  training_args = SentenceTransformerTrainingArguments(
      output_dir='./sbert-checkpoint',
      num_train_epochs=10,
      seed=33,
      per_device_train_batch_size=8,
      per_device_eval_batch_size=8,
      learning_rate=2e-5,
      # fp16=True
      warmup_ratio=0.1,
      eval_strategy="steps",
      eval_steps=2,
      save_total_limit=2,
      load_best_model_at_end=True,
      metric_for_best_model='spearman_cosine',
      greater_is_better=True

  )

  # The trainer receives both the evaluator and the validation subset as
  # eval_dataset.
  trainer = SentenceTransformerTrainer(
      model=model,
      evaluator=evaluator,
      args=training_args,
      train_dataset=train_data,
      eval_dataset=val_data,
      loss=loss
  )
  trainer.train()

# Run training immediately when this script/cell is executed.
main()

This code worked on the 3.0.0 pre-release, but it fails with both sentence-transformers[train] and sentence-transformers[dev].

Error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
[<ipython-input-5-1470a0756fb4>](https://localhost:8080/#) in <cell line: 43>()
     41     loss=loss
     42 )
---> 43 trainer.train()

7 frames
[/usr/local/lib/python3.10/dist-packages/transformers/trainer.py](https://localhost:8080/#) in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1883                 hf_hub_utils.enable_progress_bars()
   1884         else:
-> 1885             return inner_training_loop(
   1886                 args=args,
   1887                 resume_from_checkpoint=resume_from_checkpoint,

[/usr/local/lib/python3.10/dist-packages/transformers/trainer.py](https://localhost:8080/#) in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2289                     self.control = self.callback_handler.on_step_end(args, self.state, self.control)
   2290 
-> 2291                     self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
   2292                 else:
   2293                     self.control = self.callback_handler.on_substep_end(args, self.state, self.control)

[/usr/local/lib/python3.10/dist-packages/transformers/trainer.py](https://localhost:8080/#) in _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
   2719         metrics = None
   2720         if self.control.should_evaluate:
-> 2721             metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
   2722             self._report_to_hp_search(trial, self.state.global_step, metrics)
   2723 

[/usr/local/lib/python3.10/dist-packages/sentence_transformers/trainer.py](https://localhost:8080/#) in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
    367         if isinstance(eval_dataset, DatasetDict) and isinstance(self.loss, dict):
    368             eval_dataset = self.add_dataset_name_column(eval_dataset)
--> 369         return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
    370 
    371     def evaluation_loop(

[/usr/local/lib/python3.10/dist-packages/transformers/trainer.py](https://localhost:8080/#) in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
   3570 
   3571         eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 3572         output = eval_loop(
   3573             eval_dataloader,
   3574             description="Evaluation",

[/usr/local/lib/python3.10/dist-packages/sentence_transformers/trainer.py](https://localhost:8080/#) in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
    377         metric_key_prefix: str = "eval",
    378     ) -> EvalLoopOutput:
--> 379         output = super().evaluation_loop(
    380             dataloader=dataloader,
    381             description=description,

[/usr/local/lib/python3.10/dist-packages/transformers/trainer.py](https://localhost:8080/#) in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
   3755 
   3756             # Prediction step
-> 3757             loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
   3758             main_input_name = getattr(self.model, "main_input_name", "input_ids")
   3759             inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None

[/usr/local/lib/python3.10/dist-packages/transformers/trainer.py](https://localhost:8080/#) in prediction_step(self, model, inputs, prediction_loss_only, ignore_keys)
   3926         # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss`
   3927         # is `True` in `model.forward`.
-> 3928         return_loss = inputs.get("return_loss", None)
   3929         if return_loss is None:
   3930             return_loss = self.can_return_loss

AttributeError: 'NoneType' object has no attribute 'get'

Thanks, Karan Shingde

tomaarsen commented 6 months ago

Hello!

I've narrowed the issue down to this line:

val_data = data['validation'].select(range(100, 30))

If I print that dataset, I get:

Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 0
})

I would recommend updating your code to:

val_data = data['validation'].select(range(100, 130))

and then training works like expected!

karan842 commented 6 months ago

This works on Kaggle, but when I run it locally with:

accelerate launch --multi-gpu --num_processes=4 main.py

It is saying,

TypeError: SentenceTransformerTrainingArguments.__init__() got an unexpected keyword argument 'eval_strategy'   

I tried evaluation_strategy instead, and now I am getting this error:

trainer.train()trainer.train()

  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 1780, in train
  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 1780, in train
        return inner_training_loop(return inner_training_loop(

  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 2193, in _inner_training_loop
  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 2193, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)    
self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 2577, in _maybe_log_save_evaluate
  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 2577, in _maybe_log_save_evaluate
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)    
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
  File "/data/projects/llm-env/lib/python3.10/site-packages/sentence_transformers/trainer.py", line 369, in evaluate
  File "/data/projects/llm-env/lib/python3.10/site-packages/sentence_transformers/trainer.py", line 369, in evaluate
        return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)

  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 3365, in evaluate
  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 3365, in evaluate
        output = eval_loop(output = eval_loop(

  File "/data/projects/llm-env/lib/python3.10/site-packages/sentence_transformers/trainer.py", line 379, in evaluation_loop
  File "/data/projects/llm-env/lib/python3.10/site-packages/sentence_transformers/trainer.py", line 379, in evaluation_loop
        output = super().evaluation_loop(output = super().evaluation_loop(

  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 3554, in evaluation_loop
  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 3554, in evaluation_loop
    loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 3728, in prediction_step
    loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
  File "/data/projects/llm-env/lib/python3.10/site-packages/transformers/trainer.py", line 3728, in prediction_step
    return_loss = inputs.get("return_loss", None)
AttributeError: 'NoneType' object has no attribute 'get'    
return_loss = inputs.get("return_loss", None)
AttributeError: 'NoneType' object has no attribute 'get'

My dependencies are:

Name: accelerate Version: 0.29.2

Name: transformers Version: 4.39.3

Name: sentence-transformers Version: 3.0.0

tomaarsen commented 6 months ago

The evaluation_strategy fix was indeed necessary because older versions of transformers do not yet support the new name of eval_strategy (they renamed it).

On Kaggle, can you please print the dataset that you're feeding to eval_dataset in the Trainer? I think it's empty.

karan842 commented 6 months ago

I got it: I was using a very small data range while running on 4 GPUs. I increased the data size to train_data=200 and val_data=90.

Now working on multi-gpu

tomaarsen commented 6 months ago

Oh, it is indeed possible that using fewer samples than the number of GPUs causes this issue, even if the number of evaluation samples isn't strictly 0.

karan842 commented 6 months ago

What Evaluator can we use for (anchor, positive) type of data?

My task is embedding similarity, but the evaluator I used needs a label score.

tomaarsen commented 6 months ago

You could use InformationRetrievalEvaluator:

# Build information-retrieval inputs from (anchor, positive) pairs.
# Queries and corpus entries share the same integer ids (the row index).
queries = dict(enumerate(dataset["anchor"]))
corpus = dict(enumerate(dataset["positive"]))
# Each query id maps to the *set* of relevant corpus ids; the only relevant
# document for the anchor at index i is the positive at the same index i.
# (A bare int here would break the evaluator, which takes len() of and tests
# membership in each entry.)
relevant_docs = {idx: {idx} for idx in range(len(dataset))}

ir_evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name="...",
)

Here the "positive" is the relevant document and all other texts are seen as not relevant.

You can get roughly the same behaviour with the TranslationEvaluator:

# Anchors are the source sentences; the positive at the same index is the
# target each anchor is expected to match.
evaluator = TranslationEvaluator(
    source_sentences=dataset["anchor"],
    target_sentences=dataset["positive"],
    name="...",
)

This computes what percentage of the time the anchor at index $i$ is most similar to the positive at index $i$ out of all positive texts. The bigger the dataset, the harder both of these tasks get.

karan842 commented 6 months ago

Cool! Thanks

Lastly, what is the best way to store the model locally?

save_safetensors or save_only_model

I just need the model saved locally so that I can load it in another script for testing.

tomaarsen commented 6 months ago

model.save_pretrained("local_path") is the recommended way.

karan842 commented 6 months ago
  1. Do we need to specify a DataLoader and collate_fn explicitly for lazy loading?

  2. Do these two arguments do the same thing? per_device_train_batch_size=64, per_device_eval_batch_size=64

  3. I am training on a huge dataset where the train and validation sets each contain hundreds of millions of records. I have 4 GPUs with 15 GB each. I am new to large-scale training in production. How can I run this on this machine configuration?

Final Code:

def main() -> None:
    """Fine-tune all-MiniLM-L6-v2 on local parquet data and save the result.

    Relies on the module-level ``data_dir`` and the ``split_data`` helper
    (both defined outside this snippet) to locate the parquet files, trains
    with MultipleNegativesRankingLoss, and writes the final model to
    ``sbert-model``.
    """
    data_files = split_data(data_dir)
    data = load_dataset("parquet", data_dir=data_dir, data_files=data_files)
    train_data, val_data = data['train'], data['val']
    print(train_data,"\n\n")
    print(val_data,"\n\n")

    # Transformer module followed by a Pooling module sized to its embeddings.
    word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2')
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    loss = losses.MultipleNegativesRankingLoss(model)

    training_args = SentenceTransformerTrainingArguments(
        output_dir='sbert-output-dir',
        num_train_epochs=1000,
        seed=33,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        learning_rate=2e-5,
        warmup_ratio=0.2,
        save_only_model=True,
        fp16=True,
        evaluation_strategy="steps",
        eval_steps=50,
        save_total_limit=50,
        load_best_model_at_end=True,
        # No evaluator is passed to the trainer below, so evaluation does not
        # produce a 'spearman_cosine' metric; track the evaluation loss
        # instead (lower is better) so best-model selection has a metric that
        # actually exists.
        metric_for_best_model='eval_loss',
        greater_is_better=False,
    )

    trainer = SentenceTransformerTrainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        loss=loss
    )
    trainer.train()
    model.save_pretrained('sbert-model')

if __name__ == '__main__':
    main()