microsoft / DeepSpeed

DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.
https://www.deepspeed.ai/
Apache License 2.0

[BUG] RuntimeError: Tensors must be contiguous error while finetuning with deepspeed. #2736

Open FahriBilici opened 1 year ago

FahriBilici commented 1 year ago

I am trying to fine-tune "EleutherAI/gpt-neo-1.3B" for causal LM on Google Colab. Without DeepSpeed it runs out of memory. While looking into what I could do, I found DeepSpeed, added deepspeed='ds_config.json' to my TrainingArguments in the Jupyter notebook, and used the configuration from the official page ("ds_config_zero2.json"). After that I started getting this error. I am running everything inside the notebook, not as a command.
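
For context, the notebook setup described above presumably looks roughly like the sketch below; the model name and config file name come from this report, while everything else (dummy dataset, batch size, output directory) is illustrative and not the reporter's exact code.

from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Tiny stand-in dataset; the real run would use the reporter's own data.
raw = Dataset.from_dict({"text": ["hello world"] * 8})
train_dataset = raw.map(
    lambda ex: tokenizer(ex["text"], truncation=True, max_length=128),
    remove_columns=["text"],
)

training_args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=1,
    fp16=True,
    deepspeed="ds_config.json",  # the ZeRO-2 config from the HF docs, saved locally
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()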

To Reproduce: try fine-tuning gpt-neo.

This is the full error:

The following columns in the training set don't have a corresponding argument in `GPTNeoForCausalLM.forward` and have been ignored: text. If text are not expected by `GPTNeoForCausalLM.forward`,  you can safely ignore this message.
[2023-01-23 12:41:08,453] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.8.0, git-hash=unknown, git-branch=unknown
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
[<ipython-input-21-3435b262f1ae>](https://localhost:8080/#) in <module>
----> 1 trainer.train()

10 frames
[/usr/local/lib/python3.8/dist-packages/transformers/trainer.py](https://localhost:8080/#) in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1525             self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
   1526         )
-> 1527         return inner_training_loop(
   1528             args=args,
   1529             resume_from_checkpoint=resume_from_checkpoint,

[/usr/local/lib/python3.8/dist-packages/transformers/trainer.py](https://localhost:8080/#) in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1594         )
   1595         if args.deepspeed:
-> 1596             deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
   1597                 self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint
   1598             )

[/usr/local/lib/python3.8/dist-packages/transformers/deepspeed.py](https://localhost:8080/#) in deepspeed_init(trainer, num_training_steps, resume_from_checkpoint, inference)
    342     )
    343 
--> 344     deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
    345 
    346     if resume_from_checkpoint is not None:

[/usr/local/lib/python3.8/dist-packages/deepspeed/__init__.py](https://localhost:8080/#) in initialize(args, model, optimizer, model_parameters, training_data, lr_scheduler, mpu, dist_init_required, collate_fn, config, config_params)
    123 
    124     if not isinstance(model, PipelineModule):
--> 125         engine = DeepSpeedEngine(args=args,
    126                                  model=model,
    127                                  optimizer=optimizer,

[/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py](https://localhost:8080/#) in __init__(self, args, model, optimizer, model_parameters, training_data, lr_scheduler, mpu, dist_init_required, collate_fn, config, config_params, dont_change_device)
    299 
    300         # Configure distributed model
--> 301         self._configure_distributed_model(model)
    302 
    303         self._get_model_parameters()

[/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py](https://localhost:8080/#) in _configure_distributed_model(self, model)
   1185 
   1186         if not self.amp_enabled():
-> 1187             self._broadcast_model()
   1188 
   1189     # check if parameters are duplicated in optimizer param_groups

[/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py](https://localhost:8080/#) in _broadcast_model(self)
   1100             else:
   1101                 if torch.is_tensor(p) and is_replicated(p):
-> 1102                     dist.broadcast(p,
   1103                                    groups._get_broadcast_src_rank(),
   1104                                    group=self.data_parallel_group)

[/usr/local/lib/python3.8/dist-packages/deepspeed/comm/comm.py](https://localhost:8080/#) in log_wrapper(*args, **kwargs)
    125         # Return the op, then stop the op's timer
    126         try:
--> 127             return func(*args, **kwargs)
    128         finally:
    129             if comms_logger.enabled:

[/usr/local/lib/python3.8/dist-packages/deepspeed/comm/comm.py](https://localhost:8080/#) in broadcast(tensor, src, group, async_op, prof, log_name, debug)
    230               debug=get_caller_func()):
    231     global cdb
--> 232     return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
    233 
    234 

[/usr/local/lib/python3.8/dist-packages/deepspeed/comm/torch.py](https://localhost:8080/#) in broadcast(self, tensor, src, group, async_op)
     68 
     69     def broadcast(self, tensor, src, group=None, async_op=False):
---> 70         return torch.distributed.broadcast(tensor=tensor,
     71                                            src=src,
     72                                            group=group,

[/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py](https://localhost:8080/#) in broadcast(tensor, src, group, async_op)
   1402         group_src_rank = get_group_rank(group, src)
   1403         opts.rootRank = group_src_rank
-> 1404         work = group.broadcast([tensor], opts)
   1405     if async_op:
   1406         return work

RuntimeError: Tensors must be contiguous

ds_report output

DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
 [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
 [WARNING]  async_io: please install the libaio-dev package with apt
 [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
async_io ............... [NO] ....... [NO]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
 [WARNING]  please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/usr/local/lib/python3.8/dist-packages/torch']
torch version .................... 1.13.1+cu116
deepspeed install path ........... ['/usr/local/lib/python3.8/dist-packages/deepspeed']
deepspeed info ................... 0.8.0, unknown, unknown
torch cuda version ............... 11.6
torch hip version ................ None
nvcc version ..................... 11.2
deepspeed wheel compiled w. ...... torch 1.13, cuda 11.6

System info (please complete the following information): Google Colab


GuanhuaWang commented 1 year ago

Hi @FahriBilici, thanks for raising this issue.

To reproduce the error, could you also provide the training script you ran and the command line you used to launch it (e.g., the deepspeed launcher or the PyTorch launcher)?

GuanhuaWang commented 1 year ago

Closed for now; feel free to reopen if needed.

FahriBilici commented 1 year ago

I am using the Hugging Face Trainer class for fine-tuning. It runs in a Jupyter notebook, not as a standalone training script. How should I share my notebook?

chenmingjiong commented 1 year ago

@GuanhuaWang I got the same problem when fine-tuning "EleutherAI/gpt-j-6B" with LoRA on 8×2080 Ti, with exactly the same error log as above. To reproduce the error: clone this repo: https://github.com/CarperAI/trlx and modify the script examples/summarize_rlhf/sft/train_gptj_summarize.py as follows:

import random
import os
import evaluate
import numpy as np
import torch
import torch.nn as nn
from peft import LoraConfig, get_peft_model 
from summarize_dataset import TLDRDataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

def set_seed(seed_val=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

if __name__ == "__main__":
    output_dir = "gptj-supervised-summarize-checkpoint"
    train_batch_size = 16
    gradient_accumulation_steps = 1
    learning_rate = 1e-5
    eval_batch_size = 1
    eval_steps = 500
    max_input_length = 550
    save_steps = 1000
    num_train_epochs = 5
    random.seed(42)
    os.environ["WANDB_DISABLED"] = "true"

    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
    model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", use_cache=False, load_in_8bit=True, device_map='auto')
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.end_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = model.config.eos_token_id

    for param in model.parameters():
        param.requires_grad = False  # freeze the model - train adapters later
        if param.ndim == 1:
            # cast the small parameters (e.g. layernorm) to fp32 for stability
            param.data = param.data.to(torch.float32)
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

    class CastOutputToFloat(nn.Sequential):
        def forward(self, x): return super().forward(x).to(torch.float32)
    model.lm_head = CastOutputToFloat(model.lm_head)

    config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, config)

    # Set up the datasets
    data_path = "CarperAI/openai_summarize_tldr"
    train_dataset = TLDRDataset(
        data_path,
        tokenizer,
        "train",
        max_length=max_input_length,
    )
    dev_dataset = TLDRDataset(
        data_path,
        tokenizer,
        "valid",
        max_length=max_input_length,
    )

    # Set up the metric
    rouge = evaluate.load("rouge")

    def compute_metrics(eval_preds):
        labels_ids = eval_preds.label_ids
        pred_ids = eval_preds.predictions
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
        result = rouge.compute(predictions=pred_str, references=label_str)
        return result

    # Create a preprocessing function to extract out the proper logits from the model output
    def preprocess_logits_for_metrics(logits, labels):
        if isinstance(logits, tuple):
            logits = logits[0]
        return logits.argmax(dim=-1)

    # Prepare the trainer and start training
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="steps",
        eval_accumulation_steps=1,
        learning_rate=learning_rate,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        gradient_checkpointing=True,
        half_precision_backend="auto",
        fp16=True,
        adam_beta1=0.9,
        adam_beta2=0.95,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_train_epochs,
        warmup_steps=100,
        eval_steps=eval_steps,
        save_steps=save_steps,
        load_best_model_at_end=True,
        logging_steps=50,
        deepspeed="examples/summarize_rlhf/sft/ds_config_gptj.json",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=compute_metrics,
        data_collator=default_data_collator,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )
    trainer.train()
    trainer.save_model(output_dir)

Then run: deepspeed examples/summarize_rlhf/sft/train_gptj_summarize.py

FarzanT commented 1 year ago

Hi, I first raised this issue on the pytorch repo: https://github.com/pytorch/pytorch/issues/94907#issue-1586135480.

It was suggested that DeepSpeed should ensure that the tensors it passes to torch.distributed are contiguous. I fixed the issue by manually changing the following line in distributed_c10d.py:

https://github.com/pytorch/pytorch/blob/3ace14eb8b5e437322acf962d2f170561fd4e3bc/torch/distributed/distributed_c10d.py#L1555

I basically force the tensors to be contiguous, e.g.:

work = group.broadcast([tensor.contiguous()], opts)

However, this may result in some unexpected behavior, since calling .contiguous() on a non-contiguous tensor returns a new copy rather than modifying the original tensor in place.
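
For reference, .contiguous() is a no-op on a tensor that is already contiguous, but on a non-contiguous tensor it allocates a new copy with its own storage, so an in-place collective like broadcast would fill the copy and leave the original parameter untouched on receiving ranks. A tiny illustration (not taken from the report):

import torch

x = torch.arange(6).reshape(2, 3).t()  # a transposed view; not contiguous
print(x.is_contiguous())               # False
y = x.contiguous()                     # a new, contiguous copy with separate storage
print(y.data_ptr() == x.data_ptr())    # False
y.fill_(0)                             # writing into the copy ...
print(x)                               # ... leaves the original x unchanged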

sabetAI commented 1 year ago

I'm experiencing the same issue.

wangshanyw commented 1 year ago

I'm experiencing the same issue.

djaym7 commented 1 year ago

same issue with gpt models on HF

thinhlpg commented 1 year ago

Hello, I just faced the same issue. I found out that the problem lies in the device_map argument of Hugging Face's AutoModel... classes. Changing the argument from device_map="auto" to device_map=None fixed the issue for me! I hope this helps!
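
With the model from the original report, the change amounts to the following sketch (the argument names are the standard transformers API; this is not the reporter's exact code):

from transformers import AutoModelForCausalLM

model_name = "EleutherAI/gpt-neo-1.3B"

# Before: device_map="auto" makes accelerate place/shard the model itself,
# which conflicts with DeepSpeed's own device placement during initialization.
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# After: load without a device_map (None is the default) and let the
# Trainer + DeepSpeed engine handle placement.
model = AutoModelForCausalLM.from_pretrained(model_name)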

FarzanT commented 1 year ago

Hi, any updates on this? I'm using the DeepSpeed integration in PyTorch Lightning and haven't been able to resolve this except for the hack I've mentioned above:

work = group.broadcast([tensor.contiguous()], opts)

opyate commented 1 year ago

    deepspeed="examples/summarize_rlhf/sft/ds_config_gptj.json",

@chenmingjiong Did you modify the json file? If so, please paste the changes here.

opyate commented 1 year ago

Hello, I just faced the same issue. I found out that the problem lies in the device_map argument of Hugging Face's AutoModel... classes. Changing the argument from device_map="auto" to device_map=None fixed the issue for me! I hope this helps!

@thinhlpg I think this loads the model to CPU instead of CUDA.

chenmingjiong commented 1 year ago

@chenmingjiong Did you modify the json file? If so, please paste the changes here.

No, I used the original file.

memray commented 1 year ago

Same issue. Same code works for some models, but reports this error for some others.


╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /export/home/project/codeai/codeai_autocomplete/finetune/train_deepspeed.py: │
│ 339 in <module>                                                              │
│                                                                              │
│   336                                                                        │
│   337                                                                        │
│   338 if __name__ == '__main__':                                             │
│ ❱ 339 │   main()                                                             │
│                                                                              │
│ /export/home/project/codeai/codeai_autocomplete/finetune/train_deepspeed.py: │
│ 335 in main                                                                  │
│                                                                              │
│   332 │   copy_source(__file__, args.output_dir)                             │
│   333 │                                                                      │
│   334 │   # train                                                            │
│ ❱ 335 │   train(args=args)                                                   │
│   336                                                                        │
│   337                                                                        │
│   338 if __name__ == '__main__':                                             │
│                                                                              │
│ /export/home/project/codeai/codeai_autocomplete/finetune/train_deepspeed.py: │
│ 210 in train                                                                 │
│                                                                              │
│   207 │   ## deepspeed                                                       │
│   208 │   with print_time('Initializing deepspeed'):                         │
│   209 │   │   model_parameters = list(filter(lambda p: p.requires_grad, mode │
│ ❱ 210 │   │   model_engine, optimizer, _, _ = deepspeed.initialize(config=ar │
│   211 │   │   torch.cuda.empty_cache()                                       │
│   212 │                                                                      │
│   213 │   #######################                                            │
│                                                                              │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/__init__.py:165 in initialize                                      │
│                                                                              │
│   162 │   │   │   │   │   │   │   │   │   │      config=config,              │
│   163 │   │   │   │   │   │   │   │   │   │      config_class=config_class)  │
│   164 │   │   else:                                                          │
│ ❱ 165 │   │   │   engine = DeepSpeedEngine(args=args,                        │
│   166 │   │   │   │   │   │   │   │   │    model=model,                      │
│   167 │   │   │   │   │   │   │   │   │    optimizer=optimizer,              │
│   168 │   │   │   │   │   │   │   │   │    model_parameters=model_parameters │
│                                                                              │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/runtime/engine.py:266 in __init__                                  │
│                                                                              │
│    263 │   │   self.pipeline_parallelism = isinstance(model, PipelineModule) │
│    264 │   │                                                                 │
│    265 │   │   # Configure distributed model                                 │
│ ❱  266 │   │   self._configure_distributed_model(model)                      │
│    267 │   │                                                                 │
│    268 │   │   self._get_model_parameters()                                  │
│    269                                                                       │
│                                                                              │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/runtime/engine.py:1073 in _configure_distributed_model             │
│                                                                              │
│   1070 │   │   self.expert_data_parallel_group = groups._get_expert_data_par │
│   1071 │   │                                                                 │
│   1072 │   │   if not self.amp_enabled():                                    │
│ ❱ 1073 │   │   │   self._broadcast_model()                                   │
│   1074 │                                                                     │
│   1075 │   # check if parameters are duplicated in optimizer param_groups    │
│   1076 │   def _check_for_duplicates(self, optimizer):                       │
│                                                                              │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/runtime/engine.py:1003 in _broadcast_model                         │
│                                                                              │
│   1000 │   │   │   │   │   │   │   │      group=self.expert_data_parallel_gr │
│   1001 │   │   │   else:                                                     │
│   1002 │   │   │   │   if torch.is_tensor(p) and is_replicated(p):           │
│ ❱ 1003 │   │   │   │   │   dist.broadcast(p, groups._get_broadcast_src_rank( │
│   1004 │                                                                     │
│   1005 │   @staticmethod                                                     │
│   1006 │   def __check_params(model: Module, dtype: torch.dtype) -> None:    │
│                                                                              │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/comm/comm.py:120 in log_wrapper                                    │
│                                                                              │
│   117 │   │   │   │   timers(log_name).start()                               │
│   118 │   │   # Return the op, then stop the op's timer                      │
│   119 │   │   try:                                                           │
│ ❱ 120 │   │   │   return func(*args, **kwargs)                               │
│   121 │   │   finally:                                                       │
│   122 │   │   │   if comms_logger.enabled:                                   │
│   123 │   │   │   │   # Need to make op blocking for accurate logging        │
│                                                                              │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/comm/comm.py:217 in broadcast                                      │
│                                                                              │
│   214 @timed_op                                                              │
│   215 def broadcast(tensor, src, group=None, async_op=False, prof=False, log │
│   216 │   global cdb                                                         │
│ ❱ 217 │   return cdb.broadcast(tensor=tensor, src=src, group=group, async_op │
│   218                                                                        │
│   219                                                                        │
│   220 @timed_op                                                              │
│                                                                              │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ deepspeed/comm/torch.py:118 in broadcast                                     │
│                                                                              │
│   115 │   │   │   │   │   │   │   │   │   │   │   │   async_op=async_op)     │
│   116 │                                                                      │
│   117 │   def broadcast(self, tensor, src, group=None, async_op=False):      │
│ ❱ 118 │   │   return torch.distributed.broadcast(tensor=tensor, src=src, gro │
│   119 │                                                                      │
│   120 │   def all_gather(self, tensor_list, tensor, group=None, async_op=Fal │
│   121 │   │   return torch.distributed.all_gather(tensor_list=tensor_list, t │
│                                                                              │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ torch/distributed/distributed_c10d.py:1451 in wrapper                        │
│                                                                              │
│   1448 │   @functools.wraps(func)                                            │
│   1449 │   def wrapper(*args, **kwargs):                                     │
│   1450 │   │   try:                                                          │
│ ❱ 1451 │   │   │   return func(*args, **kwargs)                              │
│   1452 │   │   except Exception as error:                                    │
│   1453 │   │   │   if is_initialized():                                      │
│   1454 │   │   │   │   error_msg_dict = {                                    │
│                                                                              │
│ /export/share/ruimeng/env/anaconda/envs/codegen/lib/python3.8/site-packages/ │
│ torch/distributed/distributed_c10d.py:1570 in broadcast                      │
│                                                                              │
│   1567 │   else:                                                             │
│   1568 │   │   group_src_rank = get_group_rank(group, src)                   │
│   1569 │   │   opts.rootRank = group_src_rank                                │
│ ❱ 1570 │   │   work = group.broadcast([tensor], opts)                        │
│   1571 │   if async_op:                                                      │
│   1572 │   │   return work                                                   │
│   1573 │   else:                                                             │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Tensors must be contiguous

memray commented 1 year ago

In line with @FarzanT's comment, you may try making this change (comm.py L214) within deepspeed to minimize the risk.

It's working but I need some time to check if the learning curve makes sense.

# deepspeed/comm/comm.py
@timed_op
def broadcast(tensor, src, group=None, async_op=False, prof=False, log_name='broadcast', debug=get_caller_func()):                         
    global cdb
    if not tensor.is_contiguous():
        tensor = tensor.contiguous()
    return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) 
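
If editing the installed package is not convenient, the same workaround can be applied from the training script by wrapping torch.distributed.broadcast before DeepSpeed is initialized. This is only a sketch of the hack discussed in this thread, not an official fix; the copy-back step is there because broadcasting a .contiguous() copy would otherwise leave the original non-contiguous parameter unchanged on receiving ranks.

import torch
import torch.distributed

_orig_broadcast = torch.distributed.broadcast

def _contiguous_broadcast(tensor, src, group=None, async_op=False):
    if tensor.is_contiguous():
        return _orig_broadcast(tensor, src, group=group, async_op=async_op)
    # Broadcast a contiguous copy, then write the received data back into
    # the original (non-contiguous) parameter.
    tmp = tensor.contiguous()
    work = _orig_broadcast(tmp, src, group=group, async_op=async_op)
    if async_op and work is not None:
        work.wait()  # makes this particular broadcast effectively synchronous
    tensor.copy_(tmp)
    return work

torch.distributed.broadcast = _contiguous_broadcast
# ... then call deepspeed.initialize(...) / trainer.train() as usual
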
KeeratKG commented 1 year ago

Same issue. Same code works for some models, but reports this error for some others.


Hi. What models does this end up working for, based on your experience?

memray commented 1 year ago

@KeeratKG Ah sorry, I don't recall; it should be either huggyllama/llama-7b or Salesforce/codegen2-7B.

Jethro85 commented 1 year ago

Facing the same problem:

Traceback (most recent call last):
  File "fine_tune.py", line 191, in <module>
    trainer.train()
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/transformers/trainer.py", line 1662, in train
    return inner_training_loop(
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/transformers/trainer.py", line 1731, in _inner_training_loop
    deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/transformers/deepspeed.py", line 378, in deepspeed_init
    deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/__init__.py", line 171, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 268, in __init__
    self._configure_distributed_model(model)
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1109, in _configure_distributed_model
    self._broadcast_model()
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1033, in _broadcast_model
    dist.broadcast(p, groups._get_broadcast_src_rank(), group=self.data_parallel_group)
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper
    return func(*args, **kwargs)
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/comm/comm.py", line 216, in broadcast
    return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/deepspeed/comm/torch.py", line 188, in broadcast
    return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
  File "/home/shy23010/anaconda3/envs/myenv/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1404, in broadcast
    work = group.broadcast([tensor], opts)
RuntimeError: Tensors must be contiguous

Arij-Aladel commented 12 months ago

Facing the same issue @pacman100.

Arij-Aladel commented 12 months ago

    if not tensor.is_contiguous():
        tensor = tensor.contiguous()

This is my accelerate config:

compute_environment: LOCAL_MACHINE
deepspeed_config:
  offload_optimizer_device: cpu
  gradient_clipping: 1.0
  zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
fsdp_config: {}
machine_rank: 0
main_training_function: main
main_process_port: 20680
mixed_precision: 'no'
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
use_cpu: false

@pacman100 I have tried this solution, but I got another error:


09/26/2023 18:32:51 - INFO - root - Train dataset length: 12008
09/26/2023 18:32:51 - INFO - __main__ - Sample 10476 of the training set: {'input_ids': [32100, 125, ..., 5, 651, 5, 1]}.
09/26/2023 18:32:51 - INFO - accelerate.accelerator - Updating DeepSpeed's gradient accumulation steps
Building extension module cpu_adam...
[startup log truncated]
Traceback (most recent call last):
    main()
  [intermediate frames truncated]
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1213, in _configure_optimizer
    self.optimizer = self._configure_zero_optimizer(basic_optimizer)
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1467, in _configure_zero_optimizer
    optimizer = DeepSpeedZeroOptimizer(
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 337, in __init__
    self.flatten_dense_tensors_aligned(
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 889, in flatten_dense_tensors_aligned
    return self.flatten(align_dense_tensors(tensor_list, alignment))
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/torch/_utils.py", line 459, in _flatten_dense_tensors
    return torch._C._nn.flatten_dense_tensors(tensors)
RuntimeError: torch.cat(): expected a non-empty list of Tensors
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 3608256) of binary: /mnt/ssd/arij/NeurIPS/NeurIPSS/bin/python3.9
Traceback (most recent call last):
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
    args.func(args)
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/accelerate/commands/launch.py", line 964, in launch_command
    deepspeed_launcher(args)
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher
    distrib_run.run(args)
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/mnt/ssd/arij/NeurIPS/NeurIPSS/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
train_seqtoseq_dolly_PEFT_24_9_2023.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-09-26_18:33:06
  host      : srv-dgx02
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 3608256)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================


YueChenkkk commented 11 months ago

Facing the same issue.

My code works fine with flan-t5 but raises this error with t5-base:

Traceback (most recent call last):
  File "run_question_answering.py", line 898, in <module>
    main()
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "run_question_answering.py", line 834, in main
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/transformers/trainer.py", line 1539, in train
    return inner_training_loop(
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/transformers/trainer.py", line 1656, in _inner_training_loop
    model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/accelerate/accelerator.py", line 1198, in prepare
    result = self._prepare_deepspeed(*args)
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/accelerate/accelerator.py", line 1537, in _prepare_deepspeed
    engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/__init__.py", line 171, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 268, in __init__
    self._configure_distributed_model(model)
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1109, in _configure_distributed_model
    self._broadcast_model()
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1033, in _broadcast_model
    dist.broadcast(p, groups._get_broadcast_src_rank(), group=self.data_parallel_group)
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/comm/comm.py", line 116, in log_wrapper
    return func(*args, **kwargs)
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/comm/comm.py", line 216, in broadcast
    return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/deepspeed/comm/torch.py", line 188, in broadcast
    return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1451, in wrapper
    return func(*args, **kwargs)
  File "/home/ychen/anaconda3/envs/py38/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1570, in broadcast
    work = group.broadcast([tensor], opts)

YueChenkkk commented 11 months ago

Facing the same issue.

My code works fine with flan-t5 but raises this error with t5-base.

When I ran with torchrun instead of the deepspeed launcher, things went fine.

jungyh0218 commented 11 months ago

Hello, I just faced the same issue. I found out that the problem lies in the device_map argument of Hugging Face's AutoModel... classes. Changing the argument from device_map="auto" to device_map=None fixed the issue for me! I hope this helps!

device_map="auto" is not compatible with DeepSpeed, thus it is necessary to remove this option or change it into None. However, this is totally irrelevant to the Tensor must be contiguous error. Using device_map="auto" option causes Expected all tensors to be on the same device... error. (Github)

Muttermal commented 8 months ago

Facing the same issue.

Traceback (most recent call last):
  File "/home/nlp/zgy/VLM/src/train/train_mem.py", line 12, in <module>
    train()
  File "/home/nlp/zgy/VLM/src/train/train.py", line 395, in train
    trainer.train()
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/trainer.py", line 1537, in train
    return inner_training_loop(
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/trainer.py", line 1675, in _inner_training_loop
    model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1209, in prepare
    result = self._prepare_deepspeed(*args)
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1582, in _prepare_deepspeed
    engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/__init__.py", line 171, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 263, in __init__
    self._configure_distributed_model(model)
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1148, in _configure_distributed_model
    self._broadcast_model()
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1068, in _broadcast_model
    dist.broadcast(p, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group)
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
    return func(*args, **kwargs)
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 224, in broadcast
    return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 196, in broadcast
    return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/home/nlp/miniconda3/envs/llm/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1910, in broadcast
    work = group.broadcast([tensor], opts)
RuntimeError: Tensors must be contiguous

I have tried this solution and this solution; neither worked.