Lightning-AI / pytorch-lightning


Device error when loading from checkpoint for testing with deepspeed #18478

Open · dionman opened this issue 1 year ago

dionman commented 1 year ago

Bug description

I'm receiving RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:2 and cpu! (when checking argument for argument weight in method wrapper_CUDA__native_layer_norm) when executing trainer.test(model, datamodule=dm, ckpt_path=checkpoint_path).

If I instead save the trained model manually (i.e. uncomment trainer.save_checkpoint(checkpoint_path)) and use that checkpoint_path for testing, the script runs without errors.
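For reference, a minimal sketch of that workaround (names match the script below; with the deepspeed strategy, checkpoint_path is a checkpoint directory):

# Workaround sketch: save the deepspeed checkpoint explicitly after trainer.fit,
# then test from that directory instead of the ModelCheckpoint best_model_path.
checkpoint_path = "deepspeed_ckpt"
trainer.save_checkpoint(checkpoint_path)
trainer.test(model, datamodule=dm, ckpt_path=checkpoint_path)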

The code below is saved in a file named mwe.py and executed as follows:

TOKENIZERS_PARALLELISM=true CUDA_VISIBLE_DEVICES="0,1,2,3" python mwe.py

What version are you seeing the problem on?

v2.0

How to reproduce the bug

import argparse
import json
import time
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
import torch.distributed as dist
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
from transformers import DataCollatorWithPadding
import pandas as pd
import peft
import warnings
from lightning import (
    Trainer,
    LightningDataModule,
    LightningModule,
    seed_everything,
)
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.utilities.deepspeed import (
    convert_zero_checkpoint_to_fp32_state_dict,
)
from deepspeed.ops.adam import FusedAdam

warnings.filterwarnings("ignore")

torch.set_float32_matmul_precision("medium")
seed_everything(42)

def generate_seeded_prompts(for_training=True):
    if for_training:
        return ["training text" for _ in range(100)]
    else:
        return ["testing text" for _ in range(10)]

def get_transformer(pretrained_fm="tiiuae/falcon-7b-instruct"):
    """

    Args:
        pretrained_fm (str, optional): Location of pre-trained foundation model. Defaults to "tiiuae/falcon-7b-instruct".

    Returns:
        Transformer corresponding to the pretrained model
    """
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_fm,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
    )
    return model

def get_tokenizer(pretrained_fm="tiiuae/falcon-7b-instruct"):
    """

    Args:
        pretrained_fm (str, optional): Location of pre-trained foundation model. Defaults to "tiiuae/falcon-7b-instruct".

    Returns:
        Tokenizer corresponding to the pre-trained foundation model.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_fm,
        padding=True,
        trust_remote_code=True,
        use_fast=True,
    )
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.model_max_length = 512
    return tokenizer

class CustomDataModule(LightningDataModule):
    """

    Args:
        LightningDataModule : Custom datamodule class
    """

    def __init__(
        self,
        train_prompts,
        test_prompts,
        max_seq_length: int = 512,
        train_batch_size: int = 4,
        eval_batch_size: int = 4,
        test_batch_size: int = 2,
        train_val_split: float = 0.9,
        **kwargs,
    ):
        """

        Args:
            train_prompts (list of strings): prompts used for fine-tuning
            test_prompts (list of strings): prompts used for testing
            max_seq_length (int, optional): Max length of tokens where each prompt gets mapped. Defaults to 512.
            train_batch_size (int, optional): Minibatch size for training dataloader. Defaults to 4.
            eval_batch_size (int, optional): Minibatch size for validation dataloader. Defaults to 4.
            test_batch_size (int, optional): Minibatch size for testing dataloader. Defaults to 2.
            train_val_split (float, optional): Train to validation ratio applied for splitting the train_prompts. Defaults to 0.9.
        """
        super().__init__()
        self.train_prompts = train_prompts
        self.test_prompts = test_prompts
        self.dataset = {}
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.test_batch_size = test_batch_size
        self.train_val_split = train_val_split
        self.tokenizer = get_tokenizer()
        self.data_collator = DataCollatorWithPadding(self.tokenizer)

    def setup(self, stage):
        train_val_size = int(self.train_val_split * len(self.train_prompts))

        train_data = self.train_prompts[:train_val_size]
        validation_data = self.train_prompts[train_val_size:]
        test_data = self.test_prompts

        train_df = pd.DataFrame({"text": train_data})
        validation_df = pd.DataFrame({"text": validation_data})
        test_df = pd.DataFrame({"text": test_data})

        self.dataset["train"] = Dataset.from_pandas(train_df)
        self.dataset["validation"] = Dataset.from_pandas(validation_df)
        self.dataset["test"] = Dataset.from_pandas(test_df)

        self.dataset["train"] = self.dataset["train"].map(self.tokenize)
        self.dataset["validation"] = self.dataset["validation"].map(self.tokenize)
        self.dataset["test"] = self.dataset["test"].map(
            lambda example_batch: self.tokenize(example_batch, padding=True)
        )

        self.dataset["train"].set_format(type="torch")
        self.dataset["validation"].set_format(type="torch")
        self.dataset["test"].set_format(type="torch")

        self.dataset["train"] = self.dataset["train"].remove_columns(["text"])
        self.dataset["validation"] = self.dataset["validation"].remove_columns(["text"])
        self.dataset["test"] = self.dataset["test"].remove_columns(["text"])

        print(f"Train size {len(self.dataset['train'])}")
        print(f"Validation size {len(self.dataset['validation'])}")
        print(f"Test size {len(self.dataset['test'])}")

    def train_dataloader(self):
        return DataLoader(
            self.dataset["train"],
            batch_size=self.train_batch_size,
            shuffle=True,
            num_workers=8,
            collate_fn=self.data_collator,
        )

    def val_dataloader(self):
        return DataLoader(
            self.dataset["validation"],
            batch_size=self.eval_batch_size,
            shuffle=False,
            num_workers=8,
            collate_fn=self.data_collator,
        )

    def test_dataloader(self):
        return DataLoader(
            self.dataset["test"],
            batch_size=self.test_batch_size,
            shuffle=False,
            num_workers=8,
            collate_fn=self.data_collator,
        )

    def tokenize(self, example_batch, indices=None, padding=True):
        features = self.tokenizer(
            example_batch["text"],
            padding=padding,
            truncation=True,
            max_length=self.max_seq_length,
        )
        return {
            "input_ids": features["input_ids"],
            "attention_mask": features["attention_mask"],
        }

def setup_peft_config(peft_method="LORA"):
    """Set up the configuration required for applying a PEFT method to the pre-trained model.

    Args:
        peft_method (str, optional): PEFT method to configure ("IA3" or "LORA"). Defaults to "LORA".

    Returns:
        The corresponding peft config object.
    """
    if peft_method == "IA3":
        peft_config = peft.IA3Config(
            task_type="CAUSAL_LM",
            inference_mode=False,
            target_modules=["query_key_value"],
            feedforward_modules=["dense_h_to_4h", "dense_4h_to_h"],
        )
    elif peft_method == "LORA":
        peft_config = peft.LoraConfig(
            task_type="CAUSAL_LM",
            inference_mode=False,
            r=8,
            lora_alpha=8,
            lora_dropout=0.1,
            target_modules=[
                "query_key_value",
                "dense_h_to_4h",
                "dense_4h_to_h",
            ],
        )
    else:
        raise ValueError(f"Unsupported PEFT method: {peft_method}")
    return peft_config
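
# The helper below is referenced in LitFM.test_step but was not included in the
# issue; this hypothetical minimal stub just makes the MWE self-contained.
def update_recommendations_outfile(generated_text, dnm=None, fm=None):
    # Assumed behavior: append the generated text to a per-dataset output file.
    with open(f"recommendations_{dnm}.txt", "a") as f:
        f.write(generated_text + "\n")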

class LitFM(LightningModule):
    """

    Args:
        LightningModule for fine-tuning and testing
    """

    def __init__(
        self,
        use_deepspeed=True,
        peft_method="LORA",
        sanity_check=True,
        test_dataloader=None,
        dnm=None,
        fm=None,
    ):
        """

        Args:
            use_deepspeed (bool, optional): Flag for use of deepspeed acceleration. Defaults to True.
            peft_method (str, optional): PEFT method to be used for fine-tuning. Defaults to "LORA".
            sanity_check (bool, optional): Flag for printing expected input for model forward. Defaults to False. # To be removed
            test_dataloader (dataloader, optional): Test dataloader . Defaults to None. # To be removed
            dnm (str, optional): Fine-tuning dataset name. Defaults to None.
        """
        super().__init__()
        self.tokenizer, self.model = get_tokenizer(pretrained_fm=fm), get_transformer(
            pretrained_fm=fm
        )
        self.model.config.pad_token_id = self.tokenizer.eos_token_id
        if peft_method:
            peft_config = setup_peft_config(peft_method=peft_method)
            print(f"applying {peft_method} PEFT")
            self.model = peft.get_peft_model(self.model, peft_config)
            print("preparations for PEFT done")
            self.model.print_trainable_parameters()
        self.validation_step_outputs = []
        self.use_deepspeed = use_deepspeed
        self.sanity_check = sanity_check
        self.test_dataloader = test_dataloader
        self.dnm, self.fm = dnm, fm
        self.save_hyperparameters()
        self.start_time = time.time()

    def forward(self, **inputs):
        return self.model(**inputs)

    def training_step(self, batch):
        kwargs_for_forward = {
            "input_ids": batch["input_ids"],
            "attention_mask": batch["attention_mask"],
            "labels": batch["input_ids"],
        }
        if self.sanity_check:
            print("inside sanity check")
            # Convert token IDs back to text using the tokenizer's decode method
            decoded_text = self.tokenizer.decode(
                batch["input_ids"][0], skip_special_tokens=True
            )
            print(f"decoded training input : {decoded_text}")

        model_output = self(**kwargs_for_forward)
        loss = model_output.loss
        tensorboard_logs = (
            {"training_loss": loss, "step": self.current_epoch}
            if self.dnm == "ML"
            else {"training_loss": loss}
        )
        # if not (self.use_deepspeed or self.use_ddp) or dist.get_rank() == 0:
        self.log_dict(tensorboard_logs)
        return loss

    def validation_step(self, batch, batch_idx):
        kwargs_for_forward = {
            "input_ids": batch["input_ids"],
            "attention_mask": batch["attention_mask"],
            "labels": batch["input_ids"],
        }
        model_output = self(**kwargs_for_forward)
        loss = model_output.loss
        tensorboard_logs = (
            {"val_loss": loss, "step": self.current_epoch}
            if self.dnm == "ML"
            else {"val_loss": loss}
        )
        # if not (self.use_deepspeed or self.use_ddp) or dist.get_rank() == 0:
        self.log_dict(tensorboard_logs)
        return loss

    def configure_optimizers(self):
        return FusedAdam(self.parameters(), lr=1e-3, weight_decay=1e-2)

    def test_step(self, test_batch, test_batch_idx):
        print("inside test step")
        # Generate text using the model
        # with torch.inference_mode():
        outputs = self.model.generate(
            input_ids=test_batch["input_ids"],
            attention_mask=test_batch["attention_mask"],
            num_return_sequences=1,
            max_new_tokens=512,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
            do_sample=True,
            top_k=10,
        )
        # for output in outputs:
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"generated text : {generated_text}")
        if dist.get_rank() == 0:
            update_recommendations_outfile(generated_text, dnm=self.dnm, fm=self.fm)

# start a fine-tuning task from scratch
def run(args):
    very_beginning = time.time()
    print("Generate prompts")
    train_prompts = generate_seeded_prompts()
    test_prompts = generate_seeded_prompts(for_training=False)
    print("setting up dataloaders")
    dm = CustomDataModule(train_prompts, test_prompts)
    dm.setup("")

    # Set up fine-tuning
    print("setting up model")
    logger = TensorBoardLogger(f"exp_out_{args.dataset}", name="log")
    model = LitFM(
        peft_method=args.peft_method,
        test_dataloader=dm.test_dataloader(),
        dnm=args.dataset,
        fm=args.fm,
    )
    checkpoint_callback = ModelCheckpoint(
        dirpath=f"./checkpoints_{args.dataset}",
        save_top_k=1,
        monitor="val_loss",
        mode="min",
        save_weights_only=True,
    )
    early_stopping_callback = EarlyStopping(monitor="val_loss", mode="min", patience=5)

    print("setting up trainer")
    trainer = Trainer(
        accelerator="gpu",
        precision="bf16",
        max_epochs=20,
        strategy="deepspeed_stage_3",
        num_sanity_val_steps=-1,
        check_val_every_n_epoch=1,
        log_every_n_steps=1,
        logger=logger,
        accumulate_grad_batches=4,
        # gradient_clip_val=1.0,
        callbacks=[checkpoint_callback, early_stopping_callback],
    )

    print("starting fit")
    if not args.checkpoint_path:
        trainer.fit(model, datamodule=dm)
    else:
        print(f"loading checkpoint from : {args.checkpoint_path}")
        trainer.fit(model, datamodule=dm, ckpt_path=args.checkpoint_path)

    checkpoint_path = f"deepspeed_ckpt"
    # trainer.save_checkpoint(checkpoint_path)
    checkpoint_path = checkpoint_callback.best_model_path

    trainer.test(model, datamodule=dm, ckpt_path=checkpoint_path)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="MWE for the deepspeed checkpoint testing issue")
    parser.add_argument("--training_prompts", type=str, default="w/o description")
    parser.add_argument("--peft_method", type=str, default="LORA")
    parser.add_argument("--dataset", type=str, default="MWE")
    parser.add_argument("--checkpoint_path", type=str, default=None)
    parser.add_argument("--fm", type=str, default="tiiuae/falcon-7b-instruct")
    args = parser.parse_args()

    transformers.logging.set_verbosity_error()
    assert torch.cuda.is_available(), "GPU Required"

    run(args)
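
Side note: convert_zero_checkpoint_to_fp32_state_dict is imported above but never used. For reference, a hedged sketch of consolidating the sharded deepspeed checkpoint directory into a single fp32 file (the output filename is hypothetical):

# Collapse the ZeRO-sharded checkpoint into one fp32 checkpoint (sketch, not part of the repro)
convert_zero_checkpoint_to_fp32_state_dict(
    checkpoint_callback.best_model_path,  # deepspeed checkpoint directory written by ModelCheckpoint
    "consolidated.ckpt",  # hypothetical single-file output path
)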

Error messages and logs

inside test step
Traceback (most recent call last):
  File "/home/ubuntu/dion/mwe.py", line 427, in <module>
    run(args)
  File "/home/ubuntu/dion/mwe.py", line 401, in run
    trainer.test(model, datamodule=dm, ckpt_path=checkpoint_path)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 742, in test
    return call._call_and_handle_interrupt(
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 42, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 93, in launch
    return function(*args, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 785, in _test_impl
    results = self._run(model, ckpt_path=ckpt_path)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 980, in _run
    results = self._run_stage()
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1016, in _run_stage
    return self._evaluation_loop.run()
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/loops/utilities.py", line 181, in _decorator
    return loop_run(self, *args, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/loops/evaluation_loop.py", line 115, in run
    self._evaluation_step(batch, batch_idx, dataloader_idx)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/loops/evaluation_loop.py", line 376, in _evaluation_step
    output = call._call_strategy_hook(trainer, hook_name, *step_kwargs.values())
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 293, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/strategies/deepspeed.py", line 919, in test_step
    return self.model(*args, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1783, in forward
    loss = self.module(*inputs, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/lightning/pytorch/overrides/base.py", line 100, in forward
    return self._forward_module.test_step(*inputs, **kwargs)
  File "/home/ubuntu/dion/mwe.py", line 324, in test_step
    outputs = self.model.generate(
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/peft/peft_model.py", line 1110, in generate
    outputs = self.base_model.generate(**kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/transformers/generation/utils.py", line 1821, in generate
    return self.sample(
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/transformers/generation/utils.py", line 3095, in sample
    outputs = self(
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b-instruct/eb410fb6ffa9028e97adb801f0d6ec46d02f8b07/modelling_RW.py", line 753, in forward
    transformer_outputs = self.transformer(
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b-instruct/eb410fb6ffa9028e97adb801f0d6ec46d02f8b07/modelling_RW.py", line 648, in forward
    outputs = block(
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/ubuntu/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b-instruct/eb410fb6ffa9028e97adb801f0d6ec46d02f8b07/modelling_RW.py", line 381, in forward
    layernorm_output = self.input_layernorm(hidden_states)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 190, in forward
    return F.layer_norm(
  File "/home/ubuntu/dion/poc/lib/python3.10/site-packages/torch/nn/functional.py", line 2808, in layer_norm
    return torch.layer_norm(
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument weight in method wrapper_CUDA__native_layer_norm)
[The same traceback is raised on each of the remaining ranks, differing only in the reported device: cuda:1, cuda:3, and cuda:2.]

Environment

absl-py==1.4.0
accelerate==0.21.0
aiohttp==3.8.5
aiosignal==1.3.1
anyio==3.7.1
arrow==1.2.3
async-timeout==4.0.3
attrs==23.1.0
backoff==2.2.1
beautifulsoup4==4.12.2
bitsandbytes==0.40.0.post4
black==23.7.0
blessed==1.20.0
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.7
cmake==3.27.2
croniter==1.4.1
datasets==2.14.4
dateutils==0.6.12
deepdiff==6.3.1
deepspeed==0.10.1
dill==0.3.7
einops==0.6.1
et-xmlfile==1.1.0
exceptiongroup==1.1.3
fastapi==0.101.1
filelock==3.12.2
frozenlist==1.4.0
fsspec==2023.6.0
google-auth==2.22.0
google-auth-oauthlib==1.0.0
grpcio==1.57.0
h11==0.14.0
hjson==3.1.0
huggingface-hub==0.16.4
idna==3.4
inquirer==3.1.3
itsdangerous==2.1.2
Jinja2==3.1.2
lightning==2.0.7
lightning-cloud==0.5.37
lightning-utilities==0.9.0
lit==16.0.6
Markdown==3.4.4
markdown-it-py==3.0.0
MarkupSafe==2.1.3
mdurl==0.1.2
mpmath==1.3.0
multidict==6.0.4
multiprocess==0.70.15
mypy-extensions==1.0.0
networkx==3.1
ninja==1.11.1
numpy==1.25.2
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
oauthlib==3.2.2
openpyxl==3.1.2
ordered-set==4.1.0
packaging==23.1
pandas==2.0.3
pathspec==0.11.2
peft==0.4.0
platformdirs==3.10.0
protobuf==4.24.1
psutil==5.9.5
py-cpuinfo==9.0.0
pyarrow==12.0.1
pyasn1==0.5.0
pyasn1-modules==0.3.0
pydantic==1.10.12
Pygments==2.16.1
PyJWT==2.8.0
python-dateutil==2.8.2
python-editor==1.0.4
python-multipart==0.0.6
pytorch-lightning==2.0.7
pytz==2023.3
PyYAML==6.0.1
readchar==4.0.5
regex==2023.8.8
requests==2.31.0
requests-oauthlib==1.3.1
rich==13.5.2
rsa==4.9
safetensors==0.3.2
scipy==1.11.2
six==1.16.0
sniffio==1.3.0
soupsieve==2.4.1
starlette==0.27.0
starsessions==1.3.0
sympy==1.12
tensorboard==2.14.0
tensorboard-data-server==0.7.1
tokenizers==0.13.3
tomli==1.2.3
torch==2.0.1
torchmetrics==1.0.3
tqdm==4.66.1
traitlets==5.9.0
transformers==4.31.0
triton==2.0.0
typing_extensions==4.7.1
tzdata==2023.3
urllib3==1.26.16
uvicorn==0.23.2
wcwidth==0.2.6
websocket-client==1.6.1
websockets==11.0.3
Werkzeug==2.3.7
xxhash==3.3.0
yarl==1.9.2

cc @awaelchli

dionman commented 1 year ago

@awaelchli