NVIDIA / NeMo

A scalable generative AI framework built for researchers and developers working on Large Language Models, Multimodal, and Speech AI (Automatic Speech Recognition and Text-to-Speech)
https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html
Apache License 2.0

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! #2712

Closed · ali2iptoki closed this issue 3 years ago

ali2iptoki commented 3 years ago

When I execute the training on hi-mia I get the following exception (noting that I removed some speakers from the dataset):

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
    456         )
    457 
--> 458         self._run(model)
    459 
    460         assert self.state.stopped

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model)
    754 
    755         # dispatch `start_training` or `start_evaluating` or `start_predicting`
--> 756         self.dispatch()
    757 
    758         # plugin will finalized fitting (e.g. ddp_spawn will load trained model)

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in dispatch(self)
    795             self.accelerator.start_predicting(self)
    796         else:
--> 797             self.accelerator.start_training(self)
    798 
    799     def run_stage(self):

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in start_training(self, trainer)
     94 
     95     def start_training(self, trainer: 'pl.Trainer') -> None:
---> 96         self.training_type_plugin.start_training(trainer)
     97 
     98     def start_evaluating(self, trainer: 'pl.Trainer') -> None:

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
    142     def start_training(self, trainer: 'pl.Trainer') -> None:
    143         # double dispatch to initiate the training loop
--> 144         self._results = trainer.run_stage()
    145 
    146     def start_evaluating(self, trainer: 'pl.Trainer') -> None:

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in run_stage(self)
    805         if self.predicting:
    806             return self.run_predict()
--> 807         return self.run_train()
    808 
    809     def _pre_training_routine(self):

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in run_train(self)
    867                 with self.profiler.profile("run_training_epoch"):
    868                     # run train epoch
--> 869                     self.train_loop.run_training_epoch()
    870 
    871                 if self.max_steps and self.max_steps <= self.global_step:

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_epoch(self)
    582         if should_check_val:
    583             self.trainer.validating = True
--> 584             self.trainer.run_evaluation(on_epoch=True)
    585             self.trainer.training = True
    586 

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in run_evaluation(self, on_epoch)
   1004 
   1005         # hook
-> 1006         self.evaluation_loop.on_evaluation_end()
   1007 
   1008         # log epoch metrics

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/evaluation_loop.py in on_evaluation_end(self, *args, **kwargs)
    100             self.trainer.call_hook('on_test_end', *args, **kwargs)
    101         else:
--> 102             self.trainer.call_hook('on_validation_end', *args, **kwargs)
    103 
    104         if self.trainer.state.fn != TrainerFn.FITTING:

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in call_hook(self, hook_name, *args, **kwargs)
   1221             if hasattr(self, hook_name):
   1222                 trainer_hook = getattr(self, hook_name)
-> 1223                 trainer_hook(*args, **kwargs)
   1224 
   1225             # next call hook in lightningModule

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/trainer/callback_hook.py in on_validation_end(self)
    225         """Called when the validation loop ends."""
    226         for callback in self.callbacks:
--> 227             callback.on_validation_end(self, self.lightning_module)
    228 
    229     def on_test_start(self):

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py in on_validation_end(self, trainer, pl_module)
    247         if skip:
    248             return
--> 249         self.save_checkpoint(trainer)
    250 
    251     def on_save_checkpoint(

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py in save_checkpoint(self, trainer, unused)
    296         # here we call each mode sequentially
    297         # Mode 1: save the top k checkpoints
--> 298         self._save_top_k_checkpoint(trainer, monitor_candidates)
    299         # Mode 2: save monitor=None checkpoints
    300         self._save_none_monitor_checkpoint(trainer, monitor_candidates)

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py in _save_top_k_checkpoint(self, trainer, monitor_candidates)
    667 
    668         if self.check_monitor_top_k(trainer, current):
--> 669             self._update_best_and_save(current, trainer, monitor_candidates)
    670         elif self.verbose:
    671             epoch = monitor_candidates.get("epoch")

~/repos/algo/app/aliSrc/aliEnv/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py in _update_best_and_save(self, current, trainer, monitor_candidates)
    718 
    719         _op = min if self.mode == "min" else max
--> 720         self.best_model_path = _op(self.best_k_models, key=self.best_k_models.get)
    721         self.best_model_score = self.best_k_models[self.best_model_path]
    722 

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
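
The traceback ends inside PyTorch Lightning's ModelCheckpoint callback while it compares the monitored metric values it keeps in best_k_models; the message itself is PyTorch's generic device-mismatch error. A minimal sketch (plain PyTorch, unrelated to NeMo, assuming a CUDA device is available) that raises the same message:

import torch

# one monitored value kept on the GPU, another kept on the CPU
score_on_gpu = torch.tensor([0.9], device="cuda")
score_on_cpu = torch.tensor([0.8])

# comparing them, as the checkpoint callback does when ranking checkpoints, raises:
# RuntimeError: Expected all tensors to be on the same device,
# but found at least two devices, cuda:0 and cpu!
score_on_gpu < score_on_cpu
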
nithinraok commented 3 years ago

Can you provide the following information:

  1. Which script are you using to train, and its config file?
  2. Did you set the number of GPUs in the config, and the accelerator to ddp (the default)?
  3. Which branch are you using, and is it up to date?
ali2iptoki commented 3 years ago

@nithinraok I am using the following script:

import os
import torch

import pytorch_lightning as pl
from omegaconf.listconfig import ListConfig
from pytorch_lightning import seed_everything

from nemo.collections.asr.models import EncDecSpeakerLabelModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
from argparse import ArgumentParser
from omegaconf import OmegaConf
import time
import sys

seed_everything(42)

# You can use this script by executing the following command:

# for hi-mia:
# python /home/ubuntu/repos/algo/app/aliSrc/src/train.py --data_dir "/home/ubuntu/repos/algo/app/aliSrc/dataset/himia/train" --num_classes "64" --model_name "himia.nemo" --config_name 'SpeakerNet_recognition_3x2x512.yaml'

# In general, the arguments are:
# --data_dir    : where our data is
# --num_classes : number of unique labels in our dataset
# --model_name  : name to give the saved trained model
# --config_name : name of the configuration file used
# --batch_size  : batch size to use
# --type_model  : which task we do (identification or verification)
# --max_epochs  : maximum number of epochs

def timer(start, end):
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    logging.info("Elapsed time {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

def main():

    seed_everything(42)
    parser = ArgumentParser()

    parser.add_argument(
        "--data_dir", type=str, required=True, help="path to train data directory"
    )

    parser.add_argument(
        "--conf_dir", type=str, default="/home/ubuntu/repos/algo/app/aliSrc/notebooks/conf", help="path to conf directory"
    )

    parser.add_argument(
        "--config_name", type=str, required=True, help="name of the configuration file"
    )

    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--type_model", type=str, default='verification', help="task to done identification or verification" )
    parser.add_argument("--num_classes", type=int, default=2)
    parser.add_argument("--max_epochs", type=int, default=200)
    parser.add_argument("--model_name", type=str, help = "name to give for the learned model")
    parser.add_argument("--exp_dir", type=str, default="/home/ubuntu/repos/algo/app/aliSrc/notebooks/nemo_experiments/SpeakerNet", help = "path to experiment directory")

    args = parser.parse_args()
    logging.info(args)

    conf_dir = args.conf_dir
    config_name = args.config_name
    MODEL_CONFIG = os.path.join(conf_dir, config_name)

    cfg = OmegaConf.load(MODEL_CONFIG)
    #logging.info(OmegaConf.to_yaml(cfg))  

    #  set up the paths of manifests in the config
    train_manifest = os.path.join(args.data_dir,'train.json')
    test_manifest = os.path.join(args.data_dir,'dev.json')

    cfg.model.train_ds.manifest_filepath = train_manifest
    cfg.model.validation_ds.manifest_filepath = train_manifest
    cfg.model.test_ds.manifest_filepath = test_manifest

    cfg.model.decoder.num_classes =  args.num_classes

    if args.type_model == 'verification':
        cfg.model.decoder.angular = True

    # Check whether a GPU is available and use it
    cuda = 1 if torch.cuda.is_available() else 0
    cfg.trainer.gpus = cuda

    if cuda == 0:
        logging.info(f"cuda = {cuda}: no GPU is available. Please try again!")
        sys.exit()

    # Set up the max_epochs parameters. 
    cfg.trainer.max_epochs =  args.max_epochs

    # Remove distributed training flags
    cfg.trainer.accelerator = None

    cfg.trainer.amp_level ='O1'
    cfg.trainer.precision = 16

    trainer = pl.Trainer(**cfg.trainer)

    cfg.exp_manager.name= cfg.exp_manager.use_datetime_version=False
    cfg.exp_manager.exp_dir=os.path.join(args.exp_dir, './speaker_exps')

    log_dir = exp_manager(trainer, cfg.get("exp_manager", None))

    speaker_model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer)
    logging.info(log_dir)

    time_start =  time.time()
    trainer.fit(speaker_model) #speaker_model is an EncDecSpeakerLabelModel
    time_end = time.time()
    timer(time_start, time_end)

    # .nemo model
    model_name = args.model_name
    if not trainer.fast_dev_run:
        model_path = os.path.join(log_dir, '..', model_name)
        speaker_model.save_to(model_path)

# entry point to run the training
if __name__ == '__main__':
    main()

and I am using the SpeakerNet_recognition_3x2x512.yaml config file.

nithinraok commented 3 years ago

Everything in this script duplicates existing functionality and is not required. You can simply use the speaker_reco.py script.

You can call the script as follows:

EXP_NAME=sample_run
python ./speaker_reco.py --config_path='conf' --config_name='SpeakerNet_verification_3x2x512.yaml' \
    trainer.max_epochs=30  \
    model.train_ds.batch_size=64 model.validation_ds.batch_size=64 \
    model.train_ds.manifest_filepath="<train_manifest>" model.validation_ds.manifest_filepath="<dev_manifest>" \
    trainer.gpus=<NUM_GPUS> \
    model.decoder.params.num_classes=<NUM_CLASSES> \
    exp_manager.name=$EXP_NAME +exp_manager.use_datetime_version=False \
    exp_manager.exp_dir='./speaker_exps'

Some suggestions:

  1. Don't use the precision and amp_level settings for now. Set precision=32 and ignore amp_level (it only takes effect if you installed NVIDIA Apex; native PyTorch supports mixed precision now). See the sketch after this list.
  2. You have set accelerator=None; the default is ddp.
  3. You are using the same manifest for training and validation; use a held-out part of the train set, not the whole set.
  4. If you are only training, you don't need a test set. If you would like to use a test set to infer labels, add a test_ds section to the config, similar to validation_ds and train_ds. Alternatively, you can use the inference script once training is finished and you have your trained model.
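
For reference, a minimal sketch of how the config overrides in the train.py posted above could be adjusted to follow suggestions 1-3 (same OmegaConf layout and dev.json manifest as that script; this is an assumed adjustment, not NeMo's official recipe):

# Sketch only: adjust the overrides in the train.py posted above.

# 1. Keep full precision and drop the Apex-only amp_level override.
cfg.trainer.precision = 32
# (do not set cfg.trainer.amp_level)

# 2. Leave the accelerator at its default (ddp) instead of forcing it to None.
# cfg.trainer.accelerator = None   # <- remove this line

# 3. Validate on a held-out manifest instead of the training manifest.
cfg.model.train_ds.manifest_filepath = os.path.join(args.data_dir, 'train.json')
cfg.model.validation_ds.manifest_filepath = os.path.join(args.data_dir, 'dev.json')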