Can you provide the following information:

ddp (default)

@nithinraok I am using the following script:
import os
import torch
import pytorch_lightning as pl
from omegaconf.listconfig import ListConfig
from pytorch_lightning import seed_everything
from nemo.collections.asr.models import EncDecSpeakerLabelModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
from argparse import ArgumentParser
from omegaconf import OmegaConf
import time
import sys
seed_everything(42)
# You can use this script by executing the following command:
# for hi-mia:
# python /home/ubuntu/repos/algo/app/aliSrc/src/train.py --data_dir "/home/ubuntu/repos/algo/app/aliSrc/dataset/himia/train" --num_classes "64" --model_name "himia.nemo" --config_name 'SpeakerNet_recognition_3x2x512.yaml'
# In general, the arguments are:
#   --data_dir    : path to the data directory
#   --num_classes : number of unique labels in our dataset
#   --model_name  : name to give to the saved trained model
#   --config_name : name of the configuration file to use
#   --batch_size  : batch size to use
#   --type_model  : task to perform (identification or verification)
#   --max_epochs  : maximum number of epochs
def timer(start, end):
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    logging.info("Elapsed time {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

def main():
    seed_everything(42)
    parser = ArgumentParser()
    parser.add_argument(
        "--data_dir", type=str, required=True, help="path to train data directory"
    )
    parser.add_argument(
        "--conf_dir", type=str, default="/home/ubuntu/repos/algo/app/aliSrc/notebooks/conf", help="path to conf directory"
    )
    parser.add_argument(
        "--config_name", type=str, required=True, help="name of the configuration file"
    )
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--type_model", type=str, default='verification', help="task to perform: identification or verification")
    parser.add_argument("--num_classes", type=int, default=2)
    parser.add_argument("--max_epochs", type=int, default=200)
    parser.add_argument("--model_name", type=str, help="name to give to the trained model")
    parser.add_argument("--exp_dir", type=str, default="/home/ubuntu/repos/algo/app/aliSrc/notebooks/nemo_experiments/SpeakerNet", help="path to experiment directory")
    args = parser.parse_args()
    logging.info(args)
    conf_dir = args.conf_dir
    config_name = args.config_name
    MODEL_CONFIG = os.path.join(conf_dir, config_name)
    cfg = OmegaConf.load(MODEL_CONFIG)
    # logging.info(OmegaConf.to_yaml(cfg))

    # Set up the manifest paths in the config.
    train_manifest = os.path.join(args.data_dir, 'train.json')
    test_manifest = os.path.join(args.data_dir, 'dev.json')
    cfg.model.train_ds.manifest_filepath = train_manifest
    cfg.model.validation_ds.manifest_filepath = train_manifest  # note: validation uses the train manifest here
    cfg.model.test_ds.manifest_filepath = test_manifest
    cfg.model.decoder.num_classes = args.num_classes
    if args.type_model == 'verification':
        cfg.model.decoder.angular = True

    # Check whether a GPU is available and use it.
    cuda = 1 if torch.cuda.is_available() else 0
    cfg.trainer.gpus = cuda
    if cuda == 0:
        logging.info("CUDA is not available; aborting. Please try again on a machine with a GPU.")
        sys.exit()

    # Set the max_epochs parameter.
    cfg.trainer.max_epochs = args.max_epochs
    # Remove distributed-training flags.
    cfg.trainer.accelerator = None
    cfg.trainer.amp_level = 'O1'
    cfg.trainer.precision = 16
    trainer = pl.Trainer(**cfg.trainer)

    # Use a fixed version directory instead of a datetime-based one.
    cfg.exp_manager.use_datetime_version = False
    cfg.exp_manager.exp_dir = os.path.join(args.exp_dir, 'speaker_exps')
    log_dir = exp_manager(trainer, cfg.get("exp_manager", None))
    speaker_model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer)
    logging.info(log_dir)
    time_start = time.time()
    trainer.fit(speaker_model)  # speaker_model is an EncDecSpeakerLabelModel
    time_end = time.time()
    timer(time_start, time_end)

    # Save the trained model as a .nemo file.
    model_name = args.model_name
    if not trainer.fast_dev_run:
        model_path = os.path.join(log_dir, '..', model_name)
        speaker_model.save_to(model_path)

# Entry point: run the training.
if __name__ == '__main__':
    main()
and I am using the SpeakerNet_recognition_3x2x512.yaml config file.
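Since --num_classes must match the number of unique speaker labels in the training manifest (especially after removing speakers), here is a minimal sanity check; this is a sketch that assumes a NeMo-style manifest with one JSON object per line carrying a "label" field, and the manifest path is illustrative:

import json

def count_labels(manifest_path):
    # Collect the unique "label" values across all manifest entries.
    labels = set()
    with open(manifest_path) as f:
        for line in f:
            if line.strip():
                labels.add(json.loads(line)["label"])
    return len(labels)

# e.g. the train manifest built by the script above
print(count_labels("/home/ubuntu/repos/algo/app/aliSrc/dataset/himia/train/train.json"))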
Everything in this script duplicates existing functionality and is not required. You may simply use the speaker_reco.py script.
You may call the script as:
EXP_NAME=sample_run
python ./speaker_reco.py --config_path='conf' --config_name='SpeakerNet_verification_3x2x512.yaml' \
trainer.max_epochs=30 \
model.train_ds.batch_size=64 model.validation_ds.batch_size=64 \
model.train_ds.manifest_filepath="<train_manifest>" model.validation_ds.manifest_filepath="<dev_manifest>" \
trainer.gpus=<NUM_GPUS> \
model.decoder.params.num_classes=<NUM_CLASSES> \
exp_manager.name=$EXP_NAME +exp_manager.use_datetime_version=False \
exp_manager.exp_dir='./speaker_exps'
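The `+` in `+exp_manager.use_datetime_version=False` is Hydra's syntax for adding a key that is not already present in the config. Once training finishes, the saved .nemo file can be restored for inference; a minimal sketch follows (the restore path is illustrative and depends on exp_manager.exp_dir and the run name):

from nemo.collections.asr.models import EncDecSpeakerLabelModel

# Restore the trained speaker model from the .nemo checkpoint saved after training.
model = EncDecSpeakerLabelModel.restore_from("./speaker_exps/sample_run/sample_run.nemo")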
When I execute the training on hi-mia I get the following exception (note that I removed some speakers from the dataset):