Deci-AI / super-gradients

Easily train or fine-tune SOTA computer vision models with one open source training library. The home of Yolo-NAS.
https://www.supergradients.com
Apache License 2.0

Code crashes during multi-epoch DDP training #2065

Open TychoBomer opened 4 days ago

TychoBomer commented 4 days ago

💡 Your Question

So when training, everything seems fine at first, but after a few epochs the training stops and I get an error that looks like this:

```
torch.distributed.elastic.multiprocessing.api.SignalException: Process 22636 got signal: 2
```

Now the problem is clearly in DDP, but it happens mid-training, which I find very strange. It also does not always happen: the training once completed all 999 iterations. Mostly it crashes somewhere around epochs 30-80.

I have two GPUs, and I can see that both are initially working, so DDP must be set up correctly, but then it just stops. Here is my training script:

```python
import os

from super_gradients import init_trainer
from super_gradients.training import models, Trainer
from super_gradients.common import MultiGPUMode  # needed for parallel processing
from super_gradients.training.utils.distributed_training_utils import setup_device
from super_gradients.training.dataloaders.dataloaders import (
    coco_detection_yolo_format_train,
    coco_detection_yolo_format_val,
)
from super_gradients.training.losses import PPYoloELoss
from super_gradients.training.metrics import (
    DetectionMetrics_050,
    DetectionMetrics_050_095,
    DetectionMetrics_075,
)
from super_gradients.training.models.detection_models.pp_yolo_e import PPYoloEPostPredictionCallback

# Train configuration files
from configurations import TrainModelConfig, TrainParamConfig

def _train():

    # NOTE: CHECKPOINT_DIR must be an absolute path for model loading!
    # Get the absolute path of the current script or working directory
    project_root = os.path.dirname(os.path.abspath(__file__))
    CHECKPOINT_DIR: str = os.path.join(project_root, TrainModelConfig.checkpoints_dir_name)
    CHECKPOINT_DIR_FOLDER: str = os.path.abspath(CHECKPOINT_DIR)
    os.makedirs(CHECKPOINT_DIR_FOLDER, exist_ok=True)

    classes_file = os.path.join(TrainModelConfig.dataset_folder_location, "classes.txt")
    with open(classes_file) as f:
        CLASSES = [line.strip() for line in f]

    # Build trainer object
    trainer = Trainer(experiment_name=TrainModelConfig.experiment_name, ckpt_root_dir=CHECKPOINT_DIR_FOLDER)

    # Specify model
    model = models.get(
        TrainModelConfig.model_version,
        num_classes=len(CLASSES),
        pretrained_weights='coco',
    ).to(TrainModelConfig.device)

    # Dataset parameters for the dataloaders
    dataset_params = {
        'data_dir': TrainModelConfig.dataset_folder_location,
        'train_images_dir': 'train/images',
        'train_labels_dir': 'train/labels',
        'val_images_dir': 'val/images',
        'val_labels_dir': 'val/labels',
        'test_images_dir': 'test/images',
        'test_labels_dir': 'test/labels',
        'classes': CLASSES,
        # Change to desired input dimension. TODO: extract from train image sizes? They seem to differ in the set.
        'input_dim': TrainModelConfig.input_dim,
    }

    # Set up the dataloaders
    train_data = coco_detection_yolo_format_train(
        dataset_params={
            'data_dir': dataset_params['data_dir'],
            # 'input_dim': dataset_params['input_dim'],
            'images_dir': dataset_params['train_images_dir'],
            'labels_dir': dataset_params['train_labels_dir'],
            'classes': dataset_params['classes'],
        },
        dataloader_params={
            'batch_size': TrainModelConfig.batch_size,
            'num_workers': TrainModelConfig.num_workers,
        },
    )

    val_data = coco_detection_yolo_format_val(
        dataset_params={
            'data_dir': dataset_params['data_dir'],
            # 'input_dim': dataset_params['input_dim'],
            'images_dir': dataset_params['val_images_dir'],
            'labels_dir': dataset_params['val_labels_dir'],
            'classes': dataset_params['classes'],
        },
        dataloader_params={
            'batch_size': TrainModelConfig.batch_size,
            'num_workers': TrainModelConfig.num_workers,
        },
    )

    test_data = coco_detection_yolo_format_val(
        dataset_params={
            'data_dir': dataset_params['data_dir'],
            # 'input_dim': dataset_params['input_dim'],
            'images_dir': dataset_params['test_images_dir'],
            'labels_dir': dataset_params['test_labels_dir'],
            'classes': dataset_params['classes'],
        },
        dataloader_params={
            'batch_size': TrainModelConfig.batch_size,
            'num_workers': TrainModelConfig.num_workers,
        },
    )

    # Remove MIXUP from the data augmentation list
    train_data.dataset.transforms.pop(2)
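    # Popping by a hard-coded index is fragile if the transform list ever changes.
    # A type-based removal would be safer; a sketch, assuming the mixup transform
    # is super-gradients' DetectionMixup (verify the class name for your version):
    #
    #   from super_gradients.training.transforms.transforms import DetectionMixup
    #   train_data.dataset.transforms = [
    #       t for t in train_data.dataset.transforms if not isinstance(t, DetectionMixup)
    #   ]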

    # Training parameter setup
    train_params = {
        'run_validation_freq': TrainParamConfig.run_validation_freq,
        'run_test_freq': TrainParamConfig.run_test_freq,
        'silent_mode': TrainParamConfig.silent_mode,
        "average_best_models": TrainParamConfig.average_best_models,
        "warmup_mode": TrainParamConfig.warmup_mode,
        "warmup_initial_lr": TrainParamConfig.warmup_initial_lr,
        "lr_warmup_epochs": TrainParamConfig.lr_warmup_epochs,
        "initial_lr": TrainParamConfig.initial_lr,
        "lr_mode": TrainParamConfig.lr_mode,
        "cosine_final_lr_ratio": TrainParamConfig.cosine_final_lr_ratio,
        "optimizer": TrainParamConfig.optimizer,
        "optimizer_params": TrainParamConfig.optimizer_params,
        "zero_weight_decay_on_bias_and_bn": TrainParamConfig.zero_weight_decay_on_bias_and_bn,
        "ema": TrainParamConfig.ema,
        "ema_params": TrainParamConfig.ema_params,
        "max_epochs": TrainParamConfig.max_epochs,
        "mixed_precision": TrainParamConfig.mixed_precision,
        "loss": PPYoloELoss(
            use_static_assigner=False,
            num_classes=len(dataset_params['classes']),
            reg_max=16,
        ),
        "valid_metrics_list": [
            DetectionMetrics_050(
                score_thres=0.4,
                top_k_predictions=300,
                num_cls=len(dataset_params['classes']),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
                calc_best_score_thresholds=True,
            ),
            DetectionMetrics_050_095(
                score_thres=0.4,
                top_k_predictions=300,
                num_cls=len(dataset_params['classes']),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
                calc_best_score_thresholds=True,
            ),
            # NOTE: DetectionMetrics_075 not yet supported
            # DetectionMetrics_075(
            #     score_thres=0.1,
            #     top_k_predictions=300,
            #     num_cls=len(dataset_params['classes']),
            #     normalize_targets=True,
            #     post_prediction_callback=PPYoloEPostPredictionCallback(
            #         score_threshold=0.01,
            #         nms_top_k=1000,
            #         max_predictions=300,
            #         nms_threshold=0.7,
            #     ),
            # ),
        ],
        "metric_to_watch": 'mAP@0.50',
        "sg_logger": "clearml_sg_logger",
        # Params passed to __init__ of super_gradients.common.sg_loggers.clearml_sg_logger.ClearMLSGLogger
        "sg_logger_params": {
            "project_name": "BHTDefectDetection",  # ClearML project name
            "save_checkpoints_remote": False,
            "save_tensorboard_remote": True,
            "save_logs_remote": True,
        },
    }

    # Start the trainer using the setup (see configurations.py)
    trainer.train(
        model=model,
        training_params=train_params,
        train_loader=train_data,
        valid_loader=val_data,
        test_loaders={'test_set': test_data},
    )

def train() -> None:
    """Main entry point to start the training and set up the GPU device."""
    # Set up the environment trainer
    init_trainer()
    if TrainModelConfig.multi_gpu_version.upper() == 'DDP':
        # Launch DDP on num_gpus GPUs; this should be the go-to for parallel processing!
        setup_device(device=TrainModelConfig.device, multi_gpu=MultiGPUMode.DISTRIBUTED_DATA_PARALLEL, num_gpus=TrainModelConfig.num_gpus)
    elif TrainModelConfig.multi_gpu_version.upper() == 'DP':
        # Launch DP on num_gpus GPUs -> NOTE: not working (yet)
        setup_device(multi_gpu=MultiGPUMode.DATA_PARALLEL, num_gpus=TrainModelConfig.num_gpus)

    # Call _train()
    _train()


if __name__ == '__main__':
    train()
```
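The configurations module imported at the top is not shown in the issue. Purely for illustration, here is a hypothetical stand-in with the field names the script actually reads; every value below is a placeholder, not the asker's real setup:

```python
# Hypothetical configurations.py sketch: the field names mirror what the
# training script above accesses, but all values are illustrative placeholders.
class TrainModelConfig:
    experiment_name = "yolo_nas_experiment"
    checkpoints_dir_name = "checkpoints"
    dataset_folder_location = "/abs/path/to/dataset"
    model_version = "yolo_nas_l"
    device = "cuda"
    input_dim = (640, 640)
    batch_size = 16
    num_workers = 4
    multi_gpu_version = "DDP"  # or "DP"
    num_gpus = 2


class TrainParamConfig:
    run_validation_freq = 1
    run_test_freq = 1
    silent_mode = False
    average_best_models = True
    warmup_mode = "LinearEpochLRWarmup"
    warmup_initial_lr = 1e-6
    lr_warmup_epochs = 3
    initial_lr = 5e-4
    lr_mode = "cosine"
    cosine_final_lr_ratio = 0.1
    optimizer = "Adam"
    optimizer_params = {"weight_decay": 0.0001}
    zero_weight_decay_on_bias_and_bn = True
    ema = True
    ema_params = {"decay": 0.9, "decay_type": "threshold"}
    max_epochs = 100
    mixed_precision = True
```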

Versions

No response

msciancalepore98 commented 4 days ago

How are you launching your training process? This can happen if you use nohup or even multiplexers like tmux.
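For context: torch's elastic launcher reports the raw signal number in that SignalException, and signal 2 is SIGINT, i.e. the keyboard interrupt a terminal (or, depending on configuration, a closing tmux/ssh session) can deliver to the training process group. A quick standard-library check of the mapping:

```python
import signal

# Decode the "got signal: 2" from the traceback: 2 is SIGINT,
# the interrupt a terminal sends on Ctrl+C (or on session teardown).
print(signal.Signals(2).name)  # -> SIGINT
```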

TychoBomer commented 3 days ago

> How are you launching your training process? This can happen if you use nohup or even multiplexers like tmux.

This was indeed the problem, thanks!