ray-project / ray

Ray is an AI compute engine. Ray consists of a core distributed runtime and a set of AI Libraries for accelerating ML workloads.
https://ray.io
Apache License 2.0

How can I hyperparameter tune over multiple GPUs? #41732

Open · HannahAlexander opened this issue 11 months ago

HannahAlexander commented 11 months ago

Hi, I can train my model over 4 GPUs, or I can apply hyperparameter tuning to my model, but I can't seem to do both at the same time. When I try to hyperparameter tune with gpus = 4, only 1 GPU is used. In the code below I have tried to combine both methods, but it doesn't work because it creates 2 log folders with separate names (based on the current time). Does anyone know how I can achieve hyperparameter tuning over multiple GPUs?

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from filelock import FileLock
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
import ray
from ray import train, tune
from ray.train import Checkpoint
from ray.tune.schedulers import ASHAScheduler
from super_gradients.training.utils.distributed_training_utils import setup_device
from super_gradients.common.data_types.enum import MultiGPUMode
from super_gradients.training import Trainer
from super_gradients.training.dataloaders.dataloaders import (
    coco_detection_yolo_format_train,
    coco_detection_yolo_format_val,
)
from super_gradients.training import models
from super_gradients.training.losses import PPYoloELoss
from super_gradients.training.metrics import (
    DetectionMetrics_050,
    DetectionMetrics_050_095,
)
from super_gradients.training.models.detection_models.pp_yolo_e import (
    PPYoloEPostPredictionCallback,
)
import sys
sys.path.append("src")
from config import dataset_params
from ray.train import ScalingConfig
sys.path.append("src/visualisations")
from model_plots import plot_confusion_matrix
from ray.train.torch import TorchTrainer

def load_data(dataset_params = dataset_params):

    # list of the possible classes
    classes = [
        "drone",
        "Herc",
        "Bird",
        "FixedWing",
        "Person",
        "Aircraft",
        "Helicopter",
        "FighterJet",
    ]

    dataset_params = {
        "data_dir": "../../../SageMaker/sample_data_small",  # root directory of data
        "train_images_dir": "images/train/",  # train images
        "train_labels_dir": "labels/train/",  # train labels
        "val_images_dir": "images/valid/",  # validation images
        "val_labels_dir": "labels/valid/",  # validation labels
        "test_images_dir": "images/test/",  # test images
        "test_labels_dir": "labels/test/",  # test labels
        "classes": classes,
    }

    # create dataloaders for yolonas model
    train_data = coco_detection_yolo_format_train(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["train_images_dir"],
            "labels_dir": dataset_params["train_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "shuffle": True,
            "pin_memory": True,
            "batch_size": 10,
            "num_workers": 4,
        },
    )

    val_data = coco_detection_yolo_format_val(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["val_images_dir"],
            "labels_dir": dataset_params["val_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "batch_size": 10,
            "num_workers": 4,
            "shuffle": True,
            "pin_memory": True,
        },
    )

    return train_data, val_data

def train_func(config):
    # list of the possible classes
    classes = [
        "drone",
        "Herc",
        "Bird",
        "FixedWing",
        "Person",
        "Aircraft",
        "Helicopter",
        "FighterJet",
    ]
    torch.cuda.empty_cache()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data, val_data = load_data()

    # set training parameters
    train_params = {
        "silent_mode": False,
        "average_best_models": True,
        "warmup_mode": "linear_epoch_step",
        "warmup_initial_lr": 1e-6,
        "lr_warmup_epochs": 3,
        "initial_lr": config["initial_lr"],
        "lr_mode": "cosine",
        "cosine_final_lr_ratio": 0.1,
        "optimizer": config["optimizer"],
        "optimizer_params": {"weight_decay": config["weight_decay"]},
        "zero_weight_decay_on_bias_and_bn": True,
        "ema": True,
        "ema_params": {"decay": 0.9, "decay_type": "threshold"},
        "max_epochs": config["epochs"],
        "mixed_precision": True,
        "loss": PPYoloELoss(
            use_static_assigner=False,
            num_classes=len(dataset_params["classes"]),
            reg_max=16,
        ),
        "valid_metrics_list": [
            DetectionMetrics_050(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
            ),
            DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
            ),
        ],
        "metric_to_watch": "mAP@0.50:0.95",
    }
    model = models.get(
        "yolo_nas_s",
        num_classes=8,
        pretrained_weights="coco",
    )
    CHECKPOINT_DIR = "../../../SageMaker/outputs/logs"
    # create a unique identifier for model versioning
    UI = f'{config["epochs"]}_{config["confidence_threshold"]}_{config["initial_lr"]}_{config["optimizer"]}'
    # for each of the model types specified, train the model
    experiment_name = config["model_to_train"] + "-" + UI
    model = model.to(device)
    trainer = Trainer(experiment_name=experiment_name, ckpt_root_dir=CHECKPOINT_DIR)
    trainer.train(
            model=model,
            training_params=train_params,
            train_loader=train_data,
            valid_loader=val_data,
        )

    metrics = trainer.test(
        model=model,
        test_loader=val_data,
        test_metrics_list=DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(classes),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=1000,
                max_predictions=300,
                nms_threshold=0.7
            )
        )
    )
    # Return the metric you want to optimize (e.g., mAP@0.50:0.95)
    best_metric = metrics['mAP@0.50:0.95']
    train.report({'mAP@0.50:0.95':best_metric})
    print(best_metric)

def main():
    num_samples=10
    max_num_epochs=10
    config = {
        "initial_lr": tune.loguniform(1e-6, 1e-2),
        "optimizer": tune.choice(["adam", "sgd"]),
        "epochs": tune.randint(1, 20),
        "model_to_train": tune.choice(["yolo_nas_s", "yolo_nas_m", "yolo_nas_l"]),
        "weight_decay": tune.loguniform(1e-6, 1e-3),
        "confidence_threshold": tune.loguniform(0.1, 1),
    }
    scheduler = ASHAScheduler(
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(train_func),
            resources={"cpu": 2, "gpu": 4}
        ),
        tune_config=tune.TuneConfig(
            metric="mAP@0.50:0.95",
            mode="max",
            scheduler=scheduler,
            num_samples=num_samples,
        ),
        param_space=config,
    )

    results = tuner.fit()

    best_result = results.get_best_result("mAP@0.50:0.95", "max")

    print("Best trial config: {}".format(best_result.config))
    print("Best trial final validation loss: {}".format(
        best_result.metrics["mAP@0.50:0.95"]))

# Define a Ray TorchTrainer to launch `main` (which itself runs the Tuner) on all workers
# ========================================================================================
ray_trainer = TorchTrainer(
    main,
    scaling_config=ScalingConfig(num_workers=4, use_gpu=True),
)
result = ray_trainer.fit()
justinvyu commented 11 months ago

@HannahAlexander You'll want to "tune" over the "distributed trainer".

trainer = TorchTrainer(
    ..., scaling_config=ray.train.ScalingConfig(num_workers=4, use_gpu=True)
)
tuner = ray.tune.Tuner(
    trainer,
    param_space={"train_loop_config": {"lr": tune.grid_search([0.1, 0.01, 0.001])}}
)
tuner.fit()  # 3 x 4 GPUs needed to run all trials concurrently

See this user guide for more explanation: https://docs.ray.io/en/latest/train/user-guides/hyperparameter-optimization.html
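
For reference, a minimal self-contained version of this pattern might look like the sketch below. This is illustrative only, not code from the thread: the `train_loop` body, the metric name `"score"`, and the grid of learning rates are placeholders.

import ray.train
from ray import tune
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

def train_loop(config):
    # Per-worker training code; `config` is the sampled train_loop_config.
    for epoch in range(2):
        # ... train one epoch, compute a validation metric ...
        ray.train.report({"score": 0.0})  # placeholder metric value

# Each trial launches 4 GPU workers; Tune samples hyperparameters on top.
trainer = TorchTrainer(
    train_loop,
    scaling_config=ScalingConfig(num_workers=4, use_gpu=True),
)
tuner = tune.Tuner(
    trainer,
    param_space={"train_loop_config": {"lr": tune.grid_search([0.1, 0.01, 0.001])}},
    tune_config=tune.TuneConfig(metric="score", mode="max"),
)
results = tuner.fit()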

HannahAlexander commented 11 months ago

Hi, thank you for your reply! I've implemented the above, which seemed to do what I was hoping for, but it caused a memory error. After further inspection, it appears that it launches all 4 GPUs but then only trains on 1 (see attached screenshots).

# Imports assumed to carry over from the first snippet, plus the names used
# below that were never imported (paths per the super_gradients and Ray Tune
# docs; adjust if your versions differ):
import super_gradients
from super_gradients.training.utils.early_stopping import EarlyStop
from super_gradients.training.utils.callbacks import Phase
from ray.tune.search.optuna import OptunaSearch

def train_func(config):
    # Define Parameters
    super_gradients.init_trainer()
    early_stop_map = EarlyStop(Phase.VALIDATION_EPOCH_END, monitor="mAP@0.50:0.95", mode="max", patience=3, verbose=True)

    # list of the possible classes
    classes = [
        "drone",
        "Herc",
        "Bird",
        "FixedWing",
        "Person",
        "Aircraft",
        "Helicopter",
        "FighterJet",
    ]
    torch.cuda.empty_cache()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # train_data, val_data = load_data()

    dataset_params = {
        "data_dir": "../../../SageMaker/data/sample_data_small",  # root directory of data
        "train_images_dir": "images/train/",  # train images
        "train_labels_dir": "labels/train/",  # train labels
        "val_images_dir": "images/valid/",  # validation images
        "val_labels_dir": "labels/valid/",  # validation labels
        "test_images_dir": "images/test/",  # test images
        "test_labels_dir": "labels/test/",  # test labels
        "classes": classes,
    }

    # create dataloaders for yolonas model
    train_data = coco_detection_yolo_format_train(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["train_images_dir"],
            "labels_dir": dataset_params["train_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "shuffle": True,
            "pin_memory": True,
            "batch_size": config["batch_size"],
            "num_workers": 4,
        },
    )

    val_data = coco_detection_yolo_format_val(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["val_images_dir"],
            "labels_dir": dataset_params["val_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "batch_size": config["batch_size"],
            "num_workers": 4,
            "shuffle": True,
            "pin_memory": True,
        },
    )

    # set training parameters
    train_params = {
        "silent_mode": False,
        "average_best_models": True,
        "warmup_mode": "linear_epoch_step",
        "warmup_initial_lr": 1e-6,
        "lr_warmup_epochs": 3,
        "initial_lr": config["initial_lr"],
        "lr_mode": "cosine",
        "cosine_final_lr_ratio": 0.1,
        "optimizer": config["optimizer"],
        "optimizer_params": {"weight_decay": config["weight_decay"]},
        "zero_weight_decay_on_bias_and_bn": True,
        "ema": True,
        "ema_params": {"decay": 0.9, "decay_type": "threshold"},
        "max_epochs": config["epochs"],
        "mixed_precision": True,
        "loss": PPYoloELoss(
            use_static_assigner=False,
            num_classes=len(dataset_params["classes"]),
            reg_max=16,
        ),
        "valid_metrics_list": [
            DetectionMetrics_050(
                score_thres=config["confidence_threshold"],
                top_k_predictions=30,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=100,
                    max_predictions=30,
                    nms_threshold=0.7,
                ),
            ),
            DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=30,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=100,
                    max_predictions=30,
                    nms_threshold=0.7,
                ),
            ),
        ],
        "metric_to_watch": "mAP@0.50:0.95",
        "phase_callbacks": [early_stop_map]
    }
    model = models.get(
        config["model_to_train"],
        num_classes=8,
        pretrained_weights="coco",
    )
    CHECKPOINT_DIR = "../../../SageMaker/outputs/logs"
    # create a unique identifier for model versioning
    UI = f'{config["epochs"]}_{config["confidence_threshold"]}_{config["initial_lr"]}_{config["optimizer"]}'
    # for each of the model types specified, train the model
    experiment_name = config["model_to_train"] + "-" + UI
    model = model.to(device)
    trainer = Trainer(experiment_name=experiment_name, ckpt_root_dir=CHECKPOINT_DIR)
    trainer.train(
            model=model,
            training_params=train_params,
            train_loader=train_data,
            valid_loader=val_data,
        )

    metrics = trainer.test(
        model=model,
        test_loader=val_data,
        test_metrics_list=DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=30,
                num_cls=len(classes),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=100,
                max_predictions=30,
                nms_threshold=0.7
            )
        )
    )

    # Return the metric you want to optimize (e.g., mAP@0.50:0.95)
    best_metric = metrics['mAP@0.50:0.95']
    train.report({'mAP@0.50:0.95':best_metric})
    print(best_metric)

config = {
        "initial_lr": tune.loguniform(1e-6, 1e-2),
        "optimizer": tune.choice(["adam", "sgd"]),
        "epochs": tune.randint(1, 2),
        "model_to_train": tune.choice(["yolo_nas_s", "yolo_nas_m", "yolo_nas_l"]),
        "weight_decay": tune.loguniform(1e-6, 1e-3),
        "confidence_threshold": tune.loguniform(0.1, 1),
        "batch_size": tune.randint(1, 16),
    }

trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=4, use_gpu=True),
)
tuner = ray.tune.Tuner(
    trainer,
    param_space={"train_loop_config": config},
    tune_config=ray.tune.TuneConfig(
        search_alg=OptunaSearch(),
        num_samples=10,
        metric="mAP@0.50:0.95",
        mode="max",
        reuse_actors=True,
    ),
)
results = tuner.fit()
best_result = results.get_best_result("mAP@0.50:0.95", "max")
print("Best trial config: {}".format(best_result.config))
print(best_result)

Could you please explain why this is happening and how I might be able to fix it?

Thank you,

Hannah

justinvyu commented 9 months ago

You should use the device set by Ray Train rather than the default CUDA device:

-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+device = ray.train.torch.get_device() if torch.cuda.is_available() else torch.device("cpu")
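
For illustration (a sketch, not code from the reply), the top of train_func would then become something like the following; `build_model` is a hypothetical stand-in for the `models.get(...)` call above:

import torch
import ray.train.torch

def train_func(config):
    # Use the device Ray Train assigned to this worker, so each of the
    # 4 workers trains on its own GPU instead of all defaulting to cuda:0.
    device = (
        ray.train.torch.get_device()
        if torch.cuda.is_available()
        else torch.device("cpu")
    )
    model = build_model(config)  # hypothetical stand-in for models.get(...)
    model = model.to(device)
    # ... rest of the training loop unchanged ...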