tune.run() only accessing 2 GPUs when I have 4 available

HannahAlexander commented 10 months ago

What happened + What you expected to happen

I'm using Ray Tune to tune the hyperparameters of a YOLO-NAS model. When I set resources_per_trial={'gpu': 2} it will run the trials over 2 GPUs, but when I add any more it will only run on one. I'm using a compute instance on AWS which has 4 GPUs. When I train the model without hyperparameter tuning, it is distributed over all 4 GPUs.

Can anyone see what the cause of the issue might be?

Thank you

Versions / Dependencies

I am using ray 2.8.0

Reproduction script

def train_model(config):
    """Train and evaluate one model for a single Ray Tune trial.

    Builds the train/validation dataloaders, trains the architecture named by
    ``config["model_to_train"]``, evaluates it on the validation loader and
    reports the resulting ``mAP@0.50:0.95`` via ``train.report``.

    Args:
        config: Hyperparameters for this trial. Keys read here:
            ``batch_size``, ``initial_lr``, ``optimizer``, ``weight_decay``,
            ``epochs``, ``confidence_threshold`` and ``model_to_train``.
    """
    CHECKPOINT_DIR = "../../../SageMaker/outputs/logs"
    # list of the possible classes
    classes = [
        "drone",
        "Herc",
        "Bird",
        "FixedWing",
        "Person",
        "Aircraft",
        "Helicopter",
        "FighterJet",
    ]

    dataset_params = {
        "data_dir": "../../../SageMaker//data/sample_datal",  # root directory of data
        "train_images_dir": "images/train/",  # train images
        "train_labels_dir": "labels/train/",  # train labels
        "val_images_dir": "images/valid/",  # validation images
        "val_labels_dir": "labels/valid/",  # validation labels
        "test_images_dir": "images/test/",  # test images
        "test_labels_dir": "labels/test/",  # test labels
        "classes": classes,
    }

    # create dataloaders for yolonas model
    train_data = coco_detection_yolo_format_train(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["train_images_dir"],
            "labels_dir": dataset_params["train_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "shuffle": True,
            "pin_memory": True,
            "batch_size": config["batch_size"],
            "num_workers": 4,
        },
    )

    val_data = coco_detection_yolo_format_val(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["val_images_dir"],
            "labels_dir": dataset_params["val_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "batch_size": config["batch_size"],
            "num_workers": 4,
            "shuffle": True,
            "pin_memory": True,
        },
    )

    # set training parameters
    train_params = {
        "silent_mode": False,
        "average_best_models": True,
        "warmup_mode": "linear_epoch_step",
        "warmup_initial_lr": 1e-6,
        "lr_warmup_epochs": 3,
        "initial_lr": config["initial_lr"],
        "lr_mode": "cosine",
        "cosine_final_lr_ratio": 0.1,
        "optimizer": config["optimizer"],
        "optimizer_params": {"weight_decay": config["weight_decay"]},
        "zero_weight_decay_on_bias_and_bn": True,
        "ema": True,
        "ema_params": {"decay": 0.9, "decay_type": "threshold"},
        "max_epochs": config["epochs"],
        "mixed_precision": True,
        "loss": PPYoloELoss(
            use_static_assigner=False,
            num_classes=len(dataset_params["classes"]),
            reg_max=16,
        ),
        "valid_metrics_list": [
            DetectionMetrics_050(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
            ),
            DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
            ),
        ],
        "metric_to_watch": "mAP@0.50:0.95",
    }

    # create a unique identifier for model versioning
    UI = f'{config["epochs"]}_{config["batch_size"]}_{config["confidence_threshold"]}_{config["initial_lr"]}_{config["optimizer"]}'
    # the experiment name combines the model type with the identifier above
    experiment_name = config["model_to_train"] + "-" + UI
    trainer = Trainer(experiment_name=experiment_name, ckpt_root_dir=CHECKPOINT_DIR)

    model = models.get(
        config["model_to_train"],
        num_classes=len(dataset_params["classes"]),
        pretrained_weights="coco",
    )

    # Resolve the device inside the trial so this function does not depend on
    # a module-level `device` that only exists when the file runs as a script.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    trainer.train(
        model=model,
        training_params=train_params,
        train_loader=train_data,
        valid_loader=val_data,
    )

    # NOTE(review): evaluation reuses the validation loader (the "test_*"
    # directories in dataset_params are never read), and nms_threshold is 0.1
    # here versus 0.7 in the validation metrics above -- confirm both are
    # intentional.
    metrics = trainer.test(
        model=model,
        test_loader=val_data,
        test_metrics_list=DetectionMetrics_050_095(
            score_thres=config["confidence_threshold"],
            top_k_predictions=300,
            num_cls=len(classes),
            normalize_targets=True,
            post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=1000,
                max_predictions=300,
                nms_threshold=0.1,
            ),
        ),
    )
    # Report the metric being optimized (mAP@0.50:0.95) back to the tuner
    best_metric = metrics['mAP@0.50:0.95']
    train.report({'mAP@0.50:0.95': best_metric})
    print(best_metric)

if __name__ == "__main__":
    # NOTE(review): presumably these overrides stop the training library from
    # treating the process as part of an externally launched distributed job
    # -- confirm against the library's environment handling.
    os.environ["WORLD_SIZE"] = "-1"
    os.environ["RANK"] = "-1"
    os.environ["SLURM_JOB_NAME"] = "bash"

    # empty the memory cache used by the CUDA memory allocator in PyTorch when working with GPU
    torch.cuda.empty_cache()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Ensure Ray is initialized
    ray.init()

    # Search space sampled once per trial and passed to train_model as `config`
    config = {
        "initial_lr": tune.loguniform(1e-6, 1e-2),
        "optimizer": tune.choice(["adam", "sgd"]),
        "batch_size": tune.randint(1, 16),
        "epochs": tune.randint(1, 5),
        "model_to_train": tune.choice(["yolo_nas_s", "yolo_nas_m", "yolo_nas_l"]),
        "weight_decay": tune.loguniform(1e-6, 1e-3),
        "confidence_threshold": tune.loguniform(0.1, 1),
    }

    # Each trial requests 3 GPUs; metric/mode given here are also what
    # get_best_trial() below falls back to when called without arguments.
    analysis = tune.run(
        train_model,
        config=config,
        resources_per_trial={"gpu": 3},
        search_alg="optuna",
        metric="mAP@0.50:0.95",
        mode="max",
        num_samples=4,
    )

    # Retrieve the best trial and configuration
    best_trial = analysis.get_best_trial()
    # Access information from the best trial
    best_trial_id = best_trial.trial_id
    best_trial_config = best_trial.config
    best_trial_last_result = best_trial.last_result

    # Print or use the information as needed
    print("Best Trial ID:", best_trial_id)
    print("Best Trial Config:", best_trial_config)
    print("Best Trial Last Result:", best_trial_last_result)

Issue Severity

High: It blocks me from completing my task.

justinvyu commented 10 months ago

@HannahAlexander You should use Ray Train with Huggingface Transformers, rather than Ray Tune with multiple GPUs per trial.

Why is this?

Ray Tune launches a single process (ray actor) to run your custom training script. Ray Tune's main functionality is parallelizing many single process runs. (ex: many non-distributed training runs). Because there's just a single process with no torch distributed environment set up, HF transformers will just use a single device (of the 2 that are available from your resource specification).
Ray Train launches multiple worker processes to run your training script across multiple devices and set up the distributed communication backend so that they can synchronize gradients with each other.
It's also possible to Tune over Train runs if that's also why you were looking into Ray Tune originally!

Take a look here to get started: https://docs.ray.io/en/master/train/getting-started-transformers.html

I'll close this for now, but feel free to follow up with any more questions, or create a new issue if you run into any problems getting onboarded!

HannahAlexander commented 10 months ago

Hi,

Thank you so much for your reply!

I've tried following the tutorial, but I'm running into an error when I run trainer.train(): TypeError: 'DataLoader' object is not subscriptable. It appears that the DataLoader type that I need to use for my object detection model isn't compatible? Do you know a way around this? Please find my updated code below for reference:

def train_func() -> None:
    """Training function launched on each worker by the Ray ``TorchTrainer``.

    Uses a fixed hyperparameter ``config``, builds the train/validation
    dataloaders and the model, wraps them in a Hugging Face ``Trainer``,
    attaches the Ray Train report callback and starts training.

    As posted in this thread, ``trainer.train()`` raises
    ``TypeError: 'DataLoader' object is not subscriptable``; the code is kept
    unchanged so it still reproduces that report.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Fixed hyperparameters (no search space in this version)
    config = {
        "initial_lr": 1e-2,
        "optimizer": "sgd",
        "batch_size": 16,
        "epochs": 2,
        "model_to_train": "yolo_nas_s",
        "weight_decay": 1e-6,
        "confidence_threshold": 0.9,
    }

    # NOTE(review): CHECKPOINT_DIR is assigned but not used in this function.
    CHECKPOINT_DIR = "../../../SageMaker/outputs/logs"
    # list of the possible classes
    classes = [
        "drone",
        "Herc",
        "Bird",
        "FixedWing",
        "Person",
        "Aircraft",
        "Helicopter",
        "FighterJet",
    ]

    dataset_params = {
        "data_dir": "../../../SageMaker/sample_data_small",  # root directory of data
        "train_images_dir": "images/train/",  # train images
        "train_labels_dir": "labels/train/",  # train labels
        "val_images_dir": "images/valid/",  # validation images
        "val_labels_dir": "labels/valid/",  # validation labels
        "test_images_dir": "images/test/",  # test images
        "test_labels_dir": "labels/test/",  # test labels
        "classes": classes,
    }

    # create dataloaders for yolonas model
    train_data = coco_detection_yolo_format_train(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["train_images_dir"],
            "labels_dir": dataset_params["train_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "shuffle": True,
            "pin_memory": True,
            "batch_size": config["batch_size"],
            "num_workers": 4,
        },
    )

    val_data = coco_detection_yolo_format_val(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["val_images_dir"],
            "labels_dir": dataset_params["val_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "batch_size": config["batch_size"],
            "num_workers": 4,
            "shuffle": True,
            "pin_memory": True,
        },
    )

    # set training parameters
    # NOTE(review): train_params is built but never passed to the Trainer
    # constructed below.
    train_params = {
        "silent_mode": False,
        "average_best_models": True,
        "warmup_mode": "linear_epoch_step",
        "warmup_initial_lr": 1e-6,
        "lr_warmup_epochs": 3,
        "initial_lr": config["initial_lr"],
        "lr_mode": "cosine",
        "cosine_final_lr_ratio": 0.1,
        "optimizer": config["optimizer"],
        "optimizer_params": {"weight_decay": config["weight_decay"]},
        "zero_weight_decay_on_bias_and_bn": True,
        "ema": True,
        "ema_params": {"decay": 0.9, "decay_type": "threshold"},
        "max_epochs": config["epochs"],
        "mixed_precision": True,
        "loss": PPYoloELoss(
            use_static_assigner=False,
            num_classes=len(dataset_params["classes"]),
            reg_max=16,
        ),
        "valid_metrics_list": [
            DetectionMetrics_050(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
            ),
            DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
            ),
        ],
        "metric_to_watch": "mAP@0.50:0.95",
    }

    model = models.get(
        config["model_to_train"],
        num_classes=len(dataset_params["classes"]),
        pretrained_weights="coco",
    )

    model = model.to(device)

    # Hugging Face Trainer
    training_args = TrainingArguments(
        output_dir="test_trainer",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        report_to="none",
    )
    # NOTE(review): train_data / val_data are the dataloaders created above,
    # passed here as train_dataset / eval_dataset; the reported TypeError
    # ('DataLoader' object is not subscriptable) points at this mismatch.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        #compute_metrics=compute_metrics,
    )

    # [2] Report Metrics and Checkpoints to Ray Train
    # ===============================================
    callback = ray.train.huggingface.transformers.RayTrainReportCallback()
    trainer.add_callback(callback)
    # [3] Prepare Transformers Trainer
    # ================================
    trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)

    trainer.train() # ERROR OCCURS HERE

    # NOTE(review): `trainer` is the Trainer built from TrainingArguments
    # above; the test(model=..., test_loader=..., test_metrics_list=...) call
    # mirrors the earlier script's Trainer(experiment_name=..., ckpt_root_dir=...)
    # usage -- confirm this method exists on this object. `metrics` is
    # neither returned nor reported.
    metrics = trainer.test(
        model=model,
        test_loader=val_data,
        test_metrics_list=DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(classes),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=1000,
                max_predictions=300,
                nms_threshold=0.7
            )
        )
    )

# Number of distributed training workers and whether each one uses a GPU.
scaling_config = ScalingConfig(num_workers=4, use_gpu=True)

# [4] Define a Ray TorchTrainer to launch `train_func` on all workers
# ===================================================================
ray_trainer = TorchTrainer(
    train_func,
    # Reuse the config defined above instead of building an identical copy.
    scaling_config=scaling_config,
    # [4a] If running in a multi-node cluster, this is where you
    # should configure the run's persistent storage.
    #run_config=ray.train.RunConfig(storage_path="../../../SageMaker/FS-Air-Classification/outputs"),
)
# Launch the distributed run and wait for it to finish.
result = ray_trainer.fit()

justinvyu commented 10 months ago

@HannahAlexander It looks like you're trying to pass a DataLoader as the argument to train_dataset in the transformers Trainer. The Trainer only takes in a torch.utils.data.Dataset or torch.utils.data.IterableDataset. See here.

If you really need to pass in a torch DataLoader, it looks like transformers lets you override this get_train_dataloader method of the Trainer class: https://huggingface.co/docs/transformers/v4.36.0/en/main_classes/trainer#transformers.Trainer.get_train_dataloader

ray-project / ray