ray-project / ray

Ray is a unified framework for scaling AI and Python applications. Ray consists of a core distributed runtime and a set of AI Libraries for accelerating ML workloads.
https://ray.io
Apache License 2.0
33.43k stars 5.67k forks source link

tune.run() only accessing 2 GPUs when I have 4 available #41693

Closed HannahAlexander closed 10 months ago

HannahAlexander commented 10 months ago

What happened + What you expected to happen

I'm using Ray to tune the hyperparameters of a YOLO-NAS model. When I set resources_per_trial={'gpu': 2}, it will run the trials over 2 GPUs, but when I add any more it will only run on one. I'm using a compute instance on AWS which has 4 GPUs. When I train the model without hyperparameter tuning, it is distributed over all 4 GPUs.

Can anyone see what the cause of the issue might be?

Thank you

Versions / Dependencies

I am using ray 2.8.0

Reproduction script

def train_model(config):
    """Ray Tune trainable: train and evaluate one YOLO-NAS configuration.

    Builds the train/validation loaders, trains the model named by
    ``config["model_to_train"]``, evaluates it on the validation loader and
    reports ``mAP@0.50:0.95`` through ``train.report``.

    Args:
        config: Sampled hyperparameters. Keys read here: ``batch_size``,
            ``initial_lr``, ``optimizer``, ``weight_decay``, ``epochs``,
            ``confidence_threshold`` and ``model_to_train``.
    """
    CHECKPOINT_DIR = "../../../SageMaker/outputs/logs"
    # list of the possible classes
    classes = [
        "drone",
        "Herc",
        "Bird",
        "FixedWing",
        "Person",
        "Aircraft",
        "Helicopter",
        "FighterJet",
    ]

    dataset_params = {
        "data_dir": "../../../SageMaker//data/sample_datal",  # root directory of data
        "train_images_dir": "images/train/",  # train images
        "train_labels_dir": "labels/train/",  # train labels
        "val_images_dir": "images/valid/",  # validation images
        "val_labels_dir": "labels/valid/",  # validation labels
        "test_images_dir": "images/test/",  # test images
        "test_labels_dir": "labels/test/",  # test labels
        "classes": classes,
    }

    # create dataloaders for yolonas model
    train_data = coco_detection_yolo_format_train(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["train_images_dir"],
            "labels_dir": dataset_params["train_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "shuffle": True,
            "pin_memory": True,
            "batch_size": config["batch_size"],
            "num_workers": 4,
        },
    )

    val_data = coco_detection_yolo_format_val(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["val_images_dir"],
            "labels_dir": dataset_params["val_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "batch_size": config["batch_size"],
            "num_workers": 4,
            "shuffle": True,
            "pin_memory": True,
        },
    )

    # set training parameters
    train_params = {
        "silent_mode": False,
        "average_best_models": True,
        "warmup_mode": "linear_epoch_step",
        "warmup_initial_lr": 1e-6,
        "lr_warmup_epochs": 3,
        "initial_lr": config["initial_lr"],
        "lr_mode": "cosine",
        "cosine_final_lr_ratio": 0.1,
        "optimizer": config["optimizer"],
        "optimizer_params": {"weight_decay": config["weight_decay"]},
        "zero_weight_decay_on_bias_and_bn": True,
        "ema": True,
        "ema_params": {"decay": 0.9, "decay_type": "threshold"},
        "max_epochs": config["epochs"],
        "mixed_precision": True,
        "loss": PPYoloELoss(
            use_static_assigner=False,
            num_classes=len(dataset_params["classes"]),
            reg_max=16,
        ),
        "valid_metrics_list": [
            DetectionMetrics_050(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
            ),
            DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
            ),
        ],
        "metric_to_watch": "mAP@0.50:0.95",
    }

    # create a unique identifier for model versioning
    UI = f'{config["epochs"]}_{config["batch_size"]}_{config["confidence_threshold"]}_{config["initial_lr"]}_{config["optimizer"]}'
    # for each of the model types specified, train the model
    experiment_name = config["model_to_train"] + "-" + UI
    trainer = Trainer(experiment_name=experiment_name, ckpt_root_dir=CHECKPOINT_DIR)

    model = models.get(
        config["model_to_train"],
        num_classes=len(dataset_params["classes"]),
        pretrained_weights="coco",
    )

    # `device` is not defined in this function; it is the module-level name
    # assigned by the launch script before tune.run() is called.
    model = model.to(device)

    trainer.train(
        model=model,
        training_params=train_params,
        train_loader=train_data,
        valid_loader=val_data,
    )

    # Final evaluation reuses the validation loader (no separate test loader
    # is built from the test_* directories above).
    # NOTE(review): nms_threshold is 0.1 here but 0.7 in valid_metrics_list
    # above -- confirm the difference is intentional.
    metrics = trainer.test(
        model=model,
        test_loader=val_data,
        test_metrics_list=DetectionMetrics_050_095(
            score_thres=config["confidence_threshold"],
            top_k_predictions=300,
            num_cls=len(classes),
            normalize_targets=True,
            post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=1000,
                max_predictions=300,
                nms_threshold=0.1,
            ),
        ),
    )
    # Return the metric you want to optimize (e.g., mAP@0.50:0.95)
    best_metric = metrics['mAP@0.50:0.95']
    train.report({'mAP@0.50:0.95': best_metric})
    print(best_metric)

if __name__ == "__main__":
    # NOTE(review): presumably these overrides keep the training library from
    # detecting a distributed/SLURM launch -- confirm why they are needed.
    os.environ["WORLD_SIZE"] = "-1"
    os.environ["RANK"] = "-1"
    os.environ["SLURM_JOB_NAME"] = "bash"

    # empty the memory cache used by the CUDA memory allocator in PyTorch when working with GPU
    torch.cuda.empty_cache()
    # Module-level name: train_model() reads `device` as a global.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Ensure Ray is initialized
    ray.init()

    # Hyperparameter search space sampled once per trial.
    config = {
        "initial_lr": tune.loguniform(1e-6, 1e-2),
        "optimizer": tune.choice(["adam", "sgd"]),
        "batch_size": tune.randint(1, 16),
        "epochs": tune.randint(1, 5),
        "model_to_train": tune.choice(["yolo_nas_s", "yolo_nas_m", "yolo_nas_l"]),
        "weight_decay": tune.loguniform(1e-6, 1e-3),
        "confidence_threshold": tune.loguniform(0.1, 1),
    }

    # Run 4 trials, each requesting 3 GPUs, maximizing mAP@0.50:0.95.
    analysis = tune.run(
        train_model,
        config=config,
        resources_per_trial={'gpu': 3},
        search_alg="optuna",
        metric="mAP@0.50:0.95",
        mode="max",
        num_samples=4,
    )

    # Retrieve the best trial and configuration
    best_trial = analysis.get_best_trial()
    # Access information from the best trial
    best_trial_id = best_trial.trial_id
    best_trial_config = best_trial.config
    best_trial_last_result = best_trial.last_result

    # Print or use the information as needed
    print("Best Trial ID:", best_trial_id)
    print("Best Trial Config:", best_trial_config)
    print("Best Trial Last Result:", best_trial_last_result)

Issue Severity

High: It blocks me from completing my task.

justinvyu commented 10 months ago

@HannahAlexander You should use Ray Train with Huggingface Transformers, rather than Ray Tune with multiple GPUs per trial.

Why is this?

Take a look here to get started: https://docs.ray.io/en/master/train/getting-started-transformers.html

I'll close this for now, but feel free to follow up with any more questions, or create a new issue if you run into any problems getting onboarded!

HannahAlexander commented 10 months ago

Hi,

Thank you so much for your reply!

I've tried following the tutorial, but I'm running into an error when I run trainer.train(): TypeError: 'DataLoader' object is not subscriptable. It appears that the DataLoader type that I need to use for my object detection model isn't compatible. Do you know a way around this? Please find my updated code below for reference:

def train_func():
    """Training function handed to the Ray ``TorchTrainer`` for every worker.

    Uses a fixed hyperparameter ``config``, builds the YOLO-NAS train and
    validation loaders and the model, wraps them in a ``Trainer`` configured
    with ``TrainingArguments``, registers the Ray Train report callback and
    calls ``trainer.train()``.

    This is the reproduction posted in the issue: ``trainer.train()`` is the
    call reported to raise
    ``TypeError: 'DataLoader' object is not subscriptable``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Fixed hyperparameters (no search space in this version).
    config = {
        "initial_lr": 1e-2,
        "optimizer": "sgd",
        "batch_size": 16,
        "epochs": 2,
        "model_to_train": "yolo_nas_s",
        "weight_decay": 1e-6,
        "confidence_threshold": 0.9,
    }

    # NOTE(review): CHECKPOINT_DIR is assigned but not used later in this function.
    CHECKPOINT_DIR = "../../../SageMaker/outputs/logs"
    # list of the possible classes
    classes = [
        "drone",
        "Herc",
        "Bird",
        "FixedWing",
        "Person",
        "Aircraft",
        "Helicopter",
        "FighterJet",
    ]

    dataset_params = {
        "data_dir": "../../../SageMaker/sample_data_small",  # root directory of data
        "train_images_dir": "images/train/",  # train images
        "train_labels_dir": "labels/train/",  # train labels
        "val_images_dir": "images/valid/",  # validation images
        "val_labels_dir": "labels/valid/",  # validation labels
        "test_images_dir": "images/test/",  # test images
        "test_labels_dir": "labels/test/",  # test labels
        "classes": classes,
    }

    # create dataloaders for yolonas model
    train_data = coco_detection_yolo_format_train(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["train_images_dir"],
            "labels_dir": dataset_params["train_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "shuffle": True,
            "pin_memory": True,
            "batch_size": config["batch_size"],
            "num_workers": 4,
        },
    )

    val_data = coco_detection_yolo_format_val(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["val_images_dir"],
            "labels_dir": dataset_params["val_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "batch_size": config["batch_size"],
            "num_workers": 4,
            "shuffle": True,
            "pin_memory": True,
        },
    )

    # set training parameters
    # NOTE(review): train_params is built but never passed to the Trainer
    # constructed below -- confirm whether it is still needed.
    train_params = {
        "silent_mode": False,
        "average_best_models": True,
        "warmup_mode": "linear_epoch_step",
        "warmup_initial_lr": 1e-6,
        "lr_warmup_epochs": 3,
        "initial_lr": config["initial_lr"],
        "lr_mode": "cosine",
        "cosine_final_lr_ratio": 0.1,
        "optimizer": config["optimizer"],
        "optimizer_params": {"weight_decay": config["weight_decay"]},
        "zero_weight_decay_on_bias_and_bn": True,
        "ema": True,
        "ema_params": {"decay": 0.9, "decay_type": "threshold"},
        "max_epochs": config["epochs"],
        "mixed_precision": True,
        "loss": PPYoloELoss(
            use_static_assigner=False,
            num_classes=len(dataset_params["classes"]),
            reg_max=16,
        ),
        "valid_metrics_list": [
            DetectionMetrics_050(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
            ),
            DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7,
                ),
            ),
        ],
        "metric_to_watch": "mAP@0.50:0.95",
    }

    model = models.get(
        config["model_to_train"],
        num_classes=len(dataset_params["classes"]),
        pretrained_weights="coco",
    )

    model = model.to(device)

    # Hugging Face Trainer
    training_args = TrainingArguments(
        output_dir="test_trainer",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        report_to="none",
    )
    # train_data / val_data are the loader objects created above; they are
    # passed here as train_dataset / eval_dataset.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        #compute_metrics=compute_metrics,
    )

    # [2] Report Metrics and Checkpoints to Ray Train
    # ===============================================
    callback = ray.train.huggingface.transformers.RayTrainReportCallback()
    trainer.add_callback(callback)
    # [3] Prepare Transformers Trainer
    # ================================
    trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)

    trainer.train() # ERROR OCCURS HERE

    # NOTE(review): this test(...) call has the same shape as the one in the
    # earlier train_model script -- confirm this Trainer exposes a `test`
    # method with these keyword arguments. `metrics` is not used afterwards.
    metrics = trainer.test(
        model=model,
        test_loader=val_data,
        test_metrics_list=DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=300,
                num_cls=len(classes),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=1000,
                max_predictions=300,
                nms_threshold=0.7
            )
        )
    )

# defines the number of distributed training workers and whether to use GPUs.
scaling_config = ScalingConfig(num_workers=4, use_gpu=True)

# [4] Define a Ray TorchTrainer to launch `train_func` on all workers
# ===================================================================
ray_trainer = TorchTrainer(
    train_func,
    # Reuse the config defined above instead of building a second, identical one.
    scaling_config=scaling_config,
    # [4a] If running in a multi-node cluster, this is where you
    # should configure the run's persistent storage.
    #run_config=ray.train.RunConfig(storage_path="../../../SageMaker/FS-Air-Classification/outputs"),
)
# Blocks until training finishes and returns the run's result object.
result = ray_trainer.fit()
justinvyu commented 10 months ago

@HannahAlexander It looks like you're trying to pass a DataLoader as the train_dataset argument of the transformers Trainer. The Trainer only takes in a torch.utils.data.Dataset or torch.utils.data.IterableDataset. See here.

If you really need to pass in a torch DataLoader, it looks like transformers lets you override this get_train_dataloader method of the Trainer class: https://huggingface.co/docs/transformers/v4.36.0/en/main_classes/trainer#transformers.Trainer.get_train_dataloader