HannahAlexander opened this issue 11 months ago
@HannahAlexander You'll want to "tune" over the "distributed trainer": wrap the TorchTrainer in a Tuner and pass the hyperparameters through "train_loop_config".
trainer = TorchTrainer(
    ...,
    scaling_config=ray.train.ScalingConfig(num_workers=4, use_gpu=True),
)
tuner = ray.tune.Tuner(
    trainer,
    param_space={"train_loop_config": {"lr": tune.grid_search([0.1, 0.01, 0.001])}},
)
tuner.fit()  # 3 x 4 GPUs needed to run all trials concurrently
See this user guide for more explanation: https://docs.ray.io/en/latest/train/user-guides/hyperparameter-optimization.html
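Each trial reserves the whole ScalingConfig (4 GPUs here), so with fewer than 12 GPUs the extra trials simply wait in the queue. If you'd rather cap that explicitly, here's a minimal sketch (assuming Ray 2.x and reusing the trainer defined above; the max_concurrent_trials value of 1 is only illustrative):

import ray
from ray import tune

# Each trial requests 4 GPU workers (from the ScalingConfig above), so cap the
# number of trials running at once to fit the cluster's GPU budget.
tuner = ray.tune.Tuner(
    trainer,
    param_space={"train_loop_config": {"lr": tune.grid_search([0.1, 0.01, 0.001])}},
    tune_config=ray.tune.TuneConfig(max_concurrent_trials=1),  # illustrative cap
)
tuner.fit()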
Hi, thank you for your reply! I've implemented the above, and it seemed to do what I was hoping for, but it caused a memory error. After further inspection it appears that it's launching all 4 GPUs but then only training on 1.
# imports used below (module paths per the SuperGradients YOLO-NAS examples and Ray 2.x)
import super_gradients
import torch
import ray
from ray import train, tune
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer
from ray.tune.search.optuna import OptunaSearch
from super_gradients.training import Trainer, models
from super_gradients.training.dataloaders.dataloaders import (
    coco_detection_yolo_format_train,
    coco_detection_yolo_format_val,
)
from super_gradients.training.losses import PPYoloELoss
from super_gradients.training.metrics import DetectionMetrics_050, DetectionMetrics_050_095
from super_gradients.training.models.detection_models.pp_yolo_e import PPYoloEPostPredictionCallback
from super_gradients.training.utils.callbacks import Phase
from super_gradients.training.utils.early_stopping import EarlyStop


def train_func(config):
    # Define parameters
    super_gradients.init_trainer()
    early_stop_map = EarlyStop(Phase.VALIDATION_EPOCH_END, monitor="mAP@0.50:0.95", mode="max", patience=3, verbose=True)

    # list of the possible classes
    classes = [
        "drone",
        "Herc",
        "Bird",
        "FixedWing",
        "Person",
        "Aircraft",
        "Helicopter",
        "FighterJet",
    ]

    torch.cuda.empty_cache()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # train_data, val_data = load_data()

    dataset_params = {
        "data_dir": "../../../SageMaker/data/sample_data_small",  # root directory of data
        "train_images_dir": "images/train/",  # train images
        "train_labels_dir": "labels/train/",  # train labels
        "val_images_dir": "images/valid/",  # validation images
        "val_labels_dir": "labels/valid/",  # validation labels
        "test_images_dir": "images/test/",  # test images
        "test_labels_dir": "labels/test/",  # test labels
        "classes": classes,
    }
    # create dataloaders for the YOLO-NAS model
    train_data = coco_detection_yolo_format_train(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["train_images_dir"],
            "labels_dir": dataset_params["train_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "shuffle": True,
            "pin_memory": True,
            "batch_size": config["batch_size"],
            "num_workers": 4,
        },
    )
    val_data = coco_detection_yolo_format_val(
        dataset_params={
            "data_dir": dataset_params["data_dir"],
            "images_dir": dataset_params["val_images_dir"],
            "labels_dir": dataset_params["val_labels_dir"],
            "classes": dataset_params["classes"],
        },
        dataloader_params={
            "batch_size": config["batch_size"],
            "num_workers": 4,
            "shuffle": True,
            "pin_memory": True,
        },
    )
    # set training parameters
    train_params = {
        "silent_mode": False,
        "average_best_models": True,
        "warmup_mode": "linear_epoch_step",
        "warmup_initial_lr": 1e-6,
        "lr_warmup_epochs": 3,
        "initial_lr": config["initial_lr"],
        "lr_mode": "cosine",
        "cosine_final_lr_ratio": 0.1,
        "optimizer": config["optimizer"],
        "optimizer_params": {"weight_decay": config["weight_decay"]},
        "zero_weight_decay_on_bias_and_bn": True,
        "ema": True,
        "ema_params": {"decay": 0.9, "decay_type": "threshold"},
        "max_epochs": config["epochs"],
        "mixed_precision": True,
        "loss": PPYoloELoss(
            use_static_assigner=False,
            num_classes=len(dataset_params["classes"]),
            reg_max=16,
        ),
        "valid_metrics_list": [
            DetectionMetrics_050(
                score_thres=config["confidence_threshold"],
                top_k_predictions=30,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=100,
                    max_predictions=30,
                    nms_threshold=0.7,
                ),
            ),
            DetectionMetrics_050_095(
                score_thres=config["confidence_threshold"],
                top_k_predictions=30,
                num_cls=len(dataset_params["classes"]),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=100,
                    max_predictions=30,
                    nms_threshold=0.7,
                ),
            ),
        ],
        "metric_to_watch": "mAP@0.50:0.95",
        "phase_callbacks": [early_stop_map],
    }
    model = models.get(
        config["model_to_train"],
        num_classes=8,
        pretrained_weights="coco",
    )

    CHECKPOINT_DIR = "../../../SageMaker/outputs/logs"
    # create a unique identifier for model versioning
    UI = f'{config["epochs"]}_{config["confidence_threshold"]}_{config["initial_lr"]}_{config["optimizer"]}'
    # for each of the model types specified, train the model
    experiment_name = config["model_to_train"] + "-" + UI

    model = model.to(device)
    trainer = Trainer(experiment_name=experiment_name, ckpt_root_dir=CHECKPOINT_DIR)
    trainer.train(
        model=model,
        training_params=train_params,
        train_loader=train_data,
        valid_loader=val_data,
    )
    metrics = trainer.test(
        model=model,
        test_loader=val_data,
        test_metrics_list=DetectionMetrics_050_095(
            score_thres=config["confidence_threshold"],
            top_k_predictions=30,
            num_cls=len(classes),
            normalize_targets=True,
            post_prediction_callback=PPYoloEPostPredictionCallback(
                score_threshold=0.01,
                nms_top_k=100,
                max_predictions=30,
                nms_threshold=0.7,
            ),
        ),
    )

    # Report the metric to optimize (mAP@0.50:0.95) back to Ray Tune
    best_metric = metrics["mAP@0.50:0.95"]
    train.report({"mAP@0.50:0.95": best_metric})
    print(best_metric)
config = {
    "initial_lr": tune.loguniform(1e-6, 1e-2),
    "optimizer": tune.choice(["adam", "sgd"]),
    "epochs": tune.randint(1, 2),
    "model_to_train": tune.choice(["yolo_nas_s", "yolo_nas_m", "yolo_nas_l"]),
    "weight_decay": tune.loguniform(1e-6, 1e-3),
    "confidence_threshold": tune.loguniform(0.1, 1),
    "batch_size": tune.randint(1, 16),
}

trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=4, use_gpu=True),
)

tuner = ray.tune.Tuner(
    trainer,
    param_space={"train_loop_config": config},
    tune_config=ray.tune.TuneConfig(
        search_alg=OptunaSearch(),
        num_samples=10,
        metric="mAP@0.50:0.95",
        mode="max",
        reuse_actors=True,
    ),
)

results = tuner.fit()
best_result = results.get_best_result("mAP@0.50:0.95", "max")
print("Best trial config: {}".format(best_result.config))
print(best_result)
Could you please explain why this is happening and how I might be able to fix it?
Thank you,
Hannah
You should use the device set by Ray Train rather than the default CUDA device:
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+device = ray.train.torch.get_device() if torch.cuda.is_available() else torch.device("cpu")
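For illustration, here is a minimal sketch of how the start of train_func could pick up the worker's assigned device (assuming Ray 2.x; build_model is a hypothetical stand-in for the models.get(...) call in the original code, and prepare_model is an optional alternative):

import torch
import ray.train.torch

def train_func(config):
    # Each Ray Train worker is assigned its own GPU; query it instead of defaulting to cuda:0
    device = ray.train.torch.get_device() if torch.cuda.is_available() else torch.device("cpu")

    model = build_model(config)  # hypothetical stand-in for models.get(...)
    model = model.to(device)

    # Alternative: let Ray Train handle device placement and DistributedDataParallel wrapping
    # model = ray.train.torch.prepare_model(model)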
Hi, I can train my model over 4 GPUs, or I can apply hyperparameter tuning to my model, but I can't seem to do both at the same time. When I try to hyperparameter tune with gpus = 4, only 1 GPU is used. In the code I shared above I have tried to combine both methods, but it doesn't work because it creates 2 log folders with separate names (based on the current time). Does anyone know how I can achieve hyperparameter tuning over multiple GPUs?