Help with multi-GPU training #5314

Open Ayadx opened 1 week ago

Ayadx commented 1 week ago

Hi, I am trying to use multi-GPU training on Kaggle with two Tesla T4 GPUs. My code only runs on one GPU; the other is not utilized. I am able to train with a custom dataset and get acceptable results, but I wish to use both GPUs for faster training.

I am using this command, but it is not working: "python -m torch.distributed.launch --nproc_per_node=2 train_yolo.py"

Full runnable code or full changes you made:

` import os import json import multiprocessing as mp from detectron2.engine import DefaultTrainer, HookBase from detectron2.config import get_cfg from detectron2 import model_zoo from detectron2.evaluation import COCOEvaluator, inference_on_dataset from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog from detectron2.structures import BoxMode

# Define the training script content

script_content = """ import os import json import multiprocessing as mp from detectron2.engine import DefaultTrainer, HookBase from detectron2.config import get_cfg from detectron2 import model_zoo from detectron2.evaluation import COCOEvaluator, inference_on_dataset from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog from detectron2.structures import BoxMode

# Unregister the datasets if they are already registered

# Membership is checked first, so only names currently present in each
# catalog are removed.
for d in ["pv_anomaly_train", "pv_anomaly_val", "pv_anomaly_test"]:
    if d in DatasetCatalog.list():
        DatasetCatalog.remove(d)
    if d in MetadataCatalog.list():
        MetadataCatalog.remove(d)

def load_coco_json(json_file, image_root, dataset_name): with open(json_file) as f: imgs_anns = json.load(f)

dataset_dicts = []
for img_ann in imgs_anns["images"]:
    record = {}
    record["file_name"] = os.path.join(image_root, img_ann["file_name"])
    record["image_id"] = img_ann["id"]
    record["height"] = img_ann["height"]
    record["width"] = img_ann["width"]

    objs = []
    for ann in imgs_anns["annotations"]:
        if ann["image_id"] != img_ann["id"]:
        obj = {
            "bbox": ann["bbox"],
            "bbox_mode": BoxMode.XYWH_ABS,
            "category_id": ann["category_id"] - 1,  # Subtract 1 to make the category_id 0-based
            "iscrowd": ann["iscrowd"]
    record["annotations"] = objs
return dataset_dicts

def register_datasets(): DatasetCatalog.register( "pv_anomaly_train", lambda: load_coco_json( "/kaggle/working/0PVProjects/Univpm_DataSet/labels/train_annotations.json", "/kaggle/working/0PVProjects/Univpm_DataSet/images/train_combined_data", "pv_anomaly_train" ) ) MetadataCatalog.get("pv_anomaly_train").set(thing_classes=["anomaly"])

    lambda: load_coco_json(

    lambda: load_coco_json(

# Retrieve metadata to ensure it is set correctly
pv_anomaly_metadata = MetadataCatalog.get("pv_anomaly_train")
pv_anomaly_metadata1 = MetadataCatalog.get("pv_anomaly_val")

def set_multiprocessing_start_method():
    # Switch the multiprocessing start method to 'spawn' (forced).
    # A RuntimeError whose message says the context is already set is
    # reported and ignored; any other RuntimeError propagates to the caller.
    try:
        mp.set_start_method('spawn', force=True)
    except RuntimeError as err:
        if "context has already been set" not in str(err):
            raise
        print("Multiprocessing context already set, continuing without changing start method.")

class PrintMetricsHook(HookBase):
    # Training hook that prints the trainer's latest stored metrics after
    # every step.

    def __init__(self, cfg):
        # cfg: configuration object, kept on the instance as self.cfg.
        # The constructor must be spelled __init__; a method named plain
        # "init" is never invoked when the hook is instantiated.
        super().__init__()
        self.cfg = cfg

    def after_step(self):
        # Runs after each training step: read the current iteration number
        # and the latest metrics from the attached trainer, then print them.
        iter_num = self.trainer.iter
        metrics = self.trainer.storage.latest()

        print(f"\\nIteration {iter_num} Metrics:")

        # Each entry unpacks as a (value, _) pair; only the value is printed,
        # formatted to four decimal places.
        for key, (value, _) in metrics.items():
            print(f"{key}: {value:.4f}")


class MyTrainer(DefaultTrainer):
    # DefaultTrainer subclass that supplies a COCOEvaluator for evaluation.

    @classmethod
    def build_evaluator(cls, cfg, dataset_name):
        # Returns a COCOEvaluator for dataset_name that writes its output
        # under cfg.OUTPUT_DIR.
        # The third argument is the evaluator's "distributed" flag. It is
        # True so that, in a multi-process run, predictions from all ranks
        # are collected before scoring instead of each rank scoring only
        # its own share of the data. In a single process this is equivalent
        # to False.
        return COCOEvaluator(dataset_name, cfg, True, output_dir=cfg.OUTPUT_DIR)

def main(): register_datasets() set_multiprocessing_start_method()

# Get default config
cfg = get_cfg()

# Load Faster R-CNN with "ResNeXt-101-32x8d model trained with Caffe2 at FB" backbone pre-trained on COCO

# Set training and validation datasets
cfg.DATASETS.TRAIN = ("pv_anomaly_train",)  # Training dataset
cfg.DATASETS.TEST = ("pv_anomaly_val",)     # Validation dataset

# Number of data loading workers

# Set weights for pre-trained model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")

# Number of images per batch
cfg.SOLVER.IMS_PER_BATCH = 2  # Reduced batch size to fit into memory

# Base learning rate
cfg.SOLVER.BASE_LR = 0.00025

# Maximum number of iterations in detectron2, epoch is MAX_ITER * BATCH_SIZE / TOTAL_NUM_IMAGES
cfg.SOLVER.MAX_ITER = 3000

# ROI Heads batch size per image

# Number of classes (in this case, only 1: 'anomaly')

# Create output directory if it doesn't exist
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

# Set the maximum split size to avoid fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Create a MyTrainer instance
trainer = MyTrainer(cfg)

# Add the custom hook to print metrics after each iteration

# Resume training if checkpoint exists, otherwise start from scratch

# Start training

# Run evaluation
evaluator = COCOEvaluator("pv_anomaly_val", cfg, False, output_dir=cfg.OUTPUT_DIR)
val_loader = build_detection_test_loader(cfg, "pv_anomaly_val")
inference_on_dataset(trainer.model, val_loader, evaluator)

if name == "main": main() """

# Write the script to a file

# Destination of the generated script; the text held in script_content is
# written there, replacing any existing file at that path.
script_path = '/kaggle/working/train_yolo.py'
with open(script_path, 'w') as f:
    f.write(script_content)

# Define the command to run the training script using torch.distributed.run

# script_path is interpolated into the command text; the string keeps its
# leading and trailing spaces exactly as written.
train_command = f""" python -m torch.distributed.run --nproc_per_node=2 {script_path} """

# Execute the training command


` best regards!

Programmer-RD-AI commented 1 week ago

Hi, please check issues #2442 and #2473 for examples of how to implement multi-GPU training. Hope this helps. If you have further questions, please feel free to comment :) Best regards, Ranuga