Ayadx commented 1 week ago

Hi, I am trying to use multi-GPU training on Kaggle with two Tesla T4 GPUs. My code only runs on one GPU; the other is not utilized. I am able to train with a custom dataset and get acceptable results, but I wish to use both GPUs for faster training.

I am using this, but it is not working: "python -m torch.distributed.launch --nproc_per_node=2 train_yolo.py"

Full runnable code or full changes you made:

` import os import json import multiprocessing as mp from detectron2.engine import DefaultTrainer, HookBase from detectron2.config import get_cfg from detectron2 import model_zoo from detectron2.evaluation import COCOEvaluator, inference_on_dataset from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog from detectron2.structures import BoxMode

# Define the training script content

script_content = """ import os import json import multiprocessing as mp from detectron2.engine import DefaultTrainer, HookBase from detectron2.config import get_cfg from detectron2 import model_zoo from detectron2.evaluation import COCOEvaluator, inference_on_dataset from detectron2.data import build_detection_test_loader, DatasetCatalog, MetadataCatalog from detectron2.structures import BoxMode

# Unregister the datasets if they are already registered

# Drop any earlier registration of each split so that re-running the script
# can register the datasets again.
for dataset_name in ("pv_anomaly_train", "pv_anomaly_val", "pv_anomaly_test"):
    if dataset_name in DatasetCatalog.list():
        DatasetCatalog.remove(dataset_name)
    if dataset_name in MetadataCatalog.list():
        MetadataCatalog.remove(dataset_name)

def load_coco_json(json_file, image_root, dataset_name):
    """Load a COCO-style annotation file as a list of per-image records.

    Args:
        json_file: Path to a JSON file with top-level "images" and
            "annotations" lists.
        image_root: Directory joined in front of every image's "file_name".
        dataset_name: Accepted so callers can pass the split name; it is not
            used by this function.

    Returns:
        A list with one dict per entry of "images", holding "file_name",
        "image_id", "height", "width" and "annotations". Each annotation
        carries "bbox", "bbox_mode", "category_id" and "iscrowd".
    """
    from collections import defaultdict

    with open(json_file, encoding="utf-8") as f:
        imgs_anns = json.load(f)

    # Group the annotations by image once instead of rescanning the whole
    # annotation list for every image. Order within each image is preserved.
    anns_by_image = defaultdict(list)
    for ann in imgs_anns["annotations"]:
        anns_by_image[ann["image_id"]].append(ann)

    dataset_dicts = []
    for img_ann in imgs_anns["images"]:
        objs = [
            {
                "bbox": ann["bbox"],
                "bbox_mode": BoxMode.XYWH_ABS,
                # Subtract 1 to make the category_id 0-based
                "category_id": ann["category_id"] - 1,
                # Treat a missing "iscrowd" as "not a crowd" rather than failing.
                "iscrowd": ann.get("iscrowd", 0),
            }
            for ann in anns_by_image.get(img_ann["id"], ())
        ]
        dataset_dicts.append(
            {
                "file_name": os.path.join(image_root, img_ann["file_name"]),
                "image_id": img_ann["id"],
                "height": img_ann["height"],
                "width": img_ann["width"],
                "annotations": objs,
            }
        )
    return dataset_dicts

def register_datasets():
    """Register the train/val/test splits and print the train and val metadata.

    Each split is registered in DatasetCatalog with a zero-argument loader that
    calls load_coco_json, and its metadata gets the single class "anomaly".
    """
    splits = (
        (
            "pv_anomaly_train",
            "/kaggle/working/0PVProjects/Univpm_DataSet/labels/train_annotations.json",
            "/kaggle/working/0PVProjects/Univpm_DataSet/images/train_combined_data",
        ),
        (
            "pv_anomaly_val",
            "/kaggle/working/0PVProjects/Univpm_DataSet/labels/val_annotations.json",
            "/kaggle/working/0PVProjects/Univpm_DataSet/images/val",
        ),
        (
            "pv_anomaly_test",
            "/kaggle/working/0PVProjects/Univpm_DataSet/labels/test_annotations.json",
            "/kaggle/working/0PVProjects/Univpm_DataSet/images/test",
        ),
    )

    for split_name, json_file, image_root in splits:
        # Bind the loop values as lambda defaults; a plain closure would see
        # only the last split by the time the catalog calls the loader.
        DatasetCatalog.register(
            split_name,
            lambda json_file=json_file, image_root=image_root, split_name=split_name: load_coco_json(
                json_file, image_root, split_name
            ),
        )
        MetadataCatalog.get(split_name).set(thing_classes=["anomaly"])

    # Retrieve metadata to ensure it is set correctly
    train_metadata = MetadataCatalog.get("pv_anomaly_train")
    val_metadata = MetadataCatalog.get("pv_anomaly_val")
    print(train_metadata)
    print(val_metadata)

def set_multiprocessing_start_method():
    """Force the multiprocessing start method to 'spawn'.

    With ``force=True`` a previously chosen start method is replaced, so the
    call does not raise "context has already been set" and no handler for
    that error is needed.
    """
    mp.set_start_method('spawn', force=True)

class PrintMetricsHook(HookBase):
    """Trainer hook that prints the latest stored metrics after every step."""

    def __init__(self, cfg):
        """Store the config on the hook.

        Args:
            cfg: Config object passed by the caller; kept as ``self.cfg`` and
                not read anywhere else in this class.
        """
        super().__init__()
        self.cfg = cfg

    def after_step(self):
        """Print each latest metric for the current iteration to stdout."""
        # Every iteration
        iter_num = self.trainer.iter
        # Print the metrics for every iteration
        metrics = self.trainer.storage.latest()

        # Format and print the metrics.
        # The newline is written as "\\n" because this code lives inside the
        # script_content string and is unescaped when that string is written out.
        print(f"\\nIteration {iter_num} Metrics:")
        print(f"{'-'*40}")

        # Each entry is unpacked as a pair; only its first element is printed.
        for key, (value, _) in metrics.items():
            print(f"{key}: {value:.4f}")

        print(f"{'-'*40}")

class MyTrainer(DefaultTrainer):
    """DefaultTrainer that evaluates with COCOEvaluator."""

    @classmethod
    def build_evaluator(cls, cfg, dataset_name):
        """Return a COCOEvaluator for ``dataset_name`` writing to cfg.OUTPUT_DIR.

        The evaluator is built with keyword arguments only. Passing the config
        as the second positional argument is the deprecated form, and a
        positional ``False`` turns off gathering predictions across processes,
        which would score each GPU's shard separately in multi-GPU runs.
        """
        return COCOEvaluator(dataset_name, output_dir=cfg.OUTPUT_DIR)

def main():
    """Register the datasets, fine-tune Faster R-CNN, then evaluate on the validation split."""
    register_datasets()
    set_multiprocessing_start_method()

    # Get default config
    cfg = get_cfg()

    # Load Faster R-CNN with "ResNeXt-101-32x8d model trained with Caffe2 at FB" backbone pre-trained on COCO
    cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))

    # Set training and validation datasets
    cfg.DATASETS.TRAIN = ("pv_anomaly_train",)  # Training dataset
    cfg.DATASETS.TEST = ("pv_anomaly_val",)     # Validation dataset

    # Number of data loading workers
    cfg.DATALOADER.NUM_WORKERS = 2

    # Set weights for pre-trained model
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")

    # Number of images per batch
    cfg.SOLVER.IMS_PER_BATCH = 2  # Reduced batch size to fit into memory

    # Base learning rate
    cfg.SOLVER.BASE_LR = 0.00025

    # Maximum number of iterations in detectron2, epoch is MAX_ITER * BATCH_SIZE / TOTAL_NUM_IMAGES
    cfg.SOLVER.MAX_ITER = 3000

    # ROI Heads batch size per image
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128

    # Number of classes (in this case, only 1: 'anomaly')
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1

    # Create output directory if it doesn't exist
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

    # Set the maximum split size to avoid fragmentation
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

    # Create a MyTrainer instance
    trainer = MyTrainer(cfg)

    # Add the custom hook to print metrics after each iteration
    trainer.register_hooks([PrintMetricsHook(cfg)])

    # resume=False: do not resume from a checkpoint in cfg.OUTPUT_DIR;
    # training starts from the weights in cfg.MODEL.WEIGHTS.
    trainer.resume_or_load(resume=False)

    # Start training
    trainer.train()

    # Run evaluation. The evaluator is built with keyword arguments so it keeps
    # its default of gathering predictions from all processes; a positional
    # False would make each GPU score only its own shard of the validation set.
    evaluator = COCOEvaluator("pv_anomaly_val", output_dir=cfg.OUTPUT_DIR)
    val_loader = build_detection_test_loader(cfg, "pv_anomaly_val")
    inference_on_dataset(trainer.model, val_loader, evaluator)

if name == "main": main() """

Write the script to a file

script_path = '/kaggle/working/train_yolo.py' with open(script_path, 'w') as f: f.write(script_content)

Define the command to run the training script using torch.distributed.run

train_command = f""" python -m torch.distributed.run --nproc_per_node=2 {script_path} """

Execute the training command

os.system(train_command)

` best regards!

github-actions[bot] commented 1 week ago

You've chosen to report an unexpected problem or bug. Unless you already know the root cause of it, please include details about it by filling the issue template. The following information is missing: "Instructions To Reproduce the Issue and Full Logs"; "Your Environment";

Programmer-RD-AI commented 1 week ago

Hi, check issues #2442 and #2473 for examples of how to implement multi-GPU training. Hope this helps. If there are further questions, please feel free to comment :) Best regards, Ranuga

facebookresearch / detectron2

Help with multi-GPU training #5314

Define the training script content

Unregister the datasets if they are already registered

Write the script to a file

Define the command to run the training script using torch.distributed.run

Execute the training command