lightly-ai / lightly

A python library for self-supervised learning on images.
https://docs.lightly.ai/self-supervised-learning/
MIT License

Detectron2 Pretrained Base-RCNN-FPN, training diverged issue #1546

Closed ronit450 closed 4 days ago

ronit450 commented 1 month ago

I am using Tutorial 6 to pretrain a detectron2 backbone on my custom dataset and then loading the result into my detectron2 training script. At that point the training diverges, and the loss is already very high in the first epochs.

Here is my Lightly SSL training script:

import sys
import os

sys.path.insert(0, os.path.abspath("./detectron2"))
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import torch
from detectron2 import config, modeling
from detectron2.checkpoint import DetectionCheckpointer
from lightly.data import LightlyDataset
from lightly.loss import NTXentLoss
from lightly.models.modules import SimCLRProjectionHead
from lightly.transforms import SimCLRTransform


class SelectStage(torch.nn.Module):
    """Selects features from a given stage."""

    def __init__(self, stage: str = "res5"):
        super().__init__()
        self.stage = stage

    def forward(self, x):
        return x[self.stage]


class Detectron2LightlyTrainer:
    def __init__(self, data_path, cfg_path, device, batch_size, input_size,
                 num_ftrs, max_epochs, lr):
        self.data_path = data_path
        self.cfg_path = cfg_path
        self.device = device
        self.batch_size = batch_size
        self.input_size = input_size
        self.num_ftrs = num_ftrs
        self.max_epochs = max_epochs
        self.model = None
        self.dataloader = None
        self.simclr_backbone = None
        self.projection_head = None
        self.lr = lr

    def setup_model(self):
        cfg = config.get_cfg()
        cfg.merge_from_file(self.cfg_path)
        cfg.MODEL.DEVICE = self.device
        cfg.MODEL.WEIGHTS = ""  # Optionally specify pre-trained weights
        cfg.INPUT.FORMAT = "RGB"
        cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2  # 1 class + 1 background

        self.model = modeling.build_model(cfg)
        # Take the ResNet bottom-up backbone, select its res5 output, and
        # pool it to one feature vector per image for SimCLR.
        self.simclr_backbone = torch.nn.Sequential(
            self.model.backbone.bottom_up,
            SelectStage("res5"),
            torch.nn.AdaptiveAvgPool2d(1),
        ).to(self.device)
        self.projection_head = SimCLRProjectionHead(
            input_dim=self.num_ftrs,
            hidden_dim=self.num_ftrs,
            output_dim=128,
        ).to(self.device)

    def setup_data_loader(self):
        transform = SimCLRTransform(input_size=self.input_size)
        dataset = LightlyDataset(input_dir=self.data_path, transform=transform)
        self.dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=4,
        )

    def train(self):
        self.setup_model()
        self.setup_data_loader()
        criterion = NTXentLoss()
        optimizer = torch.optim.Adam(
            list(self.simclr_backbone.parameters()) + list(self.projection_head.parameters()),
            lr=self.lr,
        )

        for e in range(self.max_epochs):
            mean_loss = 0.0
            for (x0, x1), _, _ in self.dataloader:
                x0, x1 = x0.to(self.device), x1.to(self.device)
                y0 = self.projection_head(self.simclr_backbone(x0).flatten(start_dim=1))
                y1 = self.projection_head(self.simclr_backbone(x1).flatten(start_dim=1))
                loss = criterion(y0, y1)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                mean_loss += loss.detach().cpu().item() / len(self.dataloader)
            print(f"[Epoch {e:2d}] Mean Loss = {mean_loss:.2f}")

    def save_model(self):
        # Put the pre-trained backbone back into the detectron2 model and
        # save a checkpoint that detectron2 can load directly.
        self.model.backbone.bottom_up = self.simclr_backbone[0]
        checkpointer = DetectionCheckpointer(self.model, save_dir="./")
        checkpointer.save("my_model")


if __name__ == '__main__':
    trainer = Detectron2LightlyTrainer(
        data_path=r"C:\Users\User\Desktop\Ronit-Projects\self-supervised\data",
        cfg_path=r"C:\Users\User\Desktop\Ronit-Projects\self-supervised\detectron2\configs\COCO-Detection\retinanet_R_50_FPN_1x.yaml",
        device="cuda" if torch.cuda.is_available() else "cpu",
        batch_size=2,
        input_size=600,
        num_ftrs=2048,
        max_epochs=5,
        lr=1e-4,
    )

    trainer.train()
    trainer.save_model()
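For reference, the checkpoint written by save_model above is what the fine-tuning step consumes. Assuming my_model.pth ends up in the working directory, the fine-tuning config just needs to point at it:

# Point the detectron2 fine-tuning config at the SSL checkpoint (example path).
cfg.MODEL.WEIGHTS = "./my_model.pth"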

And here is my detectron2 training script:

import os
import json
import pickle
import sys

# Inserting the path for custom libraries.
sys.path.insert(0, os.path.abspath("./detectron2-multiband"))
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

from detectron2.data import build_detection_test_loader
from detectron2.utils.logger import setup_logger
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer, DefaultPredictor
from detectron2.config import get_cfg
from detectron2.evaluation import COCOEvaluator, inference_on_dataset

setup_logger()


def setup_config(config_json):
    cfg = get_cfg()
    cfg.merge_from_file("detectron2-multiband/configs/Base-RCNN-FPN.yaml")
    cfg.DATASETS.TRAIN = ("my_train_dataset",)
    cfg.DATASETS.TEST = ("my_test_dataset",)
    cfg.DATALOADER.NUM_WORKERS = 4

    cfg.MODEL.WEIGHTS = config_json["pretrained_weights"]
    cfg.SOLVER.IMS_PER_BATCH = config_json["batch_size"]
    cfg.SOLVER.BASE_LR = config_json["learning_rate"]
    with open(config_json["train_json_annot_path"]) as f:
        # Count the images in the COCO file, not the dict's top-level keys.
        num_samples = len(json.load(f)["images"])
    cfg.SOLVER.MAX_ITER = (num_samples * config_json["epochs"]) // cfg.SOLVER.IMS_PER_BATCH
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = config_json["num_classes"]
    cfg.OUTPUT_DIR = config_json["output_dir"]

    # Adjust SOLVER.STEPS to be within the range of SOLVER.MAX_ITER.
    step_values = [step for step in [1000, 2000, 5000] if step < cfg.SOLVER.MAX_ITER]  # Example steps
    cfg.SOLVER.STEPS = tuple(step_values)
    cfg.SOLVER.AMP.ENABLED = False

    return cfg


def register_datasets(train_json_annot_path, train_images_path, test_json_annot_path, test_images_path):
    register_coco_instances("my_train_dataset", {}, train_json_annot_path, train_images_path)
    register_coco_instances("my_test_dataset", {}, test_json_annot_path, test_images_path)


def main(config_json):
    register_datasets(
        config_json["train_json_annot_path"],
        config_json["train_images_path"],
        config_json["test_json_annot_path"],
        config_json["test_images_path"])
    cfg = setup_config(config_json)
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

    trainer = DefaultTrainer(cfg)
    trainer.resume_or_load(resume=False)
    trainer.train()

    # with open(config_json["cfg_path"], 'wb') as f:
    #     pickle.dump(cfg, f, protocol=pickle.HIGHEST_PROTOCOL)

    # with open(os.path.join(cfg.OUTPUT_DIR, "model_final.pth"), "rb") as f:
    #     cfg.MODEL.WEIGHTS = f.read()

    # predictor = DefaultPredictor(cfg)
    # results_path = os.path.join(cfg.OUTPUT_DIR, 'results_evaluation')
    # os.makedirs(results_path, exist_ok=True)

    # evaluator = COCOEvaluator("my_test_dataset", cfg, False, output_dir=results_path)
    # val_loader = build_detection_test_loader(cfg, "my_test_dataset")
    # metrics = inference_on_dataset(predictor.model, val_loader, evaluator)
    # print(metrics)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description="Train a model with Detectron2")
    parser.add_argument("-c", "--config_json", required=True, help="Path to the configuration JSON file")
    args = parser.parse_args()
    if args.config_json is None:
        print("Config file is required")
        sys.exit()

    try:
        with open(args.config_json) as f:
            config = json.load(f)
        main(config)
    except Exception as e:
        print(f"Failed to load or process configuration: {e}")
        sys.exit(1)
guarin commented 1 month ago

Hi! It looks like the code in your issue description is not formatted correctly. Could you put it into a single markdown code block?

Regarding the training issues, when exactly does the loss diverge? During the pre-training with lightly or during the fine-tuning with detectron2?

ronit450 commented 1 month ago

It happens during the fine-tuning with detectron2; the pre-training with lightly goes smoothly. Here is the script I am using to fine-tune detectron2 with the pretrained initial weights:

def setup_config(config_json):
    cfg = get_cfg()
    cfg.merge_from_file("detectron2-multiband/configs/Base-RCNN-FPN.yaml")
    cfg.DATASETS.TRAIN = ("my_train_dataset",)
    cfg.DATASETS.TEST = ("my_test_dataset",)
    cfg.DATALOADER.NUM_WORKERS = 4

    cfg.MODEL.WEIGHTS = config_json["pretrained_weights"]
    cfg.SOLVER.IMS_PER_BATCH = config_json["batch_size"]
    cfg.SOLVER.BASE_LR = config_json["learning_rate"]
    with open(config_json["train_json_annot_path"]) as f:
        # Count the images in the COCO file, not the dict's top-level keys.
        num_samples = len(json.load(f)["images"])
    cfg.SOLVER.MAX_ITER = (num_samples * config_json["epochs"]) // cfg.SOLVER.IMS_PER_BATCH
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = config_json["num_classes"]
    cfg.OUTPUT_DIR = config_json["output_dir"]

    # Adjust SOLVER.STEPS to be within the range of SOLVER.MAX_ITER
    step_values = [step for step in [1000, 2000, 5000] if step < cfg.SOLVER.MAX_ITER]  # Example steps
    cfg.SOLVER.STEPS = tuple(step_values)
    cfg.SOLVER.AMP.ENABLED = False

    return cfg

def register_datasets(train_json_annot_path, train_images_path, test_json_annot_path, test_images_path):
    register_coco_instances("my_train_dataset", {}, train_json_annot_path, train_images_path)
    register_coco_instances("my_test_dataset", {}, test_json_annot_path, test_images_path)

def main(config_json):
    register_datasets(
        config_json["train_json_annot_path"], 
        config_json["train_images_path"], 
        config_json["test_json_annot_path"], 
        config_json["test_images_path"])
    cfg = setup_config(config_json)
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

    trainer = DefaultTrainer(cfg)
    trainer.resume_or_load(resume=False)
    trainer.train()
guarin commented 1 month ago

Does the same happen if you don't load the lightly pre-trained checkpoint? My suspicion would be that the learning rate is too high.

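For example, a lower learning rate with the default linear warmup and gradient clipping could look like this in the detectron2 config (the values are illustrative starting points, not tuned for your dataset):

# Illustrative values for debugging divergence; not tuned for any dataset.
cfg.SOLVER.BASE_LR = 0.00025               # try several times lower than the current setting
cfg.SOLVER.WARMUP_ITERS = 1000             # linear warmup stabilizes the first iterations
cfg.SOLVER.WARMUP_FACTOR = 1.0 / 1000
cfg.SOLVER.CLIP_GRADIENTS.ENABLED = True   # optionally clip exploding gradients
cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0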

ronit450 commented 1 month ago

Yes, this only happens when fine-tuning with the Lightly backbone. I have tried lowering the learning rate, but nothing seems to work.

guarin commented 1 month ago

Could you provide a minimal reproducible example so we can test on our side?

philippmwirth commented 1 month ago

Note that it's important that the images are normalized to zero mean during the fine-tuning. Can you confirm you did the following?

python train_net.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
    MODEL.WEIGHTS path/to/my_model.pth \
    MODEL.PIXEL_MEAN 123.675,116.280,103.530 \
    MODEL.PIXEL_STD 58.395,57.120,57.375 \
    INPUT.FORMAT RGB
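These are the ImageNet statistics scaled to the 0-255 range, in RGB order, which is the normalization lightly's transforms apply during pre-training; detectron2's default configs instead expect BGR input with a PIXEL_STD of 1.0, so without these overrides the inputs are not standardized the way the backbone saw them. Assuming a cfg = get_cfg() setup as in your scripts, the equivalent Python settings would be:

# Match the normalization used during SSL pre-training (ImageNet stats, RGB).
cfg.INPUT.FORMAT = "RGB"
cfg.MODEL.PIXEL_MEAN = [123.675, 116.280, 103.530]
cfg.MODEL.PIXEL_STD = [58.395, 57.120, 57.375]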
ronit450 commented 1 month ago

Never mind, I solved the issue. I was missing the MODEL.PIXEL_STD 58.395,57.120,57.375 setting.

I need some more help. Following the detectron2 tutorial, I produced my own training script from the given components; I am working with agriculture images. The issue is that the loss starts at 1.7 and is still around 1.4 even after 500 epochs. What can be done about this?


import sys
import os
import torch
sys.path.insert(0, os.path.abspath("./detectron2"))
from detectron2 import config, modeling
from detectron2.checkpoint import DetectionCheckpointer
from lightly.data import LightlyDataset
# from lightly.loss import SupConLoss
# from lightly.loss import NTXentLoss
from lightly.loss import NTXentLoss
from lightly.models.modules import SimCLRProjectionHead
from lightly.transforms import SimCLRTransform

class SelectStage(torch.nn.Module):
    def __init__(self, stage: str = "res5"):
        super().__init__()
        self.stage = stage

    def forward(self, x):
        return x[self.stage]

class Detectron2LightlyTrainer:
    def __init__(self, data_path, cfg_path, device, batch_size, input_size, num_ftrs, max_epochs, lr, temperature):
        self.data_path = data_path
        self.cfg_path = cfg_path
        self.device = device
        self.batch_size = batch_size
        self.input_size = input_size
        self.num_ftrs = num_ftrs
        self.max_epochs = max_epochs
        self.lr = lr
        self.temperature = temperature

    def setup_model(self):
        cfg = config.get_cfg()
        cfg.merge_from_file(self.cfg_path)
        cfg.MODEL.DEVICE = self.device
        cfg.MODEL.WEIGHTS = ""
        cfg.INPUT.FORMAT = "RGB"
        self.model = modeling.build_model(cfg)
        self.simclr_backbone = torch.nn.Sequential(
            self.model.backbone.bottom_up,
            SelectStage("res5"),
            torch.nn.AdaptiveAvgPool2d(1)
        ).to(self.device)
        self.projection_head = torch.nn.Sequential(
            SimCLRProjectionHead(input_dim=self.num_ftrs, hidden_dim=self.num_ftrs*2, output_dim=128),
            torch.nn.BatchNorm1d(128)
        ).to(self.device)

    def setup_data_loader(self):
        transform = SimCLRTransform(input_size=self.input_size)
        dataset = LightlyDataset(input_dir=self.data_path, transform=transform)
        self.dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=8
        )

    def train(self):
        self.setup_model()
        self.setup_data_loader()
        criterion = NTXentLoss(temperature=self.temperature)
        optimizer = torch.optim.Adam(
            list(self.simclr_backbone.parameters()) + list(self.projection_head.parameters()),
            lr=self.lr
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5, min_lr=1e-6)

        for e in range(self.max_epochs):
            mean_loss = 0.0
            for (x0, x1), _, _ in self.dataloader:
                x0, x1 = x0.to(self.device), x1.to(self.device)
                y0 = self.projection_head(self.simclr_backbone(x0).flatten(start_dim=1))
                y1 = self.projection_head(self.simclr_backbone(x1).flatten(start_dim=1))
                loss = criterion(y0, y1)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                mean_loss += loss.detach().cpu().item() / len(self.dataloader)
            scheduler.step(mean_loss)
            print(f"[Epoch {e:2d}] Mean Loss = {mean_loss:.4f}")

    def save_model(self):
        self.model.backbone.bottom_up = self.simclr_backbone[0]
        checkpointer = DetectionCheckpointer(self.model, save_dir="./")
        checkpointer.save("my_model")

if __name__ == '__main__':
    trainer = Detectron2LightlyTrainer(
        data_path="/home/biosense/Documents/Rana/Ronit/Yolo-Train_dup/semi",
        cfg_path="/home/biosense/Documents/Rana/Ronit/Self-Supervised/detectron2-multiband/configs/Base-RCNN-FPN.yaml",
        device="cuda" if torch.cuda.is_available() else "cpu",
        batch_size=4,
        input_size=600,
        num_ftrs=2048,
        max_epochs=100,
        lr=1e-4,
        temperature=0.5,
    )

    trainer.train()
    trainer.save_model()
guarin commented 1 month ago

Looking at your script, you should first try to increase the batch size; SimCLR works best with large batch sizes. I would also suggest reducing the temperature to 0.1.
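A sketch of those two changes against your script (the exact batch size depends on what fits in GPU memory):

# Suggested adjustments: lower temperature, larger batch.
criterion = NTXentLoss(temperature=0.1)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=256,  # illustrative; use the largest value that fits in memory
    shuffle=True,
    drop_last=True,
    num_workers=8,
)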

ronit450 commented 1 month ago

Tried that. I have 1500 unlabeled images and already tried a batch size of 64; let me reduce the temperature to 0.1.

guarin commented 4 days ago

I'll close this issue for now, feel free to reopen if you have more questions :)