Closed ronit450 closed 4 days ago
Hi! It looks like the code in your issue description is not correctly formatted, could you put it into a single markdown codeblock?
Regarding the training issues, when exactly does the loss diverge? During the pre-training with lightly or during the fine-tuning with detectron2?
It happens during fine-tuning with detectron2. The pre-training with Lightly runs smoothly. Here is the script I am using to fine-tune Detectron2 with these pre-trained initial weights:
def setup_config(config_json):
    """Build the Detectron2 fine-tuning config from a settings mapping.

    Args:
        config_json: Mapping providing the keys ``pretrained_weights``,
            ``batch_size``, ``learning_rate``, ``train_json_annot_path``,
            ``epochs``, ``num_classes`` and ``output_dir``.

    Returns:
        The config object created by ``get_cfg()`` with the settings applied.
    """
    cfg = get_cfg()
    # Join the path components instead of embedding backslashes: "\c" and "\B"
    # are invalid escape sequences in a normal string literal, and a
    # backslash-separated path only resolves on Windows.
    cfg.merge_from_file(
        os.path.join("detectron2-multiband", "configs", "Base-RCNN-FPN.yaml")
    )
    cfg.DATASETS.TRAIN = ("my_train_dataset",)
    cfg.DATASETS.TEST = ("my_test_dataset",)
    cfg.DATALOADER.NUM_WORKERS = 4
    # NOTE(review): when these weights come from Lightly pre-training, the
    # discussion around this script resolved the diverging loss by also
    # setting MODEL.PIXEL_MEAN, MODEL.PIXEL_STD and INPUT.FORMAT — confirm
    # they are supplied (e.g. on the command line) for such checkpoints.
    cfg.MODEL.WEIGHTS = config_json["pretrained_weights"]
    cfg.SOLVER.IMS_PER_BATCH = config_json["batch_size"]
    cfg.SOLVER.BASE_LR = config_json["learning_rate"]

    with open(config_json["train_json_annot_path"], encoding="utf-8") as f:
        annotations = json.load(f)
    # A COCO annotation file is a dict ("images", "annotations", ...), so
    # len() of the top-level object counts keys, not samples. Count the
    # "images" entries when present; otherwise keep the plain length.
    if isinstance(annotations, dict) and "images" in annotations:
        num_samples = len(annotations["images"])
    else:
        num_samples = len(annotations)

    cfg.SOLVER.MAX_ITER = (num_samples * config_json["epochs"]) // cfg.SOLVER.IMS_PER_BATCH
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = config_json["num_classes"]
    cfg.OUTPUT_DIR = config_json["output_dir"]
    # Adjust SOLVER.STEPS to be within the range of SOLVER.MAX_ITER
    step_values = [step for step in [1000, 2000, 5000] if step < cfg.SOLVER.MAX_ITER]  # Example steps
    cfg.SOLVER.STEPS = tuple(step_values)
    cfg.SOLVER.AMP.ENABLED = False
    return cfg
def register_datasets(train_json_annot_path, train_images_path, test_json_annot_path, test_images_path):
    """Register the train and test splits via ``register_coco_instances``.

    Args:
        train_json_annot_path: Annotation JSON for "my_train_dataset".
        train_images_path: Image directory for "my_train_dataset".
        test_json_annot_path: Annotation JSON for "my_test_dataset".
        test_images_path: Image directory for "my_test_dataset".
    """
    splits = (
        ("my_train_dataset", train_json_annot_path, train_images_path),
        ("my_test_dataset", test_json_annot_path, test_images_path),
    )
    for dataset_name, annot_path, images_path in splits:
        # Each split is registered with an empty metadata dict.
        register_coco_instances(dataset_name, {}, annot_path, images_path)
def main(config_json):
    """Register the datasets, build the config and run a fresh training.

    Args:
        config_json: Settings mapping; the four dataset path keys are read
            here and the whole mapping is passed on to ``setup_config``.
    """
    dataset_keys = (
        "train_json_annot_path",
        "train_images_path",
        "test_json_annot_path",
        "test_images_path",
    )
    register_datasets(*(config_json[key] for key in dataset_keys))

    cfg = setup_config(config_json)
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

    trainer = DefaultTrainer(cfg)
    # resume=False: load cfg.MODEL.WEIGHTS instead of resuming a checkpoint.
    trainer.resume_or_load(resume=False)
    trainer.train()
Does the same happen if you don't load the lightly pre-trained checkpoint? My suspicion would be that the learning rate is too high.
Some other points to consider:
"detectron2-multiband\configs\Base-RCNN-FPN.yaml"
when creating the model for pre-training and for fine-tuning.
Yes, this only happens while fine-tuning with the Lightly backbone. I have tried lowering the learning rate, but nothing seems to work.
Could you provide a minimal reproducible example so we can test on our side?
Note that it is important that the images are normalized to zero mean during fine-tuning. Can you confirm you did the following?
python train_net.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
MODEL.WEIGHTS path/to/my_model.pth \
MODEL.PIXEL_MEAN 123.675,116.280,103.530 \
MODEL.PIXEL_STD 58.395,57.120,57.375 \
INPUT.FORMAT RGB
Never mind, I solved the issue. I was not setting MODEL.PIXEL_STD 58.395,57.120,57.375.
I need some help: I am following the detectron2 tutorial script and wrote my own training script from the given components. I have some agriculture images. The issue is that the loss starts at 1.7 and is still at 1.4 even after 500 epochs. What can be done about this?
import sys
import os
import torch
sys.path.insert(0, os.path.abspath("./detectron2"))
from detectron2 import config, modeling
from detectron2.checkpoint import DetectionCheckpointer
from lightly.data import LightlyDataset
# from lightly.loss import SupConLoss
# from lightly.loss import NTXentLos
from lightly.loss import NTXentLoss
from lightly.models.modules import SimCLRProjectionHead
from lightly.transforms import SimCLRTransform
class SelectStage(torch.nn.Module):
    """Selects features from a given stage."""

    def __init__(self, stage: str = "res5"):
        """Remember which entry of the feature mapping to return.

        Args:
            stage: Key looked up in the input passed to ``forward``.
        """
        super().__init__()
        self.stage = stage

    def forward(self, x):
        """Return ``x[self.stage]``.

        Raises:
            KeyError: If ``x`` has no entry for the configured stage; the
                message lists the keys that are available.
        """
        try:
            return x[self.stage]
        except KeyError as err:
            raise KeyError(
                f"stage {self.stage!r} not found; available: {list(x)}"
            ) from err
class Detectron2LightlyTrainer:
    """Pre-train a Detectron2 backbone with SimCLR using Lightly components."""

    def __init__(self, data_path, cfg_path, device, batch_size, input_size, num_ftrs, max_epochs, lr, temperature):
        """Store the run settings; nothing is built until ``train`` runs.

        Args:
            data_path: Image directory passed to ``LightlyDataset``.
            cfg_path: Detectron2 config file merged in ``setup_model``.
            device: Device string used for the model and the batches.
            batch_size: Batch size of the data loader.
            input_size: ``input_size`` given to ``SimCLRTransform``.
            num_ftrs: Input dimension of the projection head.
            max_epochs: Number of training epochs.
            lr: Learning rate of the Adam optimizer.
            temperature: Temperature of the ``NTXentLoss``.
        """
        self.data_path = data_path
        self.cfg_path = cfg_path
        self.device = device
        self.batch_size = batch_size
        self.input_size = input_size
        self.num_ftrs = num_ftrs
        self.max_epochs = max_epochs
        self.lr = lr
        self.temperature = temperature
        # Built lazily by setup_model() / setup_data_loader(); initialised
        # here so that premature use can be detected explicitly.
        self.model = None
        self.dataloader = None
        self.simclr_backbone = None
        self.projection_head = None

    def setup_model(self):
        """Build the Detectron2 model and wrap its backbone for SimCLR."""
        cfg = config.get_cfg()
        cfg.merge_from_file(self.cfg_path)
        cfg.MODEL.DEVICE = self.device
        # Start from scratch: no checkpoint is loaded for pre-training.
        cfg.MODEL.WEIGHTS = ""
        cfg.INPUT.FORMAT = "RGB"
        self.model = modeling.build_model(cfg)
        # Bottom-up backbone -> "res5" feature map -> global average pool.
        self.simclr_backbone = torch.nn.Sequential(
            self.model.backbone.bottom_up,
            SelectStage("res5"),
            torch.nn.AdaptiveAvgPool2d(1),
        ).to(self.device)
        self.projection_head = torch.nn.Sequential(
            SimCLRProjectionHead(input_dim=self.num_ftrs, hidden_dim=self.num_ftrs * 2, output_dim=128),
            torch.nn.BatchNorm1d(128),
        ).to(self.device)

    def setup_data_loader(self):
        """Create the shuffled data loader over ``data_path``."""
        transform = SimCLRTransform(input_size=self.input_size)
        dataset = LightlyDataset(input_dir=self.data_path, transform=transform)
        self.dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=8,
        )

    def train(self):
        """Build model and data loader, then train for ``max_epochs`` epochs.

        Prints the mean loss of every epoch and feeds it to a
        ``ReduceLROnPlateau`` scheduler.
        """
        self.setup_model()
        self.setup_data_loader()
        criterion = NTXentLoss(temperature=self.temperature)
        optimizer = torch.optim.Adam(
            list(self.simclr_backbone.parameters()) + list(self.projection_head.parameters()),
            lr=self.lr,
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5, min_lr=1e-6)
        num_batches = len(self.dataloader)
        for e in range(self.max_epochs):
            mean_loss = 0.0
            for (x0, x1), _, _ in self.dataloader:
                x0, x1 = x0.to(self.device), x1.to(self.device)
                # Embed both augmented views and project them.
                y0 = self.projection_head(self.simclr_backbone(x0).flatten(start_dim=1))
                y1 = self.projection_head(self.simclr_backbone(x1).flatten(start_dim=1))
                loss = criterion(y0, y1)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                mean_loss += loss.item() / num_batches
            scheduler.step(mean_loss)
            print(f"[Epoch {e:2d}] Mean Loss = {mean_loss:.4f}")

    def save_model(self):
        """Write the model to ``./my_model`` via ``DetectionCheckpointer``.

        Raises:
            RuntimeError: If the model has not been built yet.
        """
        if self.model is None or self.simclr_backbone is None:
            raise RuntimeError(
                "save_model() called before the model was built; "
                "call train() or setup_model() first"
            )
        # Put the trained bottom-up network back into the detection model.
        self.model.backbone.bottom_up = self.simclr_backbone[0]
        checkpointer = DetectionCheckpointer(self.model, save_dir="./")
        checkpointer.save("my_model")
if __name__ == "__main__":
    # Settings of the SimCLR pre-training run.
    run_settings = dict(
        data_path="/home/biosense/Documents/Rana/Ronit/Yolo-Train_dup/semi",
        cfg_path="/home/biosense/Documents/Rana/Ronit/Self-Supervised/detectron2-multiband/configs/Base-RCNN-FPN.yaml",
        # Use the GPU when one is available.
        device="cuda" if torch.cuda.is_available() else "cpu",
        batch_size=4,
        input_size=600,
        num_ftrs=2048,
        max_epochs=100,
        lr=1e-4,
        temperature=0.5,
    )
    trainer = Detectron2LightlyTrainer(**run_settings)
    trainer.train()
    trainer.save_model()
Looking at your script, you should first try to increase the batch size. SimCLR works best with large batch sizes. I would also suggest reducing the temperature to 0.1.
tried, I have 1500 unlabeled images, I tried with 64 batch size, let me reduce temperature to 0.1
I'll close this issue for now, feel free to reopen if you have more questions :)
I am using tutorial 6 for training a detectron2 backbone with my custom dataset and then using that backbone in my detectron2 training script. But at this point the training diverges, and in the first epochs the loss value is too high.
here is my Lightly SSL training script import sys import os sys.path.insert(0, os.path.abspath("./detectron2")) os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch from detectron2 import config, modeling from detectron2.checkpoint import DetectionCheckpointer from lightly.data import LightlyDataset from lightly.loss import NTXentLoss from lightly.models.modules import SimCLRProjectionHead from lightly.transforms import SimCLRTransform
class SelectStage(torch.nn.Module):
    """Selects features from a given stage."""

    # The constructor must be spelled __init__ (and chain to super().__init__())
    # for Python to call it; the bare name "init" is never invoked.
    def __init__(self, stage: str = "res5"):
        super().__init__()
        self.stage = stage

    # NOTE(review): no forward() is shown for this class here — the paste
    # looks truncated; confirm against the full script.
class Detectron2LightlyTrainer:
    """Holds the settings and state of a Lightly pre-training run."""

    # Spelled __init__ so that Python actually runs it on construction.
    def __init__(self, data_path, cfg_path, device, batch_size, input_size, num_ftrs, max_epochs, lr):
        """Store the run settings and initialise the lazily built members."""
        self.data_path = data_path
        self.cfg_path = cfg_path
        self.device = device
        self.batch_size = batch_size
        self.input_size = input_size
        self.num_ftrs = num_ftrs
        self.max_epochs = max_epochs
        # Not built yet; all start out as None.
        self.model = None
        self.dataloader = None
        self.simclr_backbone = None
        self.projection_head = None
        self.lr = lr

    # NOTE(review): no further methods are shown for this class here — the
    # paste looks truncated; confirm against the full script.
# The guard must compare __name__ with '__main__'; a bare "name" is undefined.
if __name__ == '__main__':
    trainer = Detectron2LightlyTrainer(
        data_path=r"C:\Users\User\Desktop\Ronit-Projects\self-supervised\data",
        cfg_path=r"C:\Users\User\Desktop\Ronit-Projects\self-supervised\detectron2\configs\COCO-Detection\retinanet_R_50_FPN_1x.yaml",
        # Use the GPU when one is available.
        device="cuda" if torch.cuda.is_available() else "cpu",
        batch_size=2,
        input_size=600,
        num_ftrs=2048,
        max_epochs=5,
        lr=1e-4,
    )
    # NOTE(review): nothing is called on the trainer here — the paste looks
    # truncated; confirm against the full script.
and here is my detectron training script import os import json import pickle import sys
Inserting the path for custom libraries
sys.path.insert(0, os.path.abspath("./detectron2-multiband")) os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" from detectron2.data import build_detection_test_loader from detectron2.utils.logger import setup_logger from detectron2.data.datasets import register_coco_instances from detectron2.engine import DefaultTrainer, DefaultPredictor from detectron2.config import get_cfg from detectron2.evaluation import COCOEvaluator, inference_on_dataset
setup_logger()
def setup_config(config_json):
    """Start a Detectron2 config from the Base-RCNN-FPN file and set the datasets."""
    cfg = get_cfg()
    # Join the path components instead of embedding backslashes: "\c" and "\B"
    # are invalid escape sequences in a normal string literal, and a
    # backslash-separated path only resolves on Windows.
    cfg.merge_from_file(
        os.path.join("detectron2-multiband", "configs", "Base-RCNN-FPN.yaml")
    )
    cfg.DATASETS.TRAIN = ("my_train_dataset",)
    cfg.DATASETS.TEST = ("my_test_dataset",)
    cfg.DATALOADER.NUM_WORKERS = 4
    # NOTE(review): the function ends here without using config_json or
    # returning cfg — the paste looks truncated; confirm against the full
    # script.
def register_datasets(train_json_annot_path, train_images_path, test_json_annot_path, test_images_path):
    """Register the train and test splits via ``register_coco_instances``."""
    splits = (
        ("my_train_dataset", train_json_annot_path, train_images_path),
        ("my_test_dataset", test_json_annot_path, test_images_path),
    )
    for dataset_name, annot_path, images_path in splits:
        # Each split is registered with an empty metadata dict.
        register_coco_instances(dataset_name, {}, annot_path, images_path)
def main(config_json):
    """Register the datasets, build the config and create the output directory."""
    dataset_keys = (
        "train_json_annot_path",
        "train_images_path",
        "test_json_annot_path",
        "test_images_path",
    )
    register_datasets(*(config_json[key] for key in dataset_keys))

    cfg = setup_config(config_json)
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    # NOTE(review): no trainer is created here — the paste looks truncated;
    # confirm against the full script.
# The guard must compare __name__ with '__main__'; a bare "name" is undefined.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description="Train a model with Detectron2")
    parser.add_argument("-c", "--config_json", required=True, help="Path to the configuration JSON file")
    args = parser.parse_args()
    # required=True already makes argparse exit when the flag is missing, so
    # this check is only a defensive fallback.
    if args.config_json is None:
        print("Config file is required")
        sys.exit()
    # NOTE(review): the parsed path is not used any further here — the paste
    # looks truncated; confirm against the full script.