open-mmlab / mmocr

OpenMMLab Text Detection, Recognition and Understanding Toolbox
https://mmocr.readthedocs.io/en/dev-1.x/
Apache License 2.0

FCENet loss is NaN #500

Closed yCobanoglu closed 2 years ago

yCobanoglu commented 2 years ago

FCENet with DCNv2 leads to the loss being NaN. If FCENet without DCNv2 is used instead, the loss starts out as a number, but after a couple of epochs it also becomes NaN.

Reproduction

I will post the training code (train.py) and the two configuration files, one for FCENet with DCNv2 (fcenet_dcvn.py) and one without (fcenet_no_dcvn.py). The only difference between them is the DCNv2 part.

Can this be an issue with my data? I created my own coco.json in ICDAR 2015 style. The only difference is that my gt_image.txt files contain bounding boxes given as x,y,width,height, but the resulting JSON files should be similar to the ones you create from gt_image.txt files that have x1,y1,x2,y2,x3,y3,x4,y4. Are the gt_*.txt files used in training? I was expecting the ground-truth labels to come from the JSON file, not from the .txt files. In that case this problem would be unrelated to my data/JSON files.
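For reference, a minimal sketch of that mapping, assuming axis-aligned (unrotated) boxes: an x,y,width,height box corresponds to the clockwise corners x1,y1,...,x4,y4 like this (xywh_to_quad is a hypothetical helper, not part of MMOCR):

# Hypothetical helper (not part of MMOCR): map an axis-aligned
# x,y,width,height box to clockwise corners x1,y1,x2,y2,x3,y3,x4,y4.
def xywh_to_quad(x, y, w, h):
    return [x, y, x + w, y, x + w, y + h, x, y + h]

print(xywh_to_quad(10, 20, 100, 30))
# -> [10, 20, 110, 20, 110, 50, 10, 50]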

  1. What command or script did you run?
train.py

Error traceback (fcenet_dcvn.py)

2021-09-19 08:24:51,978 - mmocr - INFO - Epoch [1][10/18]   lr: 5.000e-03, eta: 5:56:28, time: 1.586, data_time: 1.065, memory: 9432, loss_text: nan, loss_center: nan, loss_reg_x: nan, loss_reg_y: nan, loss: nan
2021-09-19 08:25:11,136 - mmocr - INFO - Epoch(val) [1][4]  loss_text: nan, loss_center: nan, loss_reg_x: nan, loss_reg_y: nan, loss: nan

Running train.py with the config changed (from cuneiform_sign_detection.configs.fcenet_default import fcenet_no_dcvn as model_config) and the checkpoint set to None (checkpoint=None), the loss starts out as a number, but after a couple of epochs it is also NaN:

fcenet_no_dcvn.py

2021-09-19 08:50:15,862 - mmocr - INFO - Epoch [1][10/18]       lr: 5.000e-03, eta: 5:56:46, time: 1.587, data_time: 1.270, memory: 9320, loss_text: 4.2834, loss_center: 4.9790, loss_reg_x: 6.1018, loss_reg_y: 4.0760, loss: 19.4403
2021-09-19 08:50:31,303 - mmocr - INFO - Epoch(val) [1][4]      loss_text: 795869.5312, loss_center: 242371.4941, loss_reg_x: 306783.1055, loss_reg_y: 293962.2539, loss: 1638986.3594
 ...
2021-09-19 08:56:17,075 - mmocr - INFO - Epoch(val) [3][4]      loss_text: nan, loss_center: nan, loss_reg_x: nan, loss_reg_y: nan, loss: nan
2021-09-19 08:56:34,126 - mmocr - INFO - Epoch [4][10/18]       lr: 4.982e-03, eta: 3:27:01, time: 1.689, data_time: 1.389, memory: 9320, loss_text: nan, loss_center: nan, loss_reg_x: nan, loss_reg_y: nan, loss: nan

train.py

import os.path as osp
from pathlib import Path

import mmcv
from mmcv import Config
from mmocr.apis import train_detector
from mmocr.datasets import build_dataset
from mmocr.models import build_detector

from cuneiform_sign_detection.utils.calculate_mean_and_std import (
    mean_and_std_from_data_path,
)
from cuneiform_sign_detection.utils.create_val_dataset import create_val_dataset
from cuneiform_sign_detection.utils.path import (
    log_version_increment,
)
from cuneiform_sign_detection.utils.user_input import user_input

def configure_cfg(
    config_file_path: str,
    data_path: str,
    load_from_checkpoint=None,
    resume_from_checkpoint=None,
    gpu_ids=range(1),
    log_directory="./logs",
):

    cfg = Config.fromfile(config_file_path)
    mean, std = mean_and_std_from_data_path(data_path)
    print("------------Mean and Std------------------")
    print(f"Mean: {str(mean)}")
    print(f"Std: {str(std)}")
    print("-------------------------------------------")

    cfg.img_norm_cfg = dict(mean=mean, std=std, to_rgb=True)
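    # NOTE: the train/test pipelines in the config file already expanded
    # **img_norm_cfg when the config was parsed, so overriding
    # cfg.img_norm_cfg here does not update their Normalize transforms.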

    log_dir = Path(log_directory) / Path(config_file_path).stem
    model_version = log_version_increment(log_dir)

    cfg.work_dir = str(log_dir / str(model_version))
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))

    if load_from_checkpoint and resume_from_checkpoint:
        raise ValueError(
            "Specify either load_from_checkpoint or resume_from_checkpoint, not both"
        )

    cfg.load_from = load_from_checkpoint
    cfg.resume_from = resume_from_checkpoint
    cfg.gpu_ids = gpu_ids
    return cfg

def prepare_dataset(cfg):
    datasets = [build_dataset(cfg.data.train)]
    datasets.append(create_val_dataset(cfg))

    return datasets

def train(cfg, datasets) -> None:
    model = build_detector(
        cfg.model, train_cfg=cfg.get("train_cfg"), test_cfg=cfg.get("test_cfg")
    )
    model.CLASSES = datasets[0].CLASSES
    train_detector(model, datasets, cfg, distributed=False, validate=True)

if __name__ == "__main__":
    """
    User input after starting script describing experiment will be used for logging
    Press double enter after comment to start training
    data format is coco and in directory ../data
    """

    from cuneiform_sign_detection.configs.fcenet_default import fcenet_dcvn as model_config

    configFile = model_config.__file__
    load_from_checkpoint = "./checkpoints/fcenet_dcvn.pth"
    data_path = "./data"

    comment = user_input()
    cfg = configure_cfg(configFile, data_path, load_from_checkpoint)

    print(f"Config:\n{cfg.pretty_text}")
    (Path(cfg.work_dir) / "summary.txt").write_text(comment)
    cfg.dump(f"{cfg.work_dir}/{Path(configFile).name}")

    datasets = prepare_dataset(cfg)
    train(cfg, datasets)

fcenet_dcvn.py

fourier_degree = 5
model = dict(
    type="FCENet",
    backbone=dict(
        type="mmdet.ResNet",
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type="BN", requires_grad=True),
        norm_eval=True,
        style="pytorch",
        dcn=dict(type="DCNv2", deform_groups=2, fallback_on_stride=False),
        init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"),
        stage_with_dcn=(False, True, True, True),
    ),
    neck=dict(
        type="mmdet.FPN",
        in_channels=[512, 1024, 2048],
        out_channels=256,
        add_extra_convs="on_output",
        num_outs=3,
        relu_before_extra_convs=True,
        act_cfg=None,
    ),
    bbox_head=dict(
        type="FCEHead",
        in_channels=256,
        scales=(8, 16, 32),
        loss=dict(type="FCELoss"),
        fourier_degree=fourier_degree,
    ),
)

train_cfg = None
test_cfg = None

dataset_type = "IcdarDataset"
data_root = "./data"

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True
)

train_pipeline = [
    dict(type="LoadImageFromFile", color_type="color_ignore_orientation"),
    dict(type="LoadTextAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
    dict(type="ColorJitter", brightness=32.0 / 255, saturation=0.5, contrast=0.5),
    dict(type="Normalize", **img_norm_cfg),
    dict(type="RandomScaling", size=800, scale=(3.0 / 4, 5.0 / 2)),
    dict(type="RandomCropFlip", crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
    dict(
        type="RandomCropPolyInstances",
        instance_key="gt_masks",
        crop_ratio=0.8,
        min_side_ratio=0.3,
    ),
    dict(
        type="RandomRotatePolyInstances",
        rotate_ratio=0.5,
        max_angle=30,
        pad_with_fixed_color=False,
    ),
    dict(type="SquareResizePad", target_size=800, pad_ratio=0.6),
    dict(type="RandomFlip", flip_ratio=0.5, direction="horizontal"),
    dict(type="Pad", size_divisor=32),
    dict(
        type="FCENetTargets",
        fourier_degree=fourier_degree,
        level_proportion_range=((0, 0.25), (0.2, 0.65), (0.55, 1.0)),
    ),
    dict(
        type="CustomFormatBundle",
        keys=["p3_maps", "p4_maps", "p5_maps"],
        visualize=dict(flag=False, boundary_key=None),
    ),
    dict(type="Collect", keys=["img", "p3_maps", "p4_maps", "p5_maps"]),
]
test_pipeline = [
    dict(type="LoadImageFromFile", color_type="color_ignore_orientation"),
    dict(
        type="MultiScaleFlipAug",
        img_scale=(1080, 736),
        flip=False,
        transforms=[
            dict(type="Resize", img_scale=(1280, 800), keep_ratio=True),
            dict(type="Normalize", **img_norm_cfg),
            dict(type="Pad", size_divisor=32),
            dict(type="ImageToTensor", keys=["img"]),
            dict(type="Collect", keys=["img"]),
        ],
    ),
]
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=4,
    val_dataloader=dict(samples_per_gpu=1),
    test_dataloader=dict(samples_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=data_root + "/instances_training.json",
        img_prefix=data_root + "/imgs",
        pipeline=train_pipeline,
    ),
    val=dict(
        type=dataset_type,
        ann_file=data_root + "/instances_validation.json",
        img_prefix=data_root + "/imgs",
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=data_root + "/instances_validation.json",
        img_prefix=data_root + "/imgs",
        pipeline=test_pipeline,
    ),
)
evaluation = dict(
    interval=20, metric=["hmean-iou"], save_best="hmean-iou:hmean", rule="greater"
)

# optimizer
optimizer = dict(type="SGD", lr=5e-3, momentum=0.90, weight_decay=5e-4)
optimizer_config = dict(grad_clip=None)
lr_config = dict(policy="poly", power=0.9, min_lr=1e-7, by_epoch=True)
total_epochs = 750

checkpoint_config = dict(interval=25)
# yapf:disable
log_config = dict(
    interval=1, hooks=[dict(type="TensorboardLoggerHook"), dict(type="TextLoggerHook")]
)
# yapf:enable
dist_params = dict(backend="nccl")
log_level = "INFO"
load_from = None
resume_from = None
workflow = [("train", 1), ("val", 10)]
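# Note: the ("val", ...) entry in the workflow computes losses on the
# validation set, which is where the Epoch(val) loss lines in the logs
# above come from.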

fcenet_no_dcvn.py

fourier_degree = 5
model = dict(
    type="FCENet",
    backbone=dict(
        type="mmdet.ResNet",
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type="BN", requires_grad=True),
        init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"),
        norm_eval=False,
        style="pytorch",
    ),
    neck=dict(
        type="mmdet.FPN",
        in_channels=[512, 1024, 2048],
        out_channels=256,
        add_extra_convs="on_output",
        num_outs=3,
        relu_before_extra_convs=True,
        act_cfg=None,
    ),
    bbox_head=dict(
        type="FCEHead",
        in_channels=256,
        scales=(8, 16, 32),
        loss=dict(type="FCELoss"),
        alpha=1.2,
        beta=1.0,
        text_repr_type="quad",
        fourier_degree=fourier_degree,
    ),
)

train_cfg = None
test_cfg = None

dataset_type = "IcdarDataset"
data_root = "./data"

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True
)

train_pipeline = [
    dict(type="LoadImageFromFile", color_type="color_ignore_orientation"),
    dict(type="LoadTextAnnotations", with_bbox=True, with_mask=True, poly2mask=False),
    dict(type="ColorJitter", brightness=32.0 / 255, saturation=0.5, contrast=0.5),
    dict(type="Normalize", **img_norm_cfg),
    dict(type="RandomScaling", size=800, scale=(3.0 / 4, 5.0 / 2)),
    dict(type="RandomCropFlip", crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
    dict(
        type="RandomCropPolyInstances",
        instance_key="gt_masks",
        crop_ratio=0.8,
        min_side_ratio=0.3,
    ),
    dict(
        type="RandomRotatePolyInstances",
        rotate_ratio=0.5,
        max_angle=30,
        pad_with_fixed_color=False,
    ),
    dict(type="SquareResizePad", target_size=800, pad_ratio=0.6),
    dict(type="RandomFlip", flip_ratio=0.5, direction="horizontal"),
    dict(type="Pad", size_divisor=32),
    dict(
        type="FCENetTargets",
        fourier_degree=fourier_degree,
        level_proportion_range=((0, 0.4), (0.3, 0.7), (0.6, 1.0)),
    ),
    dict(
        type="CustomFormatBundle",
        keys=["p3_maps", "p4_maps", "p5_maps"],
        visualize=dict(flag=False, boundary_key=None),
    ),
    dict(type="Collect", keys=["img", "p3_maps", "p4_maps", "p5_maps"]),
]
test_pipeline = [
    dict(type="LoadImageFromFile", color_type="color_ignore_orientation"),
    dict(
        type="MultiScaleFlipAug",
        img_scale=(2260, 2260),
        flip=False,
        transforms=[
            dict(type="Resize", img_scale=(1280, 800), keep_ratio=True),
            dict(type="Normalize", **img_norm_cfg),
            dict(type="Pad", size_divisor=32),
            dict(type="ImageToTensor", keys=["img"]),
            dict(type="Collect", keys=["img"]),
        ],
    ),
]
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=4,
    val_dataloader=dict(samples_per_gpu=1),
    test_dataloader=dict(samples_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=data_root + "/instances_training.json",
        img_prefix=data_root + "/imgs",
        pipeline=train_pipeline,
    ),
    val=dict(
        type=dataset_type,
        ann_file=data_root + "/instances_validation.json",
        img_prefix=data_root + "/imgs",
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=data_root + "/instances_validation.json",
        img_prefix=data_root + "/imgs",
        pipeline=test_pipeline,
    ),
)
evaluation = dict(
    interval=20, metric=["hmean-iou"], save_best="hmean-iou:hmean", rule="greater"
)

# optimizer
optimizer = dict(type="SGD", lr=1e-3, momentum=0.90, weight_decay=5e-4)
optimizer_config = dict(grad_clip=None)
lr_config = dict(policy="poly", power=0.9, min_lr=1e-7, by_epoch=True)
total_epochs = 750

checkpoint_config = dict(interval=25)
# yapf:disable
log_config = dict(
    interval=1, hooks=[dict(type="TensorboardLoggerHook"), dict(type="TextLoggerHook")]
)
# yapf:enable
dist_params = dict(backend="nccl")
log_level = "INFO"
load_from = None
resume_from = None
workflow = [("train", 1), ("val", 10)]
gaotongxiao commented 2 years ago
  1. Have you tried a lower learning rate?
  2. MMOCR does load data from the .json files, so your gt_*.txt files couldn't be the cause. But you should ensure that your data annotations are consistent with how the image has been processed and that all bounding boxes lie inside the image (check the notes for the explanation). The quickest way to check is to remove color_type="color_ignore_orientation" from train_pipeline and test_pipeline and see if the issue persists; a sanity-check sketch follows this list.
  3. You could also switch the data source to ICDAR 2015, which we officially support, to localize the problem.
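A minimal sketch of that bounding-box check, assuming a standard COCO layout ("images"/"annotations" lists with [x, y, w, h] bboxes) and the annotation path from the posted configs:

import json

# Flag every COCO-style bbox that leaves its image (assumes the standard
# COCO layout and [x, y, w, h] bboxes, as in the configs above).
with open("./data/instances_training.json") as f:
    coco = json.load(f)

sizes = {img["id"]: (img["width"], img["height"]) for img in coco["images"]}
for ann in coco["annotations"]:
    x, y, w, h = ann["bbox"]
    img_w, img_h = sizes[ann["image_id"]]
    if x < 0 or y < 0 or x + w > img_w or y + h > img_h:
        print(f"bbox {ann['id']} leaves image {ann['image_id']}: {ann['bbox']}")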
yCobanoglu commented 2 years ago

The learning rate was too high. I removed color_type="color_ignore_orientation" and am training again, thanks.
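For anyone hitting the same symptom: two config-level changes commonly used to stabilize training are a lower learning rate and gradient clipping. A sketch against the posted config; grad_clip is a standard mmcv OptimizerHook option, but the values below are illustrative assumptions, not settings verified in this thread:

# Illustrative values only: lower the learning rate and enable gradient
# clipping. grad_clip is forwarded by mmcv's OptimizerHook to
# torch.nn.utils.clip_grad_norm_; max_norm/norm_type are assumptions.
optimizer = dict(type="SGD", lr=1e-3, momentum=0.90, weight_decay=5e-4)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))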


happy20200 commented 2 years ago

2. train_pipeline and test_pipeline

Where are train_pipeline and test_pipeline defined?