mapillary / seamseg

Seamless Scene Segmentation
BSD 3-Clause "New" or "Revised" License

Training panoptic #2

Closed: gaussiangit closed this issue 5 years ago

gaussiangit commented 5 years ago

I have started training on 4 GPUs with a batch size of 1 and pretrained ResNet-101 weights, but the run gets stuck at the start of epoch 1: no error message, no other output. Below is the panoptic config file I use; a minimal multi-GPU sanity check follows it.


# GENERAL NOTE: the fields denoted as meta-info are not actual configuration parameters. Instead, they are used to
# describe some characteristic of a network module that needs to be accessible from some other module but is hard to
# determine in a generic way from within the code. A typical example is the total output stride of the network body.
# These should be properly configured by the user to match the actual properties of the network.

[general]
# Number of epochs between validations
val_interval = 1
# Number of steps before outputting a log entry
log_interval = 10
# Panoptic evaluation parameters
score_threshold = 0.5
overlap_threshold = 0.5
min_stuff_area = 4096
# Evaluation mode for mIoU and AP:
# -- panoptic: evaluate the scores on the panoptic output
# -- separate: evaluate the scores on the separate segmentation and detection outputs
eval_mode = panoptic
# Whether to compute COCO evaluation scores or not
eval_coco = no
cudnn_benchmark = no

[body]
# Architecture for the body
body = resnet101
# Path to pre-trained weights
weights = models/resnet101
# Normalization mode:
# -- bn: in-place batch norm everywhere
# -- syncbn: synchronized in-place batch norm everywhere
# -- syncbn+bn: synchronized in-place batch norm in the static part of the network, in-place batch norm everywhere else
# -- gn: group norm everywhere
# -- syncbn+gn: synchronized in-place batch norm in the static part of the network, group norm everywhere else
# -- off: do not normalize activations (scale and bias are kept)
normalization_mode = syncbn
# Activation: 'leaky_relu' or 'elu'
activation = leaky_relu
activation_slope = 0.01
# Group norm parameters
gn_groups = 16
# Additional parameters for the body
body_params = {}
# Number of frozen modules: in [1, 5]
num_frozen = 2
# Whether to freeze BN modules
bn_frozen = yes
# Meta-info
out_channels = {"mod1": 64, "mod2": 256, "mod3": 512, "mod4": 1024, "mod5": 2048}
out_strides = {"mod1": 4, "mod2": 4, "mod3": 8, "mod4": 16, "mod5": 32}

[fpn]
out_channels = 256
extra_scales = 0
interpolation = nearest
# Input settings
inputs = ["mod2", "mod3", "mod4", "mod5"]
# Meta-info
out_strides = (4, 8, 16, 32)

[rpn]
hidden_channels = 256
stride = 1
# Anchor settings
anchor_ratios = (1., 0.5, 2.)
anchor_scale = 8
# Proposal settings
nms_threshold = 0.7
num_pre_nms_train = 12000
num_post_nms_train = 2000
num_pre_nms_val = 6000
num_post_nms_val = 300
min_size = 16
# Anchor matcher settings
num_samples = 256
pos_ratio = .5
pos_threshold = .7
neg_threshold = .3
void_threshold = 0.7
# FPN-specific settings
fpn_min_level = 0
fpn_levels = 4
# Loss settings
sigma = 3.
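# (the anchor sizes these settings imply are worked out after the config)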

[roi]
roi_size = (14, 14)
# Matcher settings
num_samples = 128
pos_ratio = .25
pos_threshold = .5
neg_threshold_hi = .5
neg_threshold_lo = 0.
void_threshold = 0.7
void_is_background = no
# Prediction generator settings
nms_threshold = 0.3
score_threshold = 0.05
max_predictions = 100
# FPN-specific settings
fpn_min_level = 0
fpn_levels = 4
fpn_canonical_scale = 224
fpn_canonical_level = 2
# Loss settings
sigma = 1.
bbx_reg_weights = (10., 10., 5., 5.)

[sem]
fpn_min_level = 0
fpn_levels = 4
pooling_size = (64, 64)
# Loss settings
ohem = .25

[optimizer]
lr = 0.01
weight_decay = 0.0001
weight_decay_norm = yes
momentum = 0.9
nesterov = yes
# obj, bbx, roi_cls, roi_bbx, roi_msk, sem
loss_weights = (1., 1., 1., 1., 1., 1.)

[scheduler]
epochs = 90
# Scheduler type: 'linear', 'step', 'poly' or 'multistep'
type = poly
# When to update the learning rate: 'batch', 'epoch'
update_mode = batch
# Additional parameters for the scheduler
# -- linear
#   from: initial lr multiplier
#   to: final lr multiplier
# -- step
#   step_size: number of steps between lr decreases
#   gamma: multiplicative factor
# -- poly
#   gamma: exponent of the polynomial
# -- multistep
#   milestones: step indices where the lr decreases will be triggered
params = {"gamma": 0.9}
burn_in_steps = 500
burn_in_start = 0.333
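# (a worked sketch of this schedule follows the config)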

[dataloader]
# Image size parameters
shortest_size = 1024
longest_max_size = 2048
# Batch size
train_batch_size = 1
val_batch_size = 1
# Augmentation parameters
rgb_mean = (0.485, 0.456, 0.406)
rgb_std = (0.229, 0.224, 0.225)
random_flip = yes
random_scale = (0.8, 1)
# Number of worker threads
num_workers = 8
# Subsets
train_set = train
val_set = val
coco_gt =
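
A silent hang across multiple GPUs with synchronized batch norm often means that one rank never reaches a collective operation, so the remaining ranks block forever without any error. A minimal sanity check for the process-group setup itself, independent of seamseg (the script and helper name are mine, not part of the repo):

import os

import torch
import torch.distributed as dist


def check_collectives():
    # Bring up the NCCL process group the same way the training run
    # would; torchrun / torch.distributed.launch provide RANK,
    # WORLD_SIZE, MASTER_ADDR and MASTER_PORT via the environment.
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)

    # A single all_reduce blocks until every rank participates, so it
    # isolates NCCL/process-group problems from model or data issues.
    t = torch.ones(1, device="cuda")
    dist.all_reduce(t)
    print(f"rank {dist.get_rank()}: all_reduce ok, sum = {t.item()}")
    dist.destroy_process_group()


if __name__ == "__main__":
    check_collectives()

Run it with the same world size as the training job, e.g. torchrun --nproc_per_node=4 check_collectives.py (or torch.distributed.launch --use_env on older PyTorch). If it also hangs, the problem is the distributed setup rather than seamseg; if it completes, a likely suspect is the ranks seeing unequal numbers of batches, which stalls syncbn.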
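
For reference, the [scheduler] block above describes a polynomial learning-rate decay preceded by a linear burn-in. A sketch of the formula these parameters conventionally encode (my reading of the config, not necessarily seamseg's exact implementation):

def poly_lr_with_burn_in(step, total_steps, base_lr=0.01,
                         gamma=0.9, burn_in_steps=500, burn_in_start=0.333):
    # Defaults mirror the config: [optimizer] lr = 0.01, [scheduler]
    # params = {"gamma": 0.9}, burn_in_steps = 500, burn_in_start = 0.333.
    # With update_mode = batch, "step" counts batches rather than epochs.
    if step < burn_in_steps:
        # Linear ramp from burn_in_start * base_lr up to base_lr.
        alpha = step / burn_in_steps
        return base_lr * (burn_in_start + (1.0 - burn_in_start) * alpha)
    # Polynomial decay over the remaining steps.
    progress = (step - burn_in_steps) / max(1, total_steps - burn_in_steps)
    return base_lr * (1.0 - progress) ** gamma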
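
Likewise, the [fpn] and [rpn] blocks jointly fix the anchor set: one anchor scale per pyramid level and three aspect ratios, with the base anchor side equal to anchor_scale times the level stride under the usual FPN convention. Again a sketch under that assumption, not seamseg's exact generator:

import itertools
import math

strides = (4, 8, 16, 32)  # [fpn] out_strides, one entry per pyramid level
ratios = (1.0, 0.5, 2.0)  # [rpn] anchor_ratios, interpreted as h/w
scale = 8                 # [rpn] anchor_scale

for stride, ratio in itertools.product(strides, ratios):
    # Hold the anchor area fixed at (scale * stride)^2 while varying
    # the aspect ratio, as in the common FPN anchor convention.
    area = (scale * stride) ** 2
    w = math.sqrt(area / ratio)
    h = w * ratio
    print(f"stride {stride:2d}, ratio {ratio}: {w:.0f} x {h:.0f} px")

With these settings the base (ratio 1) anchors are 32, 64, 128 and 256 pixels on the four levels.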