NVIDIA / caffe

Caffe: a fast open framework for deep learning.
http://caffe.berkeleyvision.org/

Loss goes to inf when training the VGG-based SSD model in FP16 mode #551

Closed cathy-kim closed 5 years ago

cathy-kim commented 5 years ago

Hi. I would like to train SSD from a pretrained model in FP16 mode. However, whether I use the pretrained model or not, the loss goes to inf as soon as training starts. How can I make this work, and what should I change? Could you share prototxt files for the VOC0712 dataset in FP16 mode? Thank you.

Here is my train.prototxt:

name: "VGG_VOC0712_SSD_300x300_train_fp16"
default_forward_type: FLOAT16
default_backward_type: FLOAT16
default_forward_math: FLOAT
default_backward_math: FLOAT
global_grad_scale: 1000.
global_grad_scale_adaptive: true

layer { forward_type: FLOAT backward_type: FLOAT

name: "data" type: "AnnotatedData" top: "data" top: "label" include { phase: TRAIN } transform_param { mirror: true mean_value: 104 mean_value: 117 mean_value: 123 resize_param { prob: 1 resize_mode: WARP height: 300 width: 300 interp_mode: LINEAR interp_mode: AREA interp_mode: NEAREST interp_mode: CUBIC interp_mode: LANCZOS4 } emit_constraint { emit_type: CENTER } distort_param { brightness_prob: 0.5 brightness_delta: 32 contrast_prob: 0.5 contrast_lower: 0.5 contrast_upper: 1.5 hue_prob: 0.5 hue_delta: 18 saturation_prob: 0.5 saturation_lower: 0.5 saturation_upper: 1.5 random_order_prob: 0.0 } expand_param { prob: 0.5 max_expand_ratio: 4.0 } } data_param { source: "/home/etri/data/VOCdevkit/VOC0712/lmdb/VOC0712_trainval_lmdb" batch_size: 32 backend: LMDB } annotated_data_param { batch_sampler { max_sample: 1 max_trials: 1 } batch_sampler { sampler { min_scale: 0.3 max_scale: 1.0 min_aspect_ratio: 0.5 max_aspect_ratio: 2.0 } sample_constraint { min_jaccard_overlap: 0.1 } max_sample: 1 max_trials: 50 } batch_sampler { sampler { min_scale: 0.3 max_scale: 1.0 min_aspect_ratio: 0.5 max_aspect_ratio: 2.0 } sample_constraint { min_jaccard_overlap: 0.3 } max_sample: 1 max_trials: 50 } batch_sampler { sampler { min_scale: 0.3 max_scale: 1.0 min_aspect_ratio: 0.5 max_aspect_ratio: 2.0 } sample_constraint { min_jaccard_overlap: 0.5 } max_sample: 1 max_trials: 50 } batch_sampler { sampler { min_scale: 0.3 max_scale: 1.0 min_aspect_ratio: 0.5 max_aspect_ratio: 2.0 } sample_constraint { min_jaccard_overlap: 0.7 } max_sample: 1 max_trials: 50 } batch_sampler { sampler { min_scale: 0.3 max_scale: 1.0 min_aspect_ratio: 0.5 max_aspect_ratio: 2.0 } sample_constraint { min_jaccard_overlap: 0.9 } max_sample: 1 max_trials: 50 } batch_sampler { sampler { min_scale: 0.3 max_scale: 1.0 min_aspect_ratio: 0.5 max_aspect_ratio: 2.0 } sample_constraint { max_jaccard_overlap: 1.0 } max_sample: 1 max_trials: 50 } label_map_file: "data/VOC0712/labelmap_voc.prototxt" } } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" 
bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } dilation: 1 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } dilation: 1 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } dilation: 1 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" 
top: "conv5_3" } layer { name: "pool5" type: "Pooling" bottom: "conv5_3" top: "pool5" pooling_param { pool: MAX kernel_size: 3 stride: 1 pad: 1 } } layer { name: "fc6" type: "Convolution" bottom: "pool5" top: "fc6" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { engine:CAFFE

num_output: 1024
pad: 6
kernel_size: 3
weight_filler {
  type: "xavier"
}
bias_filler {
  type: "constant"
  value: 0
}
dilation: 6

} } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "Convolution" bottom: "fc6" top: "fc7" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 1024 kernel_size: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "conv6_1" type: "Convolution" bottom: "fc7" top: "conv6_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 0 kernel_size: 1 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv6_1_relu" type: "ReLU" bottom: "conv6_1" top: "conv6_1" } layer { name: "conv6_2" type: "Convolution" bottom: "conv6_1" top: "conv6_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 stride: 2 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv6_2_relu" type: "ReLU" bottom: "conv6_2" top: "conv6_2" } layer { name: "conv7_1" type: "Convolution" bottom: "conv6_2" top: "conv7_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 128 pad: 0 kernel_size: 1 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv7_1_relu" type: "ReLU" bottom: "conv7_1" top: "conv7_1" } layer { name: "conv7_2" type: "Convolution" bottom: "conv7_1" top: "conv7_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 stride: 2 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv7_2_relu" type: "ReLU" bottom: "conv7_2" top: "conv7_2" } layer { name: "conv8_1" type: "Convolution" bottom: "conv7_2" top: "conv8_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 128 pad: 0 kernel_size: 1 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv8_1_relu" type: "ReLU" bottom: "conv8_1" top: "conv8_1" } layer { name: "conv8_2" type: "Convolution" bottom: "conv8_1" top: "conv8_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 0 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv8_2_relu" type: "ReLU" bottom: "conv8_2" top: "conv8_2" } layer { name: "conv9_1" type: "Convolution" bottom: "conv8_2" top: "conv9_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 128 pad: 0 kernel_size: 1 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv9_1_relu" type: "ReLU" bottom: "conv9_1" top: "conv9_1" } layer { name: "conv9_2" type: "Convolution" bottom: "conv9_1" top: "conv9_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 0 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv9_2_relu" type: "ReLU" bottom: "conv9_2" top: "conv9_2" } layer { name: "conv4_3_norm" type: "Normalize" bottom: "conv4_3" top: "conv4_3_norm" norm_param { across_spatial: false scale_filler { type: "constant" value: 20 } channel_shared: false } } layer 
{ name: "conv4_3_norm_mbox_loc" type: "Convolution" bottom: "conv4_3_norm" top: "conv4_3_norm_mbox_loc" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 16 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv4_3_norm_mbox_loc_perm" type: "Permute" bottom: "conv4_3_norm_mbox_loc" top: "conv4_3_norm_mbox_loc_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "conv4_3_norm_mbox_loc_flat" type: "Flatten" bottom: "conv4_3_norm_mbox_loc_perm" top: "conv4_3_norm_mbox_loc_flat" flatten_param { axis: 1 } } layer { name: "conv4_3_norm_mbox_conf" type: "Convolution" bottom: "conv4_3_norm" top: "conv4_3_norm_mbox_conf" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 84 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv4_3_norm_mbox_conf_perm" type: "Permute" bottom: "conv4_3_norm_mbox_conf" top: "conv4_3_norm_mbox_conf_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "conv4_3_norm_mbox_conf_flat" type: "Flatten" bottom: "conv4_3_norm_mbox_conf_perm" top: "conv4_3_norm_mbox_conf_flat" flatten_param { axis: 1 } } layer { forward_type: FLOAT backward_type: FLOAT

name: "conv4_3_norm_mbox_priorbox" type: "PriorBox" bottom: "conv4_3_norm" bottom: "data" top: "conv4_3_norm_mbox_priorbox" prior_box_param { min_size: 30.0 max_size: 60.0 aspect_ratio: 2 flip: true clip: false variance: 0.1 variance: 0.1 variance: 0.2 variance: 0.2 step: 8 offset: 0.5 } } layer { name: "fc7_mbox_loc" type: "Convolution" bottom: "fc7" top: "fc7_mbox_loc" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 24 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "fc7_mbox_loc_perm" type: "Permute" bottom: "fc7_mbox_loc" top: "fc7_mbox_loc_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "fc7_mbox_loc_flat" type: "Flatten" bottom: "fc7_mbox_loc_perm" top: "fc7_mbox_loc_flat" flatten_param { axis: 1 } } layer { name: "fc7_mbox_conf" type: "Convolution" bottom: "fc7" top: "fc7_mbox_conf" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 126 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "fc7_mbox_conf_perm" type: "Permute" bottom: "fc7_mbox_conf" top: "fc7_mbox_conf_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "fc7_mbox_conf_flat" type: "Flatten" bottom: "fc7_mbox_conf_perm" top: "fc7_mbox_conf_flat" flatten_param { axis: 1 } } layer { forward_type: FLOAT backward_type: FLOAT

name: "fc7_mbox_priorbox" type: "PriorBox" bottom: "fc7" bottom: "data" top: "fc7_mbox_priorbox" prior_box_param { min_size: 60.0 max_size: 111.0 aspect_ratio: 2 aspect_ratio: 3 flip: true clip: false variance: 0.1 variance: 0.1 variance: 0.2 variance: 0.2 step: 16 offset: 0.5 } } layer { name: "conv6_2_mbox_loc" type: "Convolution" bottom: "conv6_2" top: "conv6_2_mbox_loc" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 24 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv6_2_mbox_loc_perm" type: "Permute" bottom: "conv6_2_mbox_loc" top: "conv6_2_mbox_loc_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "conv6_2_mbox_loc_flat" type: "Flatten" bottom: "conv6_2_mbox_loc_perm" top: "conv6_2_mbox_loc_flat" flatten_param { axis: 1 } } layer { name: "conv6_2_mbox_conf" type: "Convolution" bottom: "conv6_2" top: "conv6_2_mbox_conf" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 126 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv6_2_mbox_conf_perm" type: "Permute" bottom: "conv6_2_mbox_conf" top: "conv6_2_mbox_conf_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "conv6_2_mbox_conf_flat" type: "Flatten" bottom: "conv6_2_mbox_conf_perm" top: "conv6_2_mbox_conf_flat" flatten_param { axis: 1 } } layer {

forward_type: FLOAT backward_type: FLOAT

name: "conv6_2_mbox_priorbox" type: "PriorBox" bottom: "conv6_2" bottom: "data" top: "conv6_2_mbox_priorbox" prior_box_param { min_size: 111.0 max_size: 162.0 aspect_ratio: 2 aspect_ratio: 3 flip: true clip: false variance: 0.1 variance: 0.1 variance: 0.2 variance: 0.2 step: 32 offset: 0.5 } } layer { name: "conv7_2_mbox_loc" type: "Convolution" bottom: "conv7_2" top: "conv7_2_mbox_loc" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 24 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv7_2_mbox_loc_perm" type: "Permute" bottom: "conv7_2_mbox_loc" top: "conv7_2_mbox_loc_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "conv7_2_mbox_loc_flat" type: "Flatten" bottom: "conv7_2_mbox_loc_perm" top: "conv7_2_mbox_loc_flat" flatten_param { axis: 1 } } layer { name: "conv7_2_mbox_conf" type: "Convolution" bottom: "conv7_2" top: "conv7_2_mbox_conf" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 126 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv7_2_mbox_conf_perm" type: "Permute" bottom: "conv7_2_mbox_conf" top: "conv7_2_mbox_conf_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "conv7_2_mbox_conf_flat" type: "Flatten" bottom: "conv7_2_mbox_conf_perm" top: "conv7_2_mbox_conf_flat" flatten_param { axis: 1 } } layer { forward_type: FLOAT backward_type: FLOAT

name: "conv7_2_mbox_priorbox" type: "PriorBox" bottom: "conv7_2" bottom: "data" top: "conv7_2_mbox_priorbox" prior_box_param { min_size: 162.0 max_size: 213.0 aspect_ratio: 2 aspect_ratio: 3 flip: true clip: false variance: 0.1 variance: 0.1 variance: 0.2 variance: 0.2 step: 64 offset: 0.5 } } layer { name: "conv8_2_mbox_loc" type: "Convolution" bottom: "conv8_2" top: "conv8_2_mbox_loc" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 16 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv8_2_mbox_loc_perm" type: "Permute" bottom: "conv8_2_mbox_loc" top: "conv8_2_mbox_loc_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "conv8_2_mbox_loc_flat" type: "Flatten" bottom: "conv8_2_mbox_loc_perm" top: "conv8_2_mbox_loc_flat" flatten_param { axis: 1 } } layer { name: "conv8_2_mbox_conf" type: "Convolution" bottom: "conv8_2" top: "conv8_2_mbox_conf" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 84 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv8_2_mbox_conf_perm" type: "Permute" bottom: "conv8_2_mbox_conf" top: "conv8_2_mbox_conf_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "conv8_2_mbox_conf_flat" type: "Flatten" bottom: "conv8_2_mbox_conf_perm" top: "conv8_2_mbox_conf_flat" flatten_param { axis: 1 } } layer { name: "conv8_2_mbox_priorbox" type: "PriorBox" bottom: "conv8_2" bottom: "data" top: "conv8_2_mbox_priorbox" prior_box_param { min_size: 213.0 max_size: 264.0 aspect_ratio: 2 flip: true clip: false variance: 0.1 variance: 0.1 variance: 0.2 variance: 0.2 step: 100 offset: 0.5 } } layer { name: "conv9_2_mbox_loc" type: "Convolution" bottom: "conv9_2" top: "conv9_2_mbox_loc" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 16 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv9_2_mbox_loc_perm" type: "Permute" bottom: "conv9_2_mbox_loc" top: "conv9_2_mbox_loc_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "conv9_2_mbox_loc_flat" type: "Flatten" bottom: "conv9_2_mbox_loc_perm" top: "conv9_2_mbox_loc_flat" flatten_param { axis: 1 } } layer { name: "conv9_2_mbox_conf" type: "Convolution" bottom: "conv9_2" top: "conv9_2_mbox_conf" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 84 pad: 1 kernel_size: 3 stride: 1 weight_filler { type: "xavier" } bias_filler { type: "constant" value: 0 } } } layer { name: "conv9_2_mbox_conf_perm" type: "Permute" bottom: "conv9_2_mbox_conf" top: "conv9_2_mbox_conf_perm" permute_param { order: 0 order: 2 order: 3 order: 1 } } layer { name: "conv9_2_mbox_conf_flat" type: "Flatten" bottom: "conv9_2_mbox_conf_perm" top: "conv9_2_mbox_conf_flat" flatten_param { axis: 1 } } layer { name: "conv9_2_mbox_priorbox" type: "PriorBox" bottom: "conv9_2" bottom: "data" top: "conv9_2_mbox_priorbox" prior_box_param { min_size: 264.0 max_size: 315.0 aspect_ratio: 2 flip: true clip: false variance: 0.1 variance: 0.1 variance: 0.2 variance: 0.2 step: 300 offset: 0.5 } } layer { name: "mbox_loc" type: "Concat" bottom: "conv4_3_norm_mbox_loc_flat" bottom: "fc7_mbox_loc_flat" bottom: "conv6_2_mbox_loc_flat" bottom: "conv7_2_mbox_loc_flat" bottom: 
"conv8_2_mbox_loc_flat" bottom: "conv9_2_mbox_loc_flat" top: "mbox_loc" concat_param { axis: 1 } } layer { name: "mbox_conf" type: "Concat" bottom: "conv4_3_norm_mbox_conf_flat" bottom: "fc7_mbox_conf_flat" bottom: "conv6_2_mbox_conf_flat" bottom: "conv7_2_mbox_conf_flat" bottom: "conv8_2_mbox_conf_flat" bottom: "conv9_2_mbox_conf_flat" top: "mbox_conf" concat_param { axis: 1 } } layer { name: "mbox_priorbox" type: "Concat" bottom: "conv4_3_norm_mbox_priorbox" bottom: "fc7_mbox_priorbox" bottom: "conv6_2_mbox_priorbox" bottom: "conv7_2_mbox_priorbox" bottom: "conv8_2_mbox_priorbox" bottom: "conv9_2_mbox_priorbox" top: "mbox_priorbox" concat_param { axis: 2 } } layer { forward_type: FLOAT backward_type: FLOAT

name: "mbox_loss" type: "MultiBoxLoss" bottom: "mbox_loc" bottom: "mbox_conf" bottom: "mbox_priorbox" bottom: "label" top: "mbox_loss" include { phase: TRAIN } propagate_down: true propagate_down: true propagate_down: false propagate_down: false loss_param { normalization: VALID } multibox_loss_param { loc_loss_type: SMOOTH_L1 conf_loss_type: SOFTMAX loc_weight: 1.0 num_classes: 21 share_location: true match_type: PER_PREDICTION overlap_threshold: 0.5 use_prior_for_matching: true background_label_id: 0 use_difficult_gt: true neg_pos_ratio: 3.0 neg_overlap: 0.5 code_type: CENTER_SIZE ignore_cross_boundary_bbox: false mining_type: MAX_NEGATIVE } }

drnikolaev commented 5 years ago

Hi @Seojin24 could you try this please:

#global_grad_scale: 1000.
global_grad_scale_adaptive: true
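
For reference, this is how the top of the posted train.prototxt would look with that suggestion applied; only the grad-scale lines change, everything else is kept as posted (a sketch, not a verified fix):

name: "VGG_VOC0712_SSD_300x300_train_fp16"

# data and math types exactly as in the posted prototxt
default_forward_type: FLOAT16
default_backward_type: FLOAT16
default_forward_math: FLOAT
default_backward_math: FLOAT

# fixed loss scale commented out; adaptive grad scaling left on
# global_grad_scale: 1000.
global_grad_scale_adaptive: true
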
cathy-kim commented 5 years ago

Hi @Seojin24 could you try this please:

#global_grad_scale: 1000.
global_grad_scale_adaptive: true

Thank you for the answer. The problem was solved when I used "default_forward_math: FLOAT16" and "default_backward_math: FLOAT16"
and added "forward_type: FLOAT" and "backward_type: FLOAT" to every PriorBox layer.

However, the pretrained-model issue is still not solved. Does the pretrained model also need to have been trained in FP16 if I want to use it in FP16 training mode?