tensorflow / models

Models and examples built with TensorFlow
Other
76.96k stars 45.79k forks source link

Mobilenet v1 for Faster r-cnn has problem in training. #4250

Closed yryun closed 6 years ago

yryun commented 6 years ago

System information

Describe the problem

I've been training Faster R-CNN with the MobileNet feature extractor for 400k iterations (batch size 8, learning rate 3e-03), but its mAP is almost zero. I'm using my own dataset, and training goes well with Inception V2, ResNet-50, and ResNet-101; only MobileNet has this problem. It seems the first-stage loss is too high, so the second stage is not processed most of the time. I tried stride sizes of 16 and 8, but the problem is the same.

Source code / logs

Here are the losses of the first and second stages. image image image

Here are my TensorBoard distributions. Compared to Inception V2, MobileNet shows no change in the moving_mean and moving_variance of the second-stage conv2d_12 and conv2d_13 layers. This could be a clue.

image

Here is my config file:

model {
  faster_rcnn {
    #mobile16 + size_+ +512depth+ crop10 + anchor box2 + proposal 100
    num_classes: 2
    image_resizer {
      fixed_shape_resizer {
        height: 288
        width: 960
      }
    }
    feature_extractor {
      type: "faster_rcnn_mobilenet"
      first_stage_features_stride: 16
    }
    first_stage_anchor_generator {
      grid_anchor_generator {
        height_stride: 16
        width_stride: 16

        scales: 0.125
        scales: 0.25
        scales: 0.5
        scales: 1.0
        scales: 1.5
        scales: 2.0

        aspect_ratios: 0.48
        aspect_ratios: 0.65
        aspect_ratios: 1.09
        aspect_ratios: 1.28
    aspect_ratios: 1.48
        aspect_ratios: 1.70

      }
    }
    first_stage_box_predictor_conv_hyperparams {
      op: CONV
      regularizer {
        l2_regularizer {
          weight: 0.0
        }
      }
      initializer {
        truncated_normal_initializer {
          stddev: 0.00999999977648
        }
      }
    }
    first_stage_nms_score_threshold: 0.0
    first_stage_nms_iou_threshold: 0.699999988079
    first_stage_max_proposals: 100
    first_stage_localization_loss_weight: 2.0
    first_stage_objectness_loss_weight: 1.0

    first_stage_box_predictor_depth: 512

    initial_crop_size: 10
    maxpool_kernel_size: 2
    maxpool_stride: 2
    second_stage_box_predictor {
      mask_rcnn_box_predictor {
        fc_hyperparams {
          op: FC
          regularizer {
            l2_regularizer {
              weight: 0.0
            }
          }
          initializer {
            variance_scaling_initializer {
              factor: 1.0
              uniform: true
              mode: FAN_AVG
            }
          }
        }
        #use_dropout: true
        #dropout_keep_probability: 0.8
      }
    }
    second_stage_post_processing {
      batch_non_max_suppression {
        score_threshold: 0.0
        iou_threshold: 0.600000023842
        max_detections_per_class: 100
        max_total_detections: 100
      }
      score_converter: SOFTMAX
    }
    second_stage_localization_loss_weight: 2.0
    second_stage_classification_loss_weight: 1.0
  }
}
train_config {
  batch_size: 8
  data_augmentation_options {
    random_horizontal_flip {
    }
  }

  data_augmentation_options {
    random_crop_pad_image {
    }
  }
  optimizer {
    momentum_optimizer {
      learning_rate {
        manual_step_learning_rate {
          initial_learning_rate: 3e-03

          schedule {
            step: 400000
            learning_rate: 3e-04
          }
          schedule {
            step: 1200000
            learning_rate: 3e-05
          }
          schedule {
            step: 1500000
            learning_rate: 3e-06
          }
        }
      }
      momentum_optimizer_value: 0.899999976158
    }
    use_moving_average: false
  }
  gradient_clipping_by_norm: 10.0
  from_detection_checkpoint: false
  num_steps: 1600000
}
train_input_reader {
  label_map_path: "/home/yryun/fasterRCNN/models/research/object_detection/data/kitti_label_map_v3.pbtxt"
  tf_record_input_reader {
    input_path: "/home/yryun/fasterRCNN/models/research/object_detection/yeongro/blackbox_train_v3.record_train.tfrecord"
  }
  shuffle: true
}
eval_config {
  num_examples: 592
  metrics_set: "pascal_voc_metrics"
  use_moving_averages: false
}
eval_input_reader {
  label_map_path: "/home/yryun/fasterRCNN/models/research/object_detection/data/kitti_label_map_v3.pbtxt"
  tf_record_input_reader {
    input_path: "/home/yryun/fasterRCNN/models/research/object_detection/yeongro/blackbox_test_v3.record_train.tfrecord"
  }
  shuffle: false
}
robieta commented 6 years ago

This appears to be a hyperparameter tuning problem rather than a bug or feature with the object detection code itself. Such support questions are outside the purview of this repo's issues, and are better asked on the TensorFlow Stack Overflow page. There is also a larger community that reads questions there.

If you think we've misinterpreted a bug, please comment again with a clear explanation, as well as all of the information requested in the issue template. Thanks!