tensorflow / models

Models and examples built with TensorFlow

DeepLab V3+ predicts a black mask #9099

Open ouzzane opened 4 years ago

ouzzane commented 4 years ago

I trained the DeepLab V3+ network on my own dataset, but at test time I got black mask predictions after about ten training iterations. I then ran several tests and noticed that training works fine at the start, but from roughly iteration 6-10 onward it starts to diverge toward a black image. The source code is https://github.com/rishizek/tensorflow-deeplab-v3-plus. I would like to know where the problem comes from; can you help me please?
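
For reference, a minimal way to confirm that the predictions really collapse to a single class (the output directory here is hypothetical, not part of the original setup):

```python
import glob

import numpy as np
from PIL import Image

# Hypothetical folder holding the predicted masks saved at inference time.
for path in sorted(glob.glob('./predictions/*.png')):
    mask = np.array(Image.open(path))
    # An all-black mask shows up as a single unique value (class 0).
    print(path, 'unique classes:', np.unique(mask))
```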

ravikyram commented 4 years ago

@ouzzane

Please fill in the issue template.

Can you please share a Colab link or a code snippet to reproduce the issue in our environment? It helps us localize the issue faster. Thanks!

ouzzane commented 4 years ago

train.py:

```python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import shutil
import sys

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python import debug as tf_debug

import deeplab_model
from utils import preprocessing

parser = argparse.ArgumentParser()

parser.add_argument('--model_dir', type=str, default='./model/',
                    help='Base directory for the model.')

parser.add_argument('--clean_model_dir', action='store_true',
                    help='Whether to clean up the model directory if present.')

parser.add_argument('--train_epochs', type=int, default=26,
                    help='Number of training epochs: '
                         'For 30K iteration with batch size 6, train_epoch = 17.01 (= 30K * 6 / 10,582). '
                         'For 30K iteration with batch size 8, train_epoch = 22.68 (= 30K * 8 / 10,582). '
                         'For 30K iteration with batch size 10, train_epoch = 25.52 (= 30K * 10 / 10,582). '
                         'For 30K iteration with batch size 11, train_epoch = 31.19 (= 30K * 11 / 10,582). '
                         'For 30K iteration with batch size 15, train_epoch = 42.53 (= 30K * 15 / 10,582). '
                         'For 30K iteration with batch size 16, train_epoch = 45.36 (= 30K * 16 / 10,582).')

parser.add_argument('--epochs_per_eval', type=int, default=1,
                    help='The number of training epochs to run between evaluations.')

parser.add_argument('--tensorboard_images_max_outputs', type=int, default=6,
                    help='Max number of batch elements to generate for Tensorboard.')

parser.add_argument('--batch_size', type=int, default=2,
                    help='Number of examples per batch.')

parser.add_argument('--learning_rate_policy', type=str, default='poly',
                    choices=['poly', 'piecewise'],
                    help='Learning rate policy to optimize loss.')

parser.add_argument('--max_iter', type=int, default=60000,
                    help='Number of maximum iteration used for "poly" learning rate policy.')

parser.add_argument('--data_dir', type=str, default='./dataset/',
                    help='Path to the directory containing the PASCAL VOC data tf record.')

parser.add_argument('--base_architecture', type=str, default='resnet_v2_101',
                    choices=['resnet_v2_50', 'resnet_v2_101'],
                    help='The architecture of base Resnet building block.')

parser.add_argument('--pre_trained_model', type=str,
                    default='./ini_checkpoints/resnet_v2_101/resnet_v2_101.ckpt',
                    help='Path to the pre-trained model checkpoint.')

parser.add_argument('--output_stride', type=int, default=16, choices=[8, 16],
                    help='Output stride for DeepLab v3. Currently 8 or 16 is supported.')

parser.add_argument('--freeze_batch_norm', action='store_true',
                    help='Freeze batch normalization parameters during the training.')

parser.add_argument('--initial_learning_rate', type=float, default=0.01,
                    help='Initial learning rate for the optimizer.')

parser.add_argument('--end_learning_rate', type=float, default=1e-4,
                    help='End learning rate for the optimizer.')

parser.add_argument('--initial_global_step', type=int, default=0,
                    help='Initial global step for controlling learning rate when fine-tuning model.')

parser.add_argument('--weight_decay', type=float, default=2e-4,
                    help='The weight decay to use for regularizing the model.')

parser.add_argument('--debug', action='store_true',
                    help='Whether to use debugger to track down bad values during training.')

_NUM_CLASSES = 6
_HEIGHT = 576
_WIDTH = 432
_DEPTH = 3
_MIN_SCALE = 0.5
_MAX_SCALE = 2.0
_IGNORE_LABEL = 255

_POWER = 0.9
_MOMENTUM = 0.9

_BATCH_NORM_DECAY = 0.9997

_NUM_IMAGES = {
    'train': 1008,
    'validation': 120,
}


def get_filenames(is_training, data_dir):
  """Return a list of filenames.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: path to the directory containing the input data.

  Returns:
    A list of file names.
  """
  if is_training:
    return [os.path.join(data_dir, 'PQR/tfrecord/PQR_train.record')]
  else:
    return [os.path.join(data_dir, 'PQR/tfrecord/PQR_val.record')]


def parse_record(raw_record):
  """Parse PASCAL image and label from a tf record."""
  keys_to_features = {
      'image/height': tf.FixedLenFeature((), tf.int64),
      'image/width': tf.FixedLenFeature((), tf.int64),
      'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
      'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
      'label/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
      'label/format': tf.FixedLenFeature((), tf.string, default_value='png'),
  }

  parsed = tf.parse_single_example(raw_record, keys_to_features)

  height = tf.cast(parsed['image/height'], tf.int32)
  width = tf.cast(parsed['image/width'], tf.int32)

  image = tf.image.decode_image(
      tf.reshape(parsed['image/encoded'], shape=[]), _DEPTH)
  image = tf.to_float(tf.image.convert_image_dtype(image, dtype=tf.uint8))
  image.set_shape([None, None, 3])

  label = tf.image.decode_image(
      tf.reshape(parsed['label/encoded'], shape=[]), 1)
  label = tf.to_int32(tf.image.convert_image_dtype(label, dtype=tf.uint8))
  label.set_shape([None, None, 1])

  return image, label


def preprocess_image(image, label, is_training):
  """Preprocess a single image of layout [height, width, depth]."""
  if is_training:
    # Randomly scale the image and label.
    image, label = preprocessing.random_rescale_image_and_label(
        image, label, _MIN_SCALE, _MAX_SCALE)

    # Randomly crop or pad a [_HEIGHT, _WIDTH] section of the image and label.
    image, label = preprocessing.random_crop_or_pad_image_and_label(
        image, label, _HEIGHT, _WIDTH, _IGNORE_LABEL)

    # Randomly flip the image and label horizontally.
    image, label = preprocessing.random_flip_left_right_image_and_label(
        image, label)

    image.set_shape([_HEIGHT, _WIDTH, 3])
    label.set_shape([_HEIGHT, _WIDTH, 1])

  image = preprocessing.mean_image_subtraction(image)

  return image, label


def input_fn(is_training, data_dir, batch_size, num_epochs=1):
  """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.

  Returns:
    A tuple of images and labels.
  """
  dataset = tf.data.Dataset.from_tensor_slices(
      get_filenames(is_training, data_dir))
  dataset = dataset.flat_map(tf.data.TFRecordDataset)

  if is_training:
    # When choosing shuffle buffer sizes, larger sizes result in better
    # randomness, while smaller sizes have better performance. Because this
    # is a relatively small dataset, we choose to shuffle the full epoch.
    dataset = dataset.shuffle(buffer_size=_NUM_IMAGES['train'])

  dataset = dataset.map(parse_record)
  dataset = dataset.map(
      lambda image, label: preprocess_image(image, label, is_training))
  dataset = dataset.prefetch(batch_size)

  # We call repeat after shuffling, rather than before, to prevent separate
  # epochs from blending together.
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)

  iterator = dataset.make_one_shot_iterator()
  images, labels = iterator.get_next()

  return images, labels


def main(unused_argv):
  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  if FLAGS.clean_model_dir:
    shutil.rmtree(FLAGS.model_dir, ignore_errors=True)

  # Set up a RunConfig to only save checkpoints once per training cycle.
  run_config = tf.estimator.RunConfig().replace(save_checkpoints_secs=1e9)
  model = tf.estimator.Estimator(
      model_fn=deeplab_model.deeplabv3_plus_model_fn,
      model_dir=FLAGS.model_dir,
      config=run_config,
      params={
          'output_stride': FLAGS.output_stride,
          'batch_size': FLAGS.batch_size,
          'base_architecture': FLAGS.base_architecture,
          'pre_trained_model': FLAGS.pre_trained_model,
          'batch_norm_decay': _BATCH_NORM_DECAY,
          'num_classes': _NUM_CLASSES,
          'tensorboard_images_max_outputs': FLAGS.tensorboard_images_max_outputs,
          'weight_decay': FLAGS.weight_decay,
          'learning_rate_policy': FLAGS.learning_rate_policy,
          'num_train': _NUM_IMAGES['train'],
          'initial_learning_rate': FLAGS.initial_learning_rate,
          'max_iter': FLAGS.max_iter,
          'end_learning_rate': FLAGS.end_learning_rate,
          'power': _POWER,
          'momentum': _MOMENTUM,
          'freeze_batch_norm': FLAGS.freeze_batch_norm,
          'initial_global_step': FLAGS.initial_global_step
      })

  for _ in range(FLAGS.train_epochs // FLAGS.epochs_per_eval):
    tensors_to_log = {
        'learning_rate': 'learning_rate',
        'cross_entropy': 'cross_entropy',
        'train_px_accuracy': 'train_px_accuracy',
        'train_mean_iou': 'train_mean_iou',
    }

    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=10)
    train_hooks = [logging_hook]
    eval_hooks = None

    if FLAGS.debug:
      debug_hook = tf_debug.LocalCLIDebugHook()
      train_hooks.append(debug_hook)
      eval_hooks = [debug_hook]

    tf.logging.info("Start training.")
    model.train(
        input_fn=lambda: input_fn(True, FLAGS.data_dir, FLAGS.batch_size, FLAGS.epochs_per_eval),
        hooks=train_hooks,
        # steps=1  # For debug
    )

    tf.logging.info("Start evaluation.")
    # Evaluate the model and print results
    eval_results = model.evaluate(
        # Batch size must be 1 for testing because the images' size differs
        input_fn=lambda: input_fn(False, FLAGS.data_dir, 1),
        hooks=eval_hooks,
        # steps=1  # For debug
    )
    print(eval_results)


if __name__ == '__main__':
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  FLAGS, unparsed = parser.parse_known_args()
  tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed)
```
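
A quick sanity check on the ground-truth masks may help here, since an all-black prediction usually means the network only ever outputs class 0. Below is a minimal sketch, assuming TensorFlow 1.15 and the PQR tfrecord layout used by get_filenames() above (the record path is an assumption taken from that function); it decodes a few label masks and prints their unique values, which should stay within [0, _NUM_CLASSES) plus the 255 ignore value:

```python
import numpy as np
import tensorflow as tf

# Path taken from get_filenames() in the training script above (assumption).
record_path = './dataset/PQR/tfrecord/PQR_train.record'

# Only the encoded label is needed for this check.
features = {'label/encoded': tf.FixedLenFeature((), tf.string, default_value='')}

dataset = tf.data.TFRecordDataset([record_path])
dataset = dataset.map(lambda raw: tf.parse_single_example(raw, features))
dataset = dataset.map(
    lambda parsed: tf.image.decode_image(parsed['label/encoded'], channels=1))
label_t = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
  for i in range(5):
    label = sess.run(label_t)
    # Expect a subset of {0, ..., 5} plus 255 for ignored pixels; colour codes
    # or a single constant value would point to a label-encoding problem.
    print('record %d: label shape %s, unique values %s'
          % (i, label.shape, np.unique(label)))
```

If the labels look correct, another cheap experiment is to lower --initial_learning_rate (for example to 0.001), since a diverging loss under the poly schedule can also drive the model toward the all-background solution.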

ravikyram commented 4 years ago

@ouzzane

Which TensorFlow version are you using?

Thanks!

ouzzane commented 4 years ago

Hello, thank you for your reply ravikyram. I am using TensorFlow 1.15. Best regards.

lxyzler commented 4 years ago

I have the same problem. Have you solved it, @ouzzane?

ouzzane commented 4 years ago

@lxyzler Unfortunately, not yet.