tensorflow / models

Models and examples built with TensorFlow

DeepLab V3+ predicts a black mask #9099

Open ouzzane opened 4 years ago

ouzzane commented 4 years ago

I trained the DeepLab V3+ network on my own dataset, but at test time I got black mask predictions after about ten training iterations. I then ran several tests and noticed that training works fine at the start, but from roughly iteration 6-10 onward it starts to diverge toward a black image. The source code is https://github.com/rishizek/tensorflow-deeplab-v3-plus. I would like to know where the problem comes from; can you help me please?
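
For reference, a minimal way to confirm that the predictions really collapse to a single class (the output directory here is hypothetical, not part of the original setup):

```python
import glob

import numpy as np
from PIL import Image

# Hypothetical folder holding the predicted masks saved at inference time.
for path in sorted(glob.glob('./predictions/*.png')):
    mask = np.array(Image.open(path))
    # An all-black mask shows up as a single unique value (class 0).
    print(path, 'unique classes:', np.unique(mask))
```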

ravikyram commented 4 years ago

@ouzzane

Please fill in the issue template.

Can you please share a Colab link or a code snippet to reproduce the issue in our environment? It helps us localize the issue faster. Thanks!

ouzzane commented 4 years ago

train.py:

```python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import shutil
import sys

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python import debug as tf_debug

import deeplab_model
from utils import preprocessing

parser = argparse.ArgumentParser()

parser.add_argument('--model_dir', type=str, default='./model/',
                    help='Base directory for the model.')

parser.add_argument('--clean_model_dir', action='store_true',
                    help='Whether to clean up the model directory if present.')

parser.add_argument('--train_epochs', type=int, default=26,
                    help='Number of training epochs: '
                         'For 30K iteration with batch size 6, train_epoch = 17.01 (= 30K * 6 / 10,582). '
                         'For 30K iteration with batch size 8, train_epoch = 22.68 (= 30K * 8 / 10,582). '
                         'For 30K iteration with batch size 10, train_epoch = 25.52 (= 30K * 10 / 10,582). '
                         'For 30K iteration with batch size 11, train_epoch = 31.19 (= 30K * 11 / 10,582). '
                         'For 30K iteration with batch size 15, train_epoch = 42.53 (= 30K * 15 / 10,582). '
                         'For 30K iteration with batch size 16, train_epoch = 45.36 (= 30K * 16 / 10,582).')

parser.add_argument('--epochs_per_eval', type=int, default=1,
                    help='The number of training epochs to run between evaluations.')

parser.add_argument('--tensorboard_images_max_outputs', type=int, default=6,
                    help='Max number of batch elements to generate for Tensorboard.')

parser.add_argument('--batch_size', type=int, default=2,
                    help='Number of examples per batch.')

parser.add_argument('--learning_rate_policy', type=str, default='poly',
                    choices=['poly', 'piecewise'],
                    help='Learning rate policy to optimize loss.')

parser.add_argument('--max_iter', type=int, default=60000,
                    help='Number of maximum iteration used for "poly" learning rate policy.')

parser.add_argument('--data_dir', type=str, default='./dataset/',
                    help='Path to the directory containing the PASCAL VOC data tf record.')

parser.add_argument('--base_architecture', type=str, default='resnet_v2_101',
                    choices=['resnet_v2_50', 'resnet_v2_101'],
                    help='The architecture of base Resnet building block.')

parser.add_argument('--pre_trained_model', type=str,
                    default='./ini_checkpoints/resnet_v2_101/resnet_v2_101.ckpt',
                    help='Path to the pre-trained model checkpoint.')

parser.add_argument('--output_stride', type=int, default=16, choices=[8, 16],
                    help='Output stride for DeepLab v3. Currently 8 or 16 is supported.')

parser.add_argument('--freeze_batch_norm', action='store_true',
                    help='Freeze batch normalization parameters during the training.')

parser.add_argument('--initial_learning_rate', type=float, default=0.01,
                    help='Initial learning rate for the optimizer.')

parser.add_argument('--end_learning_rate', type=float, default=1e-4,
                    help='End learning rate for the optimizer.')

parser.add_argument('--initial_global_step', type=int, default=0,
                    help='Initial global step for controlling learning rate when fine-tuning model.')

parser.add_argument('--weight_decay', type=float, default=2e-4,
                    help='The weight decay to use for regularizing the model.')

parser.add_argument('--debug', action='store_true',
                    help='Whether to use debugger to track down bad values during training.')

_NUM_CLASSES = 6
_HEIGHT = 576
_WIDTH = 432
_DEPTH = 3
_MIN_SCALE = 0.5
_MAX_SCALE = 2.0
_IGNORE_LABEL = 255

_POWER = 0.9
_MOMENTUM = 0.9

_BATCH_NORM_DECAY = 0.9997

_NUM_IMAGES = {
    'train': 1008,
    'validation': 120,
}


def get_filenames(is_training, data_dir):
  """Return a list of filenames.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: path to the directory containing the input data.

  Returns:
    A list of file names.
  """
  if is_training:
    return [os.path.join(data_dir, 'PQR/tfrecord/PQR_train.record')]
  else:
    return [os.path.join(data_dir, 'PQR/tfrecord/PQR_val.record')]


def parse_record(raw_record):
  """Parse PASCAL image and label from a tf record."""
  keys_to_features = {
      'image/height': tf.FixedLenFeature((), tf.int64),
      'image/width': tf.FixedLenFeature((), tf.int64),
      'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
      'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
      'label/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
      'label/format': tf.FixedLenFeature((), tf.string, default_value='png'),
  }

  parsed = tf.parse_single_example(raw_record, keys_to_features)

  height = tf.cast(parsed['image/height'], tf.int32)
  width = tf.cast(parsed['image/width'], tf.int32)

  image = tf.image.decode_image(
      tf.reshape(parsed['image/encoded'], shape=[]), _DEPTH)
  image = tf.to_float(tf.image.convert_image_dtype(image, dtype=tf.uint8))
  image.set_shape([None, None, 3])

  label = tf.image.decode_image(
      tf.reshape(parsed['label/encoded'], shape=[]), 1)
  label = tf.to_int32(tf.image.convert_image_dtype(label, dtype=tf.uint8))
  label.set_shape([None, None, 1])

  return image, label


def preprocess_image(image, label, is_training):
  """Preprocess a single image of layout [height, width, depth]."""
  if is_training:
    # Randomly scale the image and label.
    image, label = preprocessing.random_rescale_image_and_label(
        image, label, _MIN_SCALE, _MAX_SCALE)

    # Randomly crop or pad a [_HEIGHT, _WIDTH] section of the image and label.
    image, label = preprocessing.random_crop_or_pad_image_and_label(
        image, label, _HEIGHT, _WIDTH, _IGNORE_LABEL)

    # Randomly flip the image and label horizontally.
    image, label = preprocessing.random_flip_left_right_image_and_label(
        image, label)

    image.set_shape([_HEIGHT, _WIDTH, 3])
    label.set_shape([_HEIGHT, _WIDTH, 1])

  image = preprocessing.mean_image_subtraction(image)

  return image, label


def input_fn(is_training, data_dir, batch_size, num_epochs=1):
  """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.

  Returns:
    A tuple of images and labels.
  """
  dataset = tf.data.Dataset.from_tensor_slices(
      get_filenames(is_training, data_dir))
  dataset = dataset.flat_map(tf.data.TFRecordDataset)

  if is_training:
    # When choosing shuffle buffer sizes, larger sizes result in better
    # randomness, while smaller sizes have better performance. Because this
    # is a relatively small dataset, we choose to shuffle the full epoch.
    dataset = dataset.shuffle(buffer_size=_NUM_IMAGES['train'])

  dataset = dataset.map(parse_record)
  dataset = dataset.map(
      lambda image, label: preprocess_image(image, label, is_training))
  dataset = dataset.prefetch(batch_size)

  # We call repeat after shuffling, rather than before, to prevent separate
  # epochs from blending together.
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)

  iterator = dataset.make_one_shot_iterator()
  images, labels = iterator.get_next()

  return images, labels


def main(unused_argv):
  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  if FLAGS.clean_model_dir:
    shutil.rmtree(FLAGS.model_dir, ignore_errors=True)

  # Set up a RunConfig to only save checkpoints once per training cycle.
  run_config = tf.estimator.RunConfig().replace(save_checkpoints_secs=1e9)
  model = tf.estimator.Estimator(
      model_fn=deeplab_model.deeplabv3_plus_model_fn,
      model_dir=FLAGS.model_dir,
      config=run_config,
      params={
          'output_stride': FLAGS.output_stride,
          'batch_size': FLAGS.batch_size,
          'base_architecture': FLAGS.base_architecture,
          'pre_trained_model': FLAGS.pre_trained_model,
          'batch_norm_decay': _BATCH_NORM_DECAY,
          'num_classes': _NUM_CLASSES,
          'tensorboard_images_max_outputs': FLAGS.tensorboard_images_max_outputs,
          'weight_decay': FLAGS.weight_decay,
          'learning_rate_policy': FLAGS.learning_rate_policy,
          'num_train': _NUM_IMAGES['train'],
          'initial_learning_rate': FLAGS.initial_learning_rate,
          'max_iter': FLAGS.max_iter,
          'end_learning_rate': FLAGS.end_learning_rate,
          'power': _POWER,
          'momentum': _MOMENTUM,
          'freeze_batch_norm': FLAGS.freeze_batch_norm,
          'initial_global_step': FLAGS.initial_global_step
      })

  for _ in range(FLAGS.train_epochs // FLAGS.epochs_per_eval):
    tensors_to_log = {
        'learning_rate': 'learning_rate',
        'cross_entropy': 'cross_entropy',
        'train_px_accuracy': 'train_px_accuracy',
        'train_mean_iou': 'train_mean_iou',
    }

    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=10)
    train_hooks = [logging_hook]
    eval_hooks = None

    if FLAGS.debug:
      debug_hook = tf_debug.LocalCLIDebugHook()
      train_hooks.append(debug_hook)
      eval_hooks = [debug_hook]

    tf.logging.info("Start training.")
    model.train(
        input_fn=lambda: input_fn(True, FLAGS.data_dir, FLAGS.batch_size, FLAGS.epochs_per_eval),
        hooks=train_hooks,
        # steps=1  # For debug
    )

    tf.logging.info("Start evaluation.")
    # Evaluate the model and print results
    eval_results = model.evaluate(
        # Batch size must be 1 for testing because the images' size differs
        input_fn=lambda: input_fn(False, FLAGS.data_dir, 1),
        hooks=eval_hooks,
        # steps=1  # For debug
    )
    print(eval_results)


if __name__ == '__main__':
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  FLAGS, unparsed = parser.parse_known_args()
  tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed)
```
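
A quick sanity check on the ground-truth masks may help here, since an all-black prediction usually means the network only ever outputs class 0. Below is a minimal sketch, assuming TensorFlow 1.15 and the PQR tfrecord layout used by get_filenames() above (the record path is an assumption taken from that function); it decodes a few label masks and prints their unique values, which should stay within [0, _NUM_CLASSES) plus the 255 ignore value:

```python
import numpy as np
import tensorflow as tf

# Path taken from get_filenames() in the training script above (assumption).
record_path = './dataset/PQR/tfrecord/PQR_train.record'

# Only the encoded label is needed for this check.
features = {'label/encoded': tf.FixedLenFeature((), tf.string, default_value='')}

dataset = tf.data.TFRecordDataset([record_path])
dataset = dataset.map(lambda raw: tf.parse_single_example(raw, features))
dataset = dataset.map(
    lambda parsed: tf.image.decode_image(parsed['label/encoded'], channels=1))
label_t = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
  for i in range(5):
    label = sess.run(label_t)
    # Expect a subset of {0, ..., 5} plus 255 for ignored pixels; colour codes
    # or a single constant value would point to a label-encoding problem.
    print('record %d: label shape %s, unique values %s'
          % (i, label.shape, np.unique(label)))
```

If the labels look correct, another cheap experiment is to lower --initial_learning_rate (for example to 0.001), since a diverging loss under the poly schedule can also drive the model toward the all-background solution.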

ravikyram commented 4 years ago

@ouzzane

Which TensorFlow version are you using?

Thanks!

ouzzane commented 4 years ago

Hello, thank you for your reply ravikyram. I am using TensorFlow 1.15. Best regards.

lxyzler commented 4 years ago

I have the same problem. Have you solved it, @ouzzane?

ouzzane commented 4 years ago

@lxyzler Unfortunately, not yet.