matterport / Mask_RCNN

Mask R-CNN for object detection and instance segmentation on Keras and TensorFlow

Is it possible to save the whole model and resume training? #1538

Open · adriaciurana opened this issue 5 years ago

adriaciurana commented 5 years ago

Hello, I'm trying to save the whole model (including the optimizer) so that I can continue training from the same state as the last checkpoint. As far as I can see, the library doesn't provide any method to save the whole model.

For now, I have tried changing the ModelCheckpoint callback to save_weights_only=False, but I get the following error:

Traceback (most recent call last):
  File "./mrcnn/train.py", line 264, in <module>
    train(model)
  File "./mrcnn/train.py", line 173, in train
    save_weights_only=False)
  File "/media/Datos/git/bdeo/stream_learning/mrcnn/model.py", line 2551, in train
    use_multiprocessing=True,
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/site-packages/keras/engine/training.py", line 1418, in fit_generator
    initial_epoch=initial_epoch)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/site-packages/keras/engine/training_generator.py", line 251, in fit_generator
    callbacks.on_epoch_end(epoch, epoch_logs)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/site-packages/keras/callbacks.py", line 79, in on_epoch_end
    callback.on_epoch_end(epoch, logs)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/site-packages/keras/callbacks.py", line 457, in on_epoch_end
    self.model.save(filepath, overwrite=True)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/site-packages/keras/engine/network.py", line 1090, in save
    save_model(self, filepath, overwrite, include_optimizer)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/site-packages/keras/engine/saving.py", line 382, in save_model
    _serialize_model(model, f, include_optimizer)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/site-packages/keras/engine/saving.py", line 83, in _serialize_model
    model_config['config'] = model.get_config()
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/site-packages/keras/engine/network.py", line 931, in get_config
    return copy.deepcopy(config)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 243, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 218, in _deepcopy_list
    y.append(deepcopy(a, memo))
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 243, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 243, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 223, in _deepcopy_tuple
    y = [deepcopy(a, memo) for a in x]
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 223, in <listcomp>
    y = [deepcopy(a, memo) for a in x]
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 223, in _deepcopy_tuple
    y = [deepcopy(a, memo) for a in x]
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 223, in <listcomp>
    y = [deepcopy(a, memo) for a in x]
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 182, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 297, in _reconstruct
    state = deepcopy(state, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 243, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 182, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 297, in _reconstruct
    state = deepcopy(state, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 243, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 182, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 297, in _reconstruct
    state = deepcopy(state, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 243, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 182, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 297, in _reconstruct
    state = deepcopy(state, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 243, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 182, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 291, in _reconstruct
    args = deepcopy(args, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 223, in _deepcopy_tuple
    y = [deepcopy(a, memo) for a in x]
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 223, in <listcomp>
    y = [deepcopy(a, memo) for a in x]
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 218, in _deepcopy_list
    y.append(deepcopy(a, memo))
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 182, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 297, in _reconstruct
    state = deepcopy(state, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 243, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 218, in _deepcopy_list
    y.append(deepcopy(a, memo))
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 223, in _deepcopy_tuple
    y = [deepcopy(a, memo) for a in x]
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 223, in <listcomp>
    y = [deepcopy(a, memo) for a in x]
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 243, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 182, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 297, in _reconstruct
    state = deepcopy(state, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 155, in deepcopy
    y = copier(x, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 243, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 182, in deepcopy
    y = _reconstruct(x, rv, 1, memo)
  File "/home/totolia/anaconda2/envs/keras3/lib/python3.5/copy.py", line 306, in _reconstruct
    y.__dict__.update(state)
AttributeError: 'NoneType' object has no attribute 'update'

Honestly, I don't know how to fix this problem.
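
For reference, the change described above corresponds roughly to this edit in the callback list built inside MaskRCNN.train() in mrcnn/model.py (a sketch; exact lines differ across versions):

# In mrcnn/model.py, MaskRCNN.train() builds its callbacks roughly like this:
callbacks = [
    keras.callbacks.TensorBoard(log_dir=self.log_dir, histogram_freq=0,
                                write_graph=True, write_images=False),
    # Flipping save_weights_only from True to False makes the callback call
    # model.save(), which hits the get_config()/deepcopy error shown above.
    keras.callbacks.ModelCheckpoint(self.checkpoint_path, verbose=0,
                                    save_weights_only=False),
]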

skywalkerisnull commented 5 years ago

I did the exact same thing and got the same errors. I am now trying to track down where it actually saves the model and add a check so that, on the last epoch to be trained, it saves out the entire model: the model weights, the model architecture, the compilation details (loss and metrics), and the optimizer state.

Did you have any luck in figuring it out?

adriaciurana commented 5 years ago

I tried saving the optimizer state at the same time as the model weights, using what is described here:

https://stackoverflow.com/questions/49503748/save-and-load-model-optimizer-state

Then I created a custom ModelCheckpoint that saved both: the weights in HDF5 and the optimizer state in a pickle. Unfortunately, something in the model prevents the optimizer weights from being saved correctly and that also raises errors. I don't remember the exact error because I didn't keep it, so I dropped the idea.
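
For reference, the technique from that Stack Overflow thread looks roughly like the sketch below when applied to this repo's keras_model (file names are placeholders; note that in this repo the optimizer only exists after MaskRCNN.compile() has run, which happens inside train(), which is part of why wiring this into a callback is awkward):

import pickle

keras_model = model.keras_model  # the underlying Keras model inside the MaskRCNN wrapper

# At checkpoint time (e.g. inside a custom callback): weights to HDF5,
# optimizer state (momentum buffers, iteration count, ...) to a pickle.
keras_model.save_weights("mask_rcnn_weights.h5")
with open("optimizer_state.pkl", "wb") as f:
    pickle.dump(keras_model.optimizer.get_weights(), f)

# At resume time: rebuild and compile the model first, then restore both.
keras_model.load_weights("mask_rcnn_weights.h5", by_name=True)
keras_model._make_train_function()  # forces creation of the optimizer's slot variables
with open("optimizer_state.pkl", "rb") as f:
    keras_model.optimizer.set_weights(pickle.load(f))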

skywalkerisnull commented 5 years ago

I found a slightly different solution, using the following:

import os
import tensorflow as tf
import keras.backend as K
from tensorflow.python.saved_model import signature_constants, tag_constants
import mrcnn.model as modellib

# Note: freeze_session() is an external helper (the widely shared Keras-to-frozen-PB
# recipe) and AssetConfig is a project-specific mrcnn Config subclass; neither is
# defined in this snippet.
def freeze_model(model, name, model_save_dir):
    # Freeze the Keras session that already holds the loaded weights
    frozen_graph = freeze_session(K.get_session(),
                                  output_names=[out.op.name for out in model.outputs][:4])
    tf.train.write_graph(frozen_graph, model_save_dir, name, as_text=False)
    print("*" * 80)
    print("Finished converting Keras model to frozen PB")
    print("*" * 80)

def make_serving_ready(model_save_dir, save_serve_path, version_number):
    import tensorflow as tf

    export_dir = os.path.join(save_serve_path, str(version_number))
    graph_pb = model_save_dir

    builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

    with tf.gfile.GFile(graph_pb, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    sigs = {}

    with tf.Session(graph=tf.Graph()) as sess:
        tf.import_graph_def(graph_def, name="")
        g = tf.get_default_graph()
        input_image = g.get_tensor_by_name("input_image:0")
        input_image_meta = g.get_tensor_by_name("input_image_meta:0")
        input_anchors = g.get_tensor_by_name("input_anchors:0")

        output_detection = g.get_tensor_by_name("mrcnn_detection/Reshape_1:0")
        output_mask = g.get_tensor_by_name("mrcnn_mask/Reshape_1:0")

        sigs[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = tf.saved_model.signature_def_utils.predict_signature_def(
                {"input_image": input_image, 'input_image_meta': input_image_meta, 'input_anchors': input_anchors},
                {"mrcnn_detection/Reshape_1": output_detection, 'mrcnn_mask/Reshape_1': output_mask})

        builder.add_meta_graph_and_variables(sess, [tag_constants.SERVING], signature_def_map=sigs)

    builder.save()
    print("*" * 80)
    print("FINISH CONVERTING FROZEN PB TO SERVING READY")
    print("*" * 80)

def production_ready(model_path:str, frozen_name:str, model_save_dir:str, version_number:int):
    sess = tf.Session()
    K.set_session(sess)
    config = AssetConfig()

    # LOAD MODEL
    model = modellib.MaskRCNN(mode="inference", model_dir=model_path, config=config)
    model.load_weights(model_path, by_name=True, exclude=["mrcnn_class_logits", "mrcnn_bbox_fc", "mrcnn_bbox", "mrcnn_mask"])
    # Converting keras model to PB frozen graph
    freeze_model(model.keras_model, frozen_name, model_save_dir)

    # Now convert frozen graph to Tensorflow Serving Ready
    make_serving_ready(os.path.join(model_save_dir, frozen_name), model_save_dir, version_number)

    print("Frozen Model saved here: ", model_save_dir)
    print("Serving Model saved here: ", model_save_dir)

and can be run with something along the lines of:

model_path = "D:\\Path\\To\\Training\\Folder\\assetsDATETTIME\\mask_rcnn_assets_EPOCHNUM.h5"
frozen_name = "export_model_name.pb"
model_save_dir = "D:\\Path\\To\\Freezing\\Folder"
version_number = 1

production_ready(model_path, frozen_name, model_save_dir, version_number)

This saves the model out ready to be served in the TensorFlow Serving format.
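
Note that freeze_session() above is not part of Mask_RCNN or Keras. A commonly circulated TF 1.x version (adapted from the well-known Stack Overflow recipe for exporting a Keras model to a frozen .pb graph) looks roughly like this:

import tensorflow as tf
from tensorflow.python.framework.graph_util import convert_variables_to_constants

def freeze_session(session, keep_var_names=None, output_names=None, clear_devices=True):
    """Convert all variables in the session's graph to constants and prune the graph."""
    graph = session.graph
    with graph.as_default():
        freeze_var_names = list(
            set(v.op.name for v in tf.global_variables()).difference(keep_var_names or []))
        output_names = output_names or []
        output_names += [v.op.name for v in tf.global_variables()]
        input_graph_def = graph.as_graph_def()
        if clear_devices:
            # Strip device placements so the frozen graph can be loaded anywhere
            for node in input_graph_def.node:
                node.device = ""
        return convert_variables_to_constants(
            session, input_graph_def, output_names, freeze_var_names)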

HAMZARaouia commented 5 years ago

You can always save the last weights and then restart training from them.
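
In this repo that is the usual pattern; a minimal sketch, assuming an existing config object, MODEL_DIR, and training/validation datasets (find_last() returns the latest checkpoint path in recent versions, a (dir, path) tuple in older ones):

import mrcnn.model as modellib

# Rebuild the model in training mode and resume from the most recent checkpoint.
model = modellib.MaskRCNN(mode="training", config=config, model_dir=MODEL_DIR)
model.load_weights(model.find_last(), by_name=True)  # older versions: model.find_last()[1]

# Continue training. Only layer weights are restored this way; the optimizer
# state (momentum, iteration count) starts fresh, which is the limitation
# discussed in this issue.
model.train(dataset_train, dataset_val,
            learning_rate=config.LEARNING_RATE,
            epochs=30, layers="all")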

davidvirdeforshh commented 5 years ago

I tried the freeze/export approach above and got the following error. Any idea what the reason could be?

tensorflow.python.framework.errors_impl.FailedPreconditionError: 2 root error(s) found.
  (0) Failed precondition: Attempting to use uninitialized value bn4m_branch2a/moving_mean
         [[{{node bn4m_branch2a/moving_mean}}]]
         [[res4u_branch2c/kernel/_1303]]
  (1) Failed precondition: Attempting to use uninitialized value bn4m_branch2a/moving_mean
         [[{{node bn4m_branch2a/moving_mean}}]]
0 successful operations.
0 derived errors ignored.

Full crash log:

Using TensorFlow backend.
Exporting
WARNING: Logging before flag parsing goes to stderr.
W0821 15:38:28.174986 10628 deprecation_wrapper.py:119] From .\dostuff.py:465: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

2019-08-21 15:38:28.193225: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library nvcuda.dll
2019-08-21 15:38:28.301326: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties:
name: GeForce GTX 1660 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.8
pciBusID: 0000:01:00.0
2019-08-21 15:38:28.304784: I tensorflow/stream_executor/platform/default/dlopen_checker_stub.cc:25] GPU libraries are statically linked, skip dlopen check.
2019-08-21 15:38:28.307348: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0
2019-08-21 15:38:28.309567: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX AVX2
2019-08-21 15:38:28.316871: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties:
name: GeForce GTX 1660 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.8
pciBusID: 0000:01:00.0
2019-08-21 15:38:28.320217: I tensorflow/stream_executor/platform/default/dlopen_checker_stub.cc:25] GPU libraries are statically linked, skip dlopen check.
2019-08-21 15:38:28.326736: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0
2019-08-21 15:38:28.962304: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1181] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-08-21 15:38:28.965539: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1187]      0
2019-08-21 15:38:28.967078: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1200] 0:   N
2019-08-21 15:38:28.969073: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1326] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 4637 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1660 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5)
W0821 15:38:28.977636 10628 deprecation_wrapper.py:119] From C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0821 15:38:28.981634 10628 deprecation_wrapper.py:119] From C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0821 15:38:28.987633 10628 deprecation_wrapper.py:119] From C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0821 15:38:29.027620 10628 deprecation_wrapper.py:119] From C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:1919: The name tf.nn.fused_batch_norm is deprecated. Please use tf.compat.v1.nn.fused_batch_norm instead.

W0821 15:38:29.039617 10628 deprecation_wrapper.py:119] From C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0821 15:38:31.993680 10628 deprecation_wrapper.py:119] From C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:2018: The name tf.image.resize_nearest_neighbor is deprecated. Please use tf.compat.v1.image.resize_nearest_neighbor instead.

W0821 15:38:32.483054 10628 deprecation_wrapper.py:119] From c:\users\oixx\src\parkshark\dl\mask_rcnn-master\mrcnn\model.py:341: The name tf.log is deprecated. Please use tf.math.log instead.

W0821 15:38:32.496038 10628 deprecation.py:323] From c:\users\oixx\src\parkshark\dl\mask_rcnn-master\mrcnn\model.py:399: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0821 15:38:32.505053 10628 deprecation.py:506] From c:\users\oixx\src\parkshark\dl\mask_rcnn-master\mrcnn\model.py:423: calling crop_and_resize_v1 (from tensorflow.python.ops.image_ops_impl) with box_ind is deprecated and will be removed in a future version.
Instructions for updating:
box_ind is deprecated, use box_indices instead
W0821 15:38:32.784962 10628 deprecation_wrapper.py:119] From c:\users\oixx\src\parkshark\dl\mask_rcnn-master\mrcnn\model.py:720: The name tf.sets.set_intersection is deprecated. Please use tf.sets.intersection instead.

W0821 15:38:32.900920 10628 deprecation.py:323] From c:\users\oixx\src\parkshark\dl\mask_rcnn-master\mrcnn\model.py:772: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
2019-08-21 15:38:37.857340: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties:
name: GeForce GTX 1660 Ti major: 7 minor: 5 memoryClockRate(GHz): 1.8
pciBusID: 0000:01:00.0
2019-08-21 15:38:37.864144: I tensorflow/stream_executor/platform/default/dlopen_checker_stub.cc:25] GPU libraries are statically linked, skip dlopen check.
2019-08-21 15:38:37.872470: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0
2019-08-21 15:38:37.874155: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1181] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-08-21 15:38:37.876349: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1187]      0
2019-08-21 15:38:37.877834: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1200] 0:   N
2019-08-21 15:38:37.885836: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1326] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 4637 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1660 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5)
W0821 15:38:38.957580 10628 deprecation.py:323] From .\dostuff.py:416: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
W0821 15:38:38.961585 10628 deprecation.py:323] From C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\framework\graph_util_impl.py:270: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
Traceback (most recent call last):
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\client\session.py", line 1356, in _do_call
    return fn(*args)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\client\session.py", line 1341, in _run_fn
    options, feed_dict, fetch_list, target_list, run_metadata)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\client\session.py", line 1429, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.FailedPreconditionError: 2 root error(s) found.
  (0) Failed precondition: Attempting to use uninitialized value bn4m_branch2a/moving_mean
         [[{{node bn4m_branch2a/moving_mean}}]]
         [[res4u_branch2c/kernel/_1303]]
  (1) Failed precondition: Attempting to use uninitialized value bn4m_branch2a/moving_mean
         [[{{node bn4m_branch2a/moving_mean}}]]
0 successful operations.
0 derived errors ignored.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File ".\dostuff.py", line 489, in <module>
    export()
  File ".\dostuff.py", line 487, in export
    production_ready(model_path, frozen_name, model_save_dir, version_number)
  File ".\dostuff.py", line 473, in production_ready
    freeze_model(model.keras_model, frozen_name, model_save_dir)
  File ".\dostuff.py", line 422, in freeze_model
    frozen_graph = freeze_session(tf.Session(), output_names=[out.op.name for out in model.outputs][:4])
  File ".\dostuff.py", line 416, in freeze_session
    output_names, freeze_var_names)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\util\deprecation.py", line 324, in new_func
    return func(*args, **kwargs)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\framework\graph_util_impl.py", line 307, in convert_variables_to_constants
    returned_variables = sess.run(variable_names)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\client\session.py", line 950, in run
    run_metadata_ptr)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\client\session.py", line 1173, in _run
    feed_dict_tensor, options, run_metadata)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\client\session.py", line 1350, in _do_run
    run_metadata)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\client\session.py", line 1370, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.FailedPreconditionError: 2 root error(s) found.
  (0) Failed precondition: Attempting to use uninitialized value bn4m_branch2a/moving_mean
         [[node bn4m_branch2a/moving_mean (defined at C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:402) ]]
         [[res4u_branch2c/kernel/_1303]]
  (1) Failed precondition: Attempting to use uninitialized value bn4m_branch2a/moving_mean
         [[node bn4m_branch2a/moving_mean (defined at C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:402) ]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'bn4m_branch2a/moving_mean':
  File ".\dostuff.py", line 489, in <module>
    export()
  File ".\dostuff.py", line 487, in export
    production_ready(model_path, frozen_name, model_save_dir, version_number)
  File ".\dostuff.py", line 470, in production_ready
    model = MaskRCNN(mode="inference", model_dir=model_path, config=config)
  File "c:\users\oixx\src\parkshark\dl\mask_rcnn-master\mrcnn\model.py", line 1837, in __init__
    self.keras_model = self.build(mode=mode, config=config)
  File "c:\users\oixx\src\parkshark\dl\mask_rcnn-master\mrcnn\model.py", line 1901, in build
    stage5=True, train_bn=config.TRAIN_BN)
  File "c:\users\oixx\src\parkshark\dl\mask_rcnn-master\mrcnn\model.py", line 197, in resnet_graph
    x = identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i), train_bn=train_bn)
  File "c:\users\oixx\src\parkshark\dl\mask_rcnn-master\mrcnn\model.py", line 113, in identity_block
    x = BatchNorm(name=bn_name_base + '2a')(x, training=train_bn)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\engine\base_layer.py", line 431, in __call__
    self.build(unpack_singleton(input_shapes))
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\layers\normalization.py", line 124, in build
    trainable=False)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\engine\base_layer.py", line 252, in add_weight
    constraint=constraint)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py", line 402, in variable
    v = tf.Variable(value, dtype=tf.as_dtype(dtype), name=name)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\ops\variables.py", line 259, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\ops\variables.py", line 220, in _variable_v1_call
    shape=shape)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\ops\variables.py", line 198, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2511, in default_variable_creator
    shape=shape)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\ops\variables.py", line 263, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\ops\variables.py", line 1568, in __init__
    shape=shape)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\ops\variables.py", line 1728, in _init_from_args
    name=name)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\ops\state_ops.py", line 79, in variable_op_v2
    shared_name=shared_name)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\ops\gen_state_ops.py", line 1609, in variable_v2
    shared_name=shared_name, name=name)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\framework\ops.py", line 3616, in create_op
    op_def=op_def)
  File "C:\Users\oixx\.conda\envs\keras-gpu\lib\site-packages\tensorflow\python\framework\ops.py", line 2005, in __init__
    self._traceback = tf_stack.extract_stack()
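
The "Attempting to use uninitialized value" errors above usually mean the graph is being frozen in a session other than the one where the weights were loaded; the traceback shows freeze_session(tf.Session(), ...), i.e. a brand-new session. A hedged fix is to reuse the Keras session instead:

import keras.backend as K

# Freeze the session Keras is already using (the one set via K.set_session()
# in production_ready(), where load_weights() ran), not a fresh tf.Session().
frozen_graph = freeze_session(K.get_session(),
                              output_names=[out.op.name for out in model.outputs][:4])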

little-ron commented 4 years ago

(quotes skywalkerisnull's freeze_model / make_serving_ready / production_ready solution above in full)

Where are AssetConfig() and freeze_session() from?