Rayhane-mamah / Tacotron-2

DeepMind's Tacotron-2 Tensorflow implementation
MIT License
2.27k stars 905 forks source link

ExponentialMovingAverage not found in checkpoint #163

Open fire-python opened 6 years ago

fire-python commented 6 years ago

Hi, has anyone else had this exception thrown, like the one described in the following stack trace:

Constructing model: WaveNet Initializing Wavenet model. Dimensions (? = dynamic shape): Train mode: False Eval mode: False Synthesis mode: True local_condition: (1, 80, ?) outputs: (?,) Loading checkpoint: logs-WaveNet/wave_pretrained/wavenet_model.ckpt-5000 Traceback (most recent call last): File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1322, in _do_call return fn(*args) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1307, in _run_fn options, feed_dict, fetch_list, target_list, run_metadata) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1409, in _call_tf_sessionrun run_metadata) tensorflow.python.framework.errors_impl.NotFoundError: Key model_1/model/ResidualConv1dGLU_0/residual_block_cin_conv/bias_residual_block_cin_conv/ExponentialMovingAverage not found in

checkpoint [[Node: model_1/save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT],

_device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_model_1/save/Const_0_0, model_1/save/RestoreV2/tensor_names, model_1/save/RestoreV2/shape_and_slices)]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "synthesize.py", line 91, in main() File "synthesize.py", line 85, in main synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences) File "synthesize.py", line 34, in synthesize wavenet_synthesize(args, hparams, wave_checkpoint) File "/root/Tacotron-2/wavenet_vocoder/synthesize.py", line 78, in wavenet_synthesize run_synthesis(args, checkpoint_path, output_dir, hparams) File "/root/Tacotron-2/wavenet_vocoder/synthesize.py", line 19, in run_synthesis synth.load(checkpoint_path, hparams) File "/root/Tacotron-2/wavenet_vocoder/synthesizer.py", line 32, in load load_averaged_model(self.session, sh_saver, checkpoint_path) File "/root/Tacotron-2/wavenet_vocoder/train.py", line 56, in load_averaged_model sh_saver.restore(sess, checkpoint_path) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 1802, in restore {self.saver_def.filename_tensor_name: save_path}) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 900, in run run_metadata_ptr) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1135, in _run feed_dict_tensor, options, run_metadata) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1316, in _do_run run_metadata) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1335, in _do_call raise type(e)(node_def, op, message) tensorflow.python.framework.errors_impl.NotFoundError: Key model_1/model/ResidualConv1dGLU_0/residual_block_cin_conv/bias_residual_block_cin_conv/ExponentialMovingAverage not found in

checkpoint [[Node: model_1/save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT],

_device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_model_1/save/Const_0_0, model_1/save/RestoreV2/tensor_names, model_1/save/RestoreV2/shape_and_slices)]]

Caused by op 'model_1/save/RestoreV2', defined at: File "synthesize.py", line 91, in main() File "synthesize.py", line 85, in main synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences) File "synthesize.py", line 34, in synthesize wavenet_synthesize(args, hparams, wave_checkpoint) File "/root/Tacotron-2/wavenet_vocoder/synthesize.py", line 78, in wavenet_synthesize run_synthesis(args, checkpoint_path, output_dir, hparams) File "/root/Tacotron-2/wavenet_vocoder/synthesize.py", line 19, in run_synthesis synth.load(checkpoint_path, hparams) File "/root/Tacotron-2/wavenet_vocoder/synthesizer.py", line 27, in load sh_saver = create_shadow_saver(self.model) File "/root/Tacotron-2/wavenet_vocoder/train.py", line 53, in create_shadow_saver return tf.train.Saver(shadow_dict, max_to_keep=5) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 1338, in init self.build() File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 1347, in build self._build(self._filename, build_save=True, build_restore=True) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 1384, in _build build_save=build_save, build_restore=build_restore) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 835, in _build_internal restore_sequentially, reshape) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 472, in _AddRestoreOps restore_sequentially) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 886, in bulk_restore return io_ops.restore_v2(filename_tensor, names, slices, dtypes) File "/usr/lib/python3.4/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1463, in restore_v2 shape_and_slices=shape_and_slices, dtypes=dtypes, name=name) File "/usr/lib/python3.4/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper op_def=op_def) File 
"/usr/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op op_def=op_def) File "/usr/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 1718, in init self._traceback = self._graph._extract_stack() # pylint: disable=protected-access

NotFoundError (see above for traceback): Key model_1/model/ResidualConv1dGLU_0/residual_block_cin_conv/bias_residual_block_cin_conv/ExponentialMovingAverage not found in checkpoint [[Node: model_1/save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT],

_device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_model_1/save/Const_0_0, model_1/save/RestoreV2/tensor_names, model_1/save/RestoreV2/shape_and_slices)]]

Rayhane-mamah commented 6 years ago

Hello @fire-python,

"model_1" bugs usually happen when loading WaveNet while another model is still in GPU memory. Could you please start by verifying that you are using the latest version? Then please describe the exact steps to reproduce your error. (I might have left some use cases where this bug happens.)

Thanks :)

fire-python commented 6 years ago

@Rayhane-mamah, I'm using an intermediate model — as you can see, this is the step-5000 WaveNet model checkpoint. I just invoke the command "python3 synthesize.py --model=Both --wavenet_name=WaveNet". I suppose maybe you're right, and it's because the training is still going. I'll wait until the training is finished to see. Thanks

fire-python commented 6 years ago

@Rayhane-mamah, to have a quick look at the result of the WaveNet training stage, I reduced the training steps to 10000, but the crash seems the same as above. Any idea why? See the stack trace below:

Constructing model: WaveNet Initializing Wavenet model. Dimensions (? = dynamic shape): Train mode: False Eval mode: False Synthesis mode: True local_condition: (1, 80, ?) outputs: (?,) Loading checkpoint: logs-WaveNet/wave_pretrained/wavenet_model.ckpt-10000 Traceback (most recent call last): File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1322, in _do_call return fn(*args) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1307, in _run_fn options, feed_dict, fetch_list, target_list, run_metadata) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1409, in _call_tf_sessionrun run_metadata) tensorflow.python.framework.errors_impl.NotFoundError: Key model_1/model/ResidualConv1dGLU_0/residual_block_cin_conv/bias_residual_block_cin_conv/ExponentialMovingAverage not found in checkpoint [[Node: model_1/save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_model_1/save/Const_0_0, model_1/save/RestoreV2/tensor_names, model_1/save/RestoreV2/shape_and_slices)]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "synthesize.py", line 91, in main() File "synthesize.py", line 85, in main synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences) File "synthesize.py", line 34, in synthesize wavenet_synthesize(args, hparams, wave_checkpoint) File "/root/Tacotron-2/wavenet_vocoder/synthesize.py", line 78, in wavenet_synthesize run_synthesis(args, checkpoint_path, output_dir, hparams) File "/root/Tacotron-2/wavenet_vocoder/synthesize.py", line 19, in run_synthesis synth.load(checkpoint_path, hparams) File "/root/Tacotron-2/wavenet_vocoder/synthesizer.py", line 32, in load load_averaged_model(self.session, sh_saver, checkpoint_path) File "/root/Tacotron-2/wavenet_vocoder/train.py", line 56, in load_averaged_model sh_saver.restore(sess, checkpoint_path) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 1802, in restore {self.saver_def.filename_tensor_name: save_path}) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 900, in run run_metadata_ptr) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1135, in _run feed_dict_tensor, options, run_metadata) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1316, in _do_run run_metadata) File "/usr/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1335, in _do_call raise type(e)(node_def, op, message) tensorflow.python.framework.errors_impl.NotFoundError: Key model_1/model/ResidualConv1dGLU_0/residual_block_cin_conv/bias_residual_block_cin_conv/ExponentialMovingAverage not found in checkpoint [[Node: model_1/save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_model_1/save/Const_0_0, model_1/save/RestoreV2/tensor_names, model_1/save/RestoreV2/shape_and_slices)]]

Caused by op 'model_1/save/RestoreV2', defined at: File "synthesize.py", line 91, in main() File "synthesize.py", line 85, in main synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences) File "synthesize.py", line 34, in synthesize wavenet_synthesize(args, hparams, wave_checkpoint) File "/root/Tacotron-2/wavenet_vocoder/synthesize.py", line 78, in wavenet_synthesize run_synthesis(args, checkpoint_path, output_dir, hparams) File "/root/Tacotron-2/wavenet_vocoder/synthesize.py", line 19, in run_synthesis synth.load(checkpoint_path, hparams) File "/root/Tacotron-2/wavenet_vocoder/synthesizer.py", line 27, in load sh_saver = create_shadow_saver(self.model) File "/root/Tacotron-2/wavenet_vocoder/train.py", line 53, in create_shadow_saver return tf.train.Saver(shadow_dict, max_to_keep=5) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 1338, in init self.build() File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 1347, in build self._build(self._filename, build_save=True, build_restore=True) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 1384, in _build build_save=build_save, build_restore=build_restore) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 835, in _build_internal restore_sequentially, reshape) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 472, in _AddRestoreOps restore_sequentially) File "/usr/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 886, in bulk_restore return io_ops.restore_v2(filename_tensor, names, slices, dtypes) File "/usr/lib/python3.4/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1463, in restore_v2 shape_and_slices=shape_and_slices, dtypes=dtypes, name=name) File "/usr/lib/python3.4/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper op_def=op_def) File 
"/usr/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op op_def=op_def) File "/usr/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 1718, in init self._traceback = self._graph._extract_stack() # pylint: disable=protected-access

NotFoundError (see above for traceback): Key model_1/model/ResidualConv1dGLU_0/residual_block_cin_conv/bias_residual_block_cin_conv/ExponentialMovingAverage not found in checkpoint [[Node: model_1/save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, ..., DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_model_1/save/Const_0_0, model_1/save/RestoreV2/tensor_names, model_1/save/RestoreV2/shape_and_slices)]]

Rayhane-mamah commented 6 years ago

All right, in that case: you said in a previous comment you use --model=Both which is no longer supported in our current code (it was causing multiple problems for people).

Thus I believe your code isn't up to date, specifically these lines: https://github.com/Rayhane-mamah/Tacotron-2/blob/d13dbba16f0a434843916b5a8647a42fe34544f5/synthesize.py#L32-L40

I assume you're missing the graph-reset part (to remove the Tacotron graph from memory after synthesizing with it, prior to loading WaveNet into memory). That should solve it. (Both Tacotron and WaveNet share the "model" upper scope; that's the source of the problem.)

fire-python commented 6 years ago

@Rayhane-mamah, sorry to have to bother you again — I'm new to TensorFlow and WaveNet. I have now updated your code to HEAD, but I hit a new crash. Could you please take a look? Thanks

Initializing Wavenet model. Dimensions (? = dynamic shape): Train mode: True Eval mode: False Synthesis mode: False Traceback (most recent call last): File "/usr/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 1567, in _create_c_op c_op = c_api.TF_FinishOperation(op_desc) tensorflow.python.framework.errors_impl.InvalidArgumentError: Can not squeeze dim[1], expected a dimension of 1, got 300 for 'model_1/inference/Squeeze' (op: 'Squeeze') with input shapes: [?,300,1,?].

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "train.py", line 133, in main() File "train.py", line 127, in main train(args, log_dir, hparams) File "train.py", line 80, in train checkpoint = wavenet_train(args, log_dir, hparams, input_path) File "/root/Tacotron-2/wavenet_vocoder/train.py", line 251, in wavenet_train return train(log_dir, args, hparams, input_path) File "/root/Tacotron-2/wavenet_vocoder/train.py", line 175, in train model, stats = model_train_mode(args, feeder, hparams, global_step) File "/root/Tacotron-2/wavenet_vocoder/train.py", line 123, in model_train_mode feeder.input_lengths, x=feeder.inputs) File "/root/Tacotron-2/wavenet_vocoder/models/wavenet.py", line 178, in initialize y_hat = self.step(x, c, g, softmax=False) #softmax is automatically computed inside softmax_cross_entropy if needed File "/root/Tacotron-2/wavenet_vocoder/models/wavenet.py", line 475, in step c = tf.squeeze(c, [1]) File "/usr/lib/python3.4/site-packages/tensorflow/python/ops/array_ops.py", line 2630, in squeeze return gen_array_ops.squeeze(input, axis, name) File "/usr/lib/python3.4/site-packages/tensorflow/python/ops/gen_array_ops.py", line 7862, in squeeze "Squeeze", input=input, squeeze_dims=axis, name=name) File "/usr/lib/python3.4/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper op_def=op_def) File "/usr/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op op_def=op_def) File "/usr/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 1734, in init control_input_ops) File "/usr/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 1570, in _create_c_op raise ValueError(str(e)) ValueError: Can not squeeze dim[1], expected a dimension of 1, got 300 for 'model_1/inference/Squeeze' (op: 'Squeeze') with input shapes: [?,300,1,?].