pierluigiferrari / ssd_keras

A Keras port of Single Shot MultiBox Detector
Apache License 2.0
1.86k stars 934 forks source link

InvalidArgumentError when compiling model with ssd_loss #372

Closed jessicametzger closed 3 years ago

jessicametzger commented 3 years ago

I am trying to train the ssd300 model, but it throws an error when I compile it with the ssd_loss function. From a Jupyter notebook, I am calling functions I wrote that construct the model, load the weights, and compile it. All model parameters have stayed the same.

System information:

Here is the full stack trace:

---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-1-0d76259d0f8c> in <module>
     13 
     14 # create model with this weights path
---> 15 test(weights_path)

~/track_label_tool/scripts/test_model_compilation.py in test(weights_path)
     64     adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
     65     ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
---> 66     model.compile(optimizer=adam, loss=ssd_loss.compute_loss)
     67 

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py in compile(self, optimizer, loss, metrics, loss_weights, sample_weight_mode, weighted_metrics, **kwargs)
    327 
    328       self.optimizer = self._get_optimizer(optimizer)
--> 329       self.compiled_loss = compile_utils.LossesContainer(
    330           loss, loss_weights, output_names=self.output_names)
    331       self.compiled_metrics = compile_utils.MetricsContainer(

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py in __init__(self, losses, loss_weights, output_names)
    118     self._loss_weights = loss_weights
    119     self._per_output_metrics = None  # Per-output losses become metrics.
--> 120     self._loss_metric = metrics_mod.Mean(name='loss')  # Total loss.
    121     self._built = False
    122 

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py in __init__(self, name, dtype)
    468       dtype: (Optional) data type of the metric result.
    469     """
--> 470     super(Mean, self).__init__(
    471         reduction=metrics_utils.Reduction.WEIGHTED_MEAN, name=name, dtype=dtype)
    472 

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py in __init__(self, reduction, name, dtype)
    305     self.reduction = reduction
    306     with ops.init_scope():
--> 307       self.total = self.add_weight(
    308           'total', initializer=init_ops.zeros_initializer)
    309       if reduction in [metrics_utils.Reduction.SUM_OVER_BATCH_SIZE,

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/keras/metrics.py in add_weight(self, name, shape, aggregation, synchronization, initializer, dtype)
    274       synchronization = tf_variables.VariableSynchronization.ON_WRITE
    275 
--> 276     return super(Metric, self).add_weight(
    277         name=name,
    278         shape=shape,

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py in add_weight(self, name, shape, dtype, initializer, regularizer, trainable, constraint, partitioner, use_resource, synchronization, aggregation, **kwargs)
    558         caching_device = None
    559 
--> 560     variable = self._add_variable_with_custom_getter(
    561         name=name,
    562         shape=shape,

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/training/tracking/base.py in _add_variable_with_custom_getter(self, name, shape, dtype, initializer, getter, overwrite, **kwargs_for_getter)
    736         initializer = checkpoint_initializer
    737         shape = None
--> 738     new_variable = getter(
    739         name=name,
    740         shape=shape,

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer_utils.py in make_variable(name, shape, dtype, initializer, trainable, caching_device, validate_shape, constraint, use_resource, collections, synchronization, aggregation, partitioner)
    127   # can remove the V1.
    128   variable_shape = tensor_shape.TensorShape(shape)
--> 129   return tf_variables.VariableV1(
    130       initial_value=init_val,
    131       name=name,

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/variables.py in __call__(cls, *args, **kwargs)
    257   def __call__(cls, *args, **kwargs):
    258     if cls is VariableV1:
--> 259       return cls._variable_v1_call(*args, **kwargs)
    260     elif cls is Variable:
    261       return cls._variable_v2_call(*args, **kwargs)

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/variables.py in _variable_v1_call(cls, initial_value, trainable, collections, validate_shape, caching_device, name, variable_def, dtype, expected_shape, import_scope, constraint, use_resource, synchronization, aggregation, shape)
    203     if aggregation is None:
    204       aggregation = VariableAggregation.NONE
--> 205     return previous_getter(
    206         initial_value=initial_value,
    207         trainable=trainable,

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/variables.py in getter(**kwargs)
     64 
     65   def getter(**kwargs):
---> 66     return captured_getter(captured_previous, **kwargs)
     67 
     68   return getter

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py in creator(next_creator, **kwargs)
   2560     def creator(next_creator, **kwargs):
   2561       _require_strategy_scope_strategy(strategy)
-> 2562       return next_creator(**kwargs)
   2563 
   2564     self._var_creator_scope = variable_scope.variable_creator_scope(creator)

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/variables.py in <lambda>(**kwargs)
    196                         shape=None):
    197     """Call on Variable class. Useful to force the signature."""
--> 198     previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
    199     for _, getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
    200       previous_getter = _make_getter(getter, previous_getter)

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/variable_scope.py in default_variable_creator(next_creator, **kwargs)
   2582   if use_resource:
   2583     distribute_strategy = kwargs.get("distribute_strategy", None)
-> 2584     return resource_variable_ops.ResourceVariable(
   2585         initial_value=initial_value,
   2586         trainable=trainable,

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/variables.py in __call__(cls, *args, **kwargs)
    261       return cls._variable_v2_call(*args, **kwargs)
    262     else:
--> 263       return super(VariableMetaclass, cls).__call__(*args, **kwargs)
    264 
    265 

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/resource_variable_ops.py in __init__(self, initial_value, trainable, collections, validate_shape, caching_device, name, dtype, variable_def, import_scope, constraint, distribute_strategy, synchronization, aggregation, shape)
   1421       self._init_from_proto(variable_def, import_scope=import_scope)
   1422     else:
-> 1423       self._init_from_args(
   1424           initial_value=initial_value,
   1425           trainable=trainable,

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/resource_variable_ops.py in _init_from_args(self, initial_value, trainable, collections, caching_device, name, dtype, constraint, synchronization, aggregation, distribute_strategy, shape)
   1575           else:
   1576             shape = initial_value.shape
-> 1577           handle = eager_safe_variable_handle(
   1578               initial_value=initial_value,
   1579               shape=shape,

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/resource_variable_ops.py in eager_safe_variable_handle(initial_value, shape, shared_name, name, graph_mode)
    240   """
    241   dtype = initial_value.dtype.base_dtype
--> 242   return _variable_handle_from_shape_and_dtype(
    243       shape, dtype, shared_name, name, graph_mode, initial_value)
    244 

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/resource_variable_ops.py in _variable_handle_from_shape_and_dtype(shape, dtype, shared_name, name, graph_mode, initial_value)
    172     # compatible with ASYNC execution mode. Further, since not all devices
    173     # support string tensors, we encode the assertion string in the Op name
--> 174     gen_logging_ops._assert(  # pylint: disable=protected-access
    175         math_ops.logical_not(exists), [exists], name="EagerVariableNameReuse")
    176 

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/ops/gen_logging_ops.py in _assert(condition, data, summarize, name)
     53         pass  # Add nodes to the TensorFlow graph.
     54     except _core._NotOkStatusException as e:
---> 55       _ops.raise_from_not_ok_status(e, name)
     56   # Add nodes to the TensorFlow graph.
     57   if summarize is None:

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/tensorflow/python/framework/ops.py in raise_from_not_ok_status(e, name)
   6651   message = e.message + (" name: " + name if name is not None else "")
   6652   # pylint: disable=protected-access
-> 6653   six.raise_from(core._status_to_exception(e.code, message), None)
   6654   # pylint: enable=protected-access
   6655 

~/anaconda3/envs/tlt3/lib/python3.8/site-packages/six.py in raise_from(value, from_value)

InvalidArgumentError: assertion failed: [0] [Op:Assert] name: EagerVariableNameReuse

where the file test_model_compilation.py can be found here: test_model_compilation.zip. Running the code from a jupyter notebook should reproduce this error.

Strangely, running it from a Python file throws a different error first: when running with tf.device('CPU:0'), CUDA throws a memory error. That's not ssd_keras-specific, so I won't include it here, but because of it I can't say whether the above error is also thrown outside Jupyter notebooks.

I am at a loss for why this is happening and any help would be greatly appreciated. Thanks.

jessicametzger commented 3 years ago

Actually, I was able to avoid both this issue and the GPU memory issue by setting

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

to switch to the CPU (or "0" to switch to the GPU), instead of using tf.device(). I still have no idea why one approach works and the other doesn't for something like compiling the model, but this works just as well for me.