markovmodel / deeptime

Deep learning meets molecular dynamics.
GNU Lesser General Public License v3.0
173 stars 39 forks source link

vamp._loss_VAMP_sym error #20

Closed euhruska closed 5 years ago

euhruska commented 6 years ago

When using vamp._loss_VAMP_sym, in the following call:

hist = model.fit_generator(generator = vamp_data_loader.build_generator_on_source(train_data_source,
                                                      batch_size,
                                                      tau,
                                                      output_size),
                           steps_per_epoch = steps_per_train_epoch,
                           epochs = nb_epoch,
                           verbose = 0,
                           validation_data = vamp_data_loader.build_generator_on_source(valid_data_source,
                                                            batch_size,
                                                            tau,
                                                            output_size),
                           validation_steps = steps_per_valid_epoch,
                           shuffle = True
                          )

I get a fatal error:

---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-54-480bb4a81ec5> in <module>()
     28                                                                     output_size),
     29                                    validation_steps = steps_per_valid_epoch,
---> 30                                    shuffle = True
     31                                   )
     32 

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name +
     90                               '` call to the Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   1413             use_multiprocessing=use_multiprocessing,
   1414             shuffle=shuffle,
-> 1415             initial_epoch=initial_epoch)
   1416 
   1417     @interfaces.legacy_generator_methods_support

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
    228                             val_enqueuer_gen,
    229                             validation_steps,
--> 230                             workers=0)
    231                     else:
    232                         # No need for try/except because

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name +
     90                               '` call to the Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/engine/training.py in evaluate_generator(self, generator, steps, max_queue_size, workers, use_multiprocessing, verbose)
   1467             workers=workers,
   1468             use_multiprocessing=use_multiprocessing,
-> 1469             verbose=verbose)
   1470 
   1471     @interfaces.legacy_generator_methods_support

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/engine/training_generator.py in evaluate_generator(model, generator, steps, max_queue_size, workers, use_multiprocessing, verbose)
    341                                  'or (x, y). Found: ' +
    342                                  str(generator_output))
--> 343             outs = model.test_on_batch(x, y, sample_weight=sample_weight)
    344             outs = to_list(outs)
    345             outs_per_batch.append(outs)

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/engine/training.py in test_on_batch(self, x, y, sample_weight)
   1252             ins = x + y + sample_weights
   1253         self._make_test_function()
-> 1254         outputs = self.test_function(ins)
   1255         return unpack_singleton(outputs)
   1256 

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
   2664                 return self._legacy_call(inputs)
   2665 
-> 2666             return self._call(inputs)
   2667         else:
   2668             if py_any(is_tensor(x) for x in inputs):

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py in _call(self, inputs)
   2634                                 symbol_vals,
   2635                                 session)
-> 2636         fetched = self._callable_fn(*array_vals)
   2637         return fetched[:len(self.outputs)]
   2638 

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/tensorflow/python/client/session.py in __call__(self, *args, **kwargs)
   1380           ret = tf_session.TF_SessionRunCallable(
   1381               self._session._session, self._handle, args, status,
-> 1382               run_metadata_ptr)
   1383         if run_metadata:
   1384           proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
    517             None, None,
    518             compat.as_text(c_api.TF_Message(self.status.status)),
--> 519             c_api.TF_GetCode(self.status.status))
    520     # Delete the underlying status object from memory otherwise it stays alive
    521     # as there is a reference to status from this from the traceback due to

InvalidArgumentError: Got info = 2 for batch index 0, expected info = 0. Debug_info = heevd
   [[Node: metrics_4/metric_VAMP/SelfAdjointEigV2 = SelfAdjointEigV2[T=DT_FLOAT, compute_v=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](loss_4/concatenate_1_loss/mul_3)]]
   [[Node: loss_4/mul/_603 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_450_loss_4/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
pasqualil commented 6 years ago

I don't get this error. Which TensorFlow version are you using? This issue sometimes appeared on TF 1.7-1.8.

euhruska commented 6 years ago

tf version 1.10.0

amardt commented 6 years ago

For us, this works with this version of TensorFlow; we are unable to reproduce the error. Are you working on a GPU? (Perhaps the CUDA version also plays a role...) Can you tell us your CUDA version and, if applicable, which cuDNN version you are using? Thanks, Andreas

clonker commented 6 years ago

It might also be that it failed to compute the SVD for the first batch, so there could be a problem with your data. The documentation of heevd (which is an eigenvalue/eigenvector solver) says:

if INFO = i and JOBZ = 'N', then the algorithm failed
                to converge; i off-diagonal elements of an intermediate
                tridiagonal form did not converge to zero;
if INFO = i and JOBZ = 'V', then the algorithm failed
                to compute an eigenvalue while working on the submatrix
                lying in rows and columns INFO/(N+1) through
                mod(INFO,N+1).