CRF layer loss with multi_gpu_model

louis925 commented 5 years ago

I am trying to use CRF layer with multiple GPU training via

from keras.utils import multi_gpu_model
from keras_contrib.losses import  crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy

# `model` ends with the CRF layer; wrap it for training on 2 GPUs.
parallel_model = multi_gpu_model(model, gpus=2)
parallel_model.compile(
    loss=crf_loss,
    optimizer='adam',
    metrics=[crf_viterbi_accuracy],
)

but I got an error when compiling it:

/usr/local/lib/python3.5/dist-packages/keras_contrib-2.0.8-py3.5.egg/keras_contrib/losses/crf_losses.py in crf_loss(y_true, y_pred)
     53     """
     54     crf, idx = y_pred._keras_history[:2]
---> 55     if crf.learn_mode == 'join':
     56         return crf_nll(y_true, y_pred)
     57     else:

AttributeError: 'Concatenate' object has no attribute 'learn_mode'

I guess this is because multi_gpu_model builds a new model that concatenates the outputs from the multiple GPUs. Is there a way to fix this?

gabrieldemarmiesse commented 5 years ago

Using the private variable _keras_history is an ugly hack, and it's not surprising that it doesn't work here. We need to find a solution for CRF because we can't keep this layer/loss here in its current form in the long run.

helpmefindaname commented 5 years ago

I recreated the loss function for joined mode:

def create_joined_crf_loss(crf):
    """Build a Keras-style loss function bound to one specific CRF layer.

    The returned function does not inspect ``y_pred`` at all; it reads the
    layer's own ``input`` and ``input_mask`` attributes instead, so it does
    not depend on which layer produced the tensor handed to the loss.

    Args:
        crf: The CRF layer object. It must expose ``input``, ``input_mask``
            and ``get_negative_log_likelihood(y_true, X, mask)``.

    Returns:
        A callable ``loss(y_true, y_pred)`` that returns the value of
        ``crf.get_negative_log_likelihood``.
    """
    def loss(y_true, y_pred):
        # y_pred is deliberately unused: everything is taken from `crf`.
        # y_true is forwarded unchanged (no conversion of sparse targets).
        X = crf.input
        mask = crf.input_mask
        return crf.get_negative_log_likelihood(y_true, X, mask)

    return loss

You can use it like this:

# Keep a handle on the layer object: the loss factory closes over it.
crf = CRF(units, learn_mode="join")
out = crf(x)
# model = Model(..., outputs = [out])
parallel_model = multi_gpu_model(model, gpus=2)
# NOTE: later in this thread crf_viterbi_accuracy is reported to raise an
# AttributeError on the multi-GPU model as well.
parallel_model.compile(
    loss=create_joined_crf_loss(crf),
    optimizer='adam',
    metrics=[crf_viterbi_accuracy],
)

vForce825 commented 5 years ago

I recreated the loss function for joined mode:

    def loss(y_true, y_pred):
        offset = 0

        X = crf.input
        mask = crf.input_mask
        nloglik = crf.get_negative_log_likelihood(y_true, X, mask)
        return nloglik

    return loss

You can use it like this:

# Create the CRF layer in "join" learn mode and keep a handle to it.
crf = CRF(units, learn_mode="join")
out = crf(x)
# model = Model(..., outputs = [out])
# Wrap the model for 2 GPUs; the loss is built from the `crf` object itself.
parallel_model = multi_gpu_model(model, gpus=2)
parallel_model.compile(loss=create_joined_crf_loss(crf), optimizer='adam', metrics=[crf_viterbi_accuracy])

It seems the loss function has two return lines and the usage example doesn't contain this function. Is it a mistake?

helpmefindaname commented 5 years ago

@vForce825 for some reason the first line was disguised, I hope it makes more sense now

maochen commented 5 years ago

@helpmefindaname thanks for the code; however, I ran into another error after using the snippet.

 File "/usr/local/lib/python3.6/dist-packages/keras_contrib/metrics/crf_accuracies.py", line 24, in crf_viterbi_accuracy
    y_pred = crf.viterbi_decoding(X, mask)
AttributeError: 'Concatenate' object has no attribute 'viterbi_decoding'

helpmefindaname commented 5 years ago

Oh right, you can either leave out the metrics=[crf_viterbi_accuracy] part or create another function like the first one.

from keras_contrib.metrics.crf_accuracies import _get_accuracy
def create_joined_crf_accuracy(crf):
    """Build a Keras-style accuracy metric bound to one specific CRF layer.

    The returned metric ignores the ``y_pred`` tensor it is given. It
    decodes predictions with ``crf.viterbi_decoding`` from the layer's own
    ``input`` and ``input_mask`` and scores them with ``_get_accuracy``.

    Args:
        crf: The CRF layer object. It must expose ``input``, ``input_mask``,
            ``viterbi_decoding(X, mask)`` and ``sparse_target``.

    Returns:
        A callable ``accuracy(y_true, y_pred)``.
    """
    def accuracy(y_true, y_pred):
        layer_input, layer_mask = crf.input, crf.input_mask
        # Decode from the layer's own input instead of the tensor passed in.
        decoded = crf.viterbi_decoding(layer_input, layer_mask)
        return _get_accuracy(y_true, decoded, layer_mask, crf.sparse_target)

    return accuracy

Vicky-Meng commented 5 years ago

Oh right, you can either leave out the metrics=[crf_viterbi_accuracy] part or create another function like the first one.

from keras_contrib.metrics.crf_accuracies import _get_accuracy
def create_joined_crf_accuracy(crf):
    """Return an accuracy metric that closes over the given CRF layer.

    The inner function ignores the ``y_pred`` it receives: it recomputes
    predictions with ``crf.viterbi_decoding`` on ``crf.input`` and
    ``crf.input_mask`` and passes them to ``_get_accuracy``.
    """
    def accuracy(y_true, y_pred):        
        X = crf.input
        mask = crf.input_mask
        # Rebinds y_pred to the Viterbi-decoded output of the CRF layer.
        y_pred = crf.viterbi_decoding(X, mask)
        return _get_accuracy(y_true, y_pred, mask, crf.sparse_target)

    return accuracy

Hi, I implemented your loss function and accuracy function, but got another error (training on multiple GPUs):

Traceback (most recent call last): File "main_jarvis_bigru_crf.py", line 141, in callbacks=callbacks) File "/usr/local/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper return func(*args, *kwargs) File "/usr/local/lib/python3.6/site-packages/keras/engine/training.py", line 1418, in fit_generator initial_epoch=initial_epoch) File "/usr/local/lib/python3.6/site-packages/keras/engine/training_generator.py", line 217, in fit_generator class_weight=class_weight) File "/usr/local/lib/python3.6/site-packages/keras/engine/training.py", line 1217, in train_on_batch outputs = self.train_function(ins) File "/usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2715, in call return self._call(inputs) File "/usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2675, in _call fetched = self._callable_fn(array_vals) File "/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1439, in call run_metadata_ptr) File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 528, in exit c_api.TF_GetCode(self.status.status)) tensorflow.python.framework.errors_impl.InvalidArgumentError: Matrix size-incompatible: In[0]: [18816,1], In[1]: [5,5] [[{{node loss/crf_1_loss/MatMul_1}} = MatMul[T=DT_FLOAT, _class=["loc:@training/Adam/gradients/loss/crf_1_loss/MatMul_1_grad/MatMul_1"], transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](loss/crf_1_loss/Reshape_3, crf_1/chain_kernel/read)]] [[{{node metrics/accuracy/while_1/Exit_2/_231}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_2948_metrics/accuracy/while_1/Exit_2", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

could use some help. Thanks a loooot.

helpmefindaname commented 5 years ago

Hi, could you provide a minimal code sample that produces this error?

pinesnow72 commented 4 years ago

@helpmefindaname Your suggested code works but does not seem to use multiple GPUs. I have 2 GPUs and checked GPU usage during training with your code. Only the first GPU is working and the second one stays idle. If I replace the CRF layer with a Dense layer, then both GPUs are working, and if I set the CRF's learn_mode to 'marginal' then both GPUs are working too. But if I set the CRF's learn_mode to 'join' then only the first GPU is working. What could be the problem? Any idea how to solve it?

The following is the modified code for computing the loss and accuracy. I slightly changed @helpmefindaname's code so that it also works in the single-GPU case.

from keras import backend as K
from keras.losses import categorical_crossentropy, sparse_categorical_crossentropy
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy, crf_viterbi_accuracy, crf_marginal_accuracy
from keras_contrib.metrics.crf_accuracies import _get_accuracy

def crf_nll_multi(y_true, y_pred):
    """Negative log-likelihood of a CRF that is reached through another layer.

    ``y_pred`` is used only to locate the CRF: the layer recorded in its
    ``_keras_history`` is followed to its first inbound layer, and that
    layer's first output layer is taken to be the CRF.

    Args:
        y_true: Target tensor. When ``crf.sparse_target`` is truthy it is
            converted to one-hot over ``crf.units`` classes.
        y_pred: Tensor whose ``_keras_history`` leads to the CRF layer.

    Returns:
        The value of ``crf.get_negative_log_likelihood(y_true, X, mask)``.

    Raises:
        TypeError: If the CRF layer has outbound nodes, i.e. it is not the
            last layer.
    """
    concat, idx = y_pred._keras_history[:2]
    crf = concat._inbound_nodes[idx].inbound_layers[0]._output_layers[0]
    if crf._outbound_nodes:
        # The CRF argument is spelled `learn_mode`, so the message uses that.
        raise TypeError('When learn_mode="join", CRF must be the last layer.')
    if crf.sparse_target:
        y_true = K.one_hot(K.cast(y_true[:, :, 0], 'int32'), crf.units)
    # NOTE(review): this always reads the CRF's inbound node 0. That is
    # presumably the layer's original call rather than a per-GPU replica,
    # which may relate to the single-GPU usage reported above. Confirm.
    crf_node = crf._inbound_nodes[0]
    X = crf_node.input_tensors[0]
    mask = crf_node.input_masks[0]
    return crf.get_negative_log_likelihood(y_true, X, mask)

def crf_loss_multi(y_true, y_pred):
    """CRF loss for a tensor that may or may not come directly from a CRF.

    If the layer recorded in ``y_pred._keras_history`` is a ``CRF``, the
    stock ``crf_loss`` is used. Otherwise the CRF is looked up as the first
    output layer of that layer's first inbound layer, and the loss is chosen
    from its ``learn_mode`` and ``sparse_target`` attributes.

    Args:
        y_true: Target tensor.
        y_pred: Prediction tensor carrying ``_keras_history``.

    Returns:
        The loss value from ``crf_loss``, ``crf_nll_multi`` or one of the
        Keras cross-entropy functions.
    """
    layer, node_index = y_pred._keras_history[:2]

    # Single-model case: defer to the stock implementation.
    if isinstance(layer, CRF):
        return crf_loss(y_true, y_pred)

    source_model = layer._inbound_nodes[node_index].inbound_layers[0]
    inner_crf = source_model._output_layers[0]

    if inner_crf.learn_mode == 'join':
        return crf_nll_multi(y_true, y_pred)

    # Any other learn mode is scored with a cross-entropy on y_pred.
    cross_entropy = (
        sparse_categorical_crossentropy
        if inner_crf.sparse_target
        else categorical_crossentropy
    )
    return cross_entropy(y_true, y_pred)

def crf_viterbi_accuracy_multi(y_true, y_pred):
    """Viterbi accuracy for a tensor that may not come directly from a CRF.

    If the layer recorded in ``y_pred._keras_history`` is a ``CRF``, the
    stock ``crf_viterbi_accuracy`` is used. Otherwise the CRF is looked up
    as the first output layer of that layer's first inbound layer, and
    predictions are decoded from the input tensor and mask stored on the
    CRF's inbound node 0.

    Args:
        y_true: Target tensor.
        y_pred: Prediction tensor carrying ``_keras_history``.

    Returns:
        The result of ``crf_viterbi_accuracy`` or ``_get_accuracy``.
    """
    layer, node_index = y_pred._keras_history[:2]
    if isinstance(layer, CRF):
        return crf_viterbi_accuracy(y_true, y_pred)

    source_model = layer._inbound_nodes[node_index].inbound_layers[0]
    inner_crf = source_model._output_layers[0]

    first_call = inner_crf._inbound_nodes[0]
    crf_input = first_call.input_tensors[0]
    crf_mask = first_call.input_masks[0]

    decoded = inner_crf.viterbi_decoding(crf_input, crf_mask)
    return _get_accuracy(y_true, decoded, crf_mask, inner_crf.sparse_target)

def crf_marginal_accuracy_multi(y_true, y_pred):
    """Marginal accuracy for a tensor that may not come directly from a CRF.

    If the layer recorded in ``y_pred._keras_history`` is a ``CRF``, the
    stock ``crf_marginal_accuracy`` is used. Otherwise the CRF is looked up
    as the first output layer of that layer's first inbound layer, and
    marginal probabilities are computed from the input tensor and mask
    stored on the CRF's inbound node 0.

    Args:
        y_true: Target tensor.
        y_pred: Prediction tensor carrying ``_keras_history``.

    Returns:
        The result of ``crf_marginal_accuracy`` or ``_get_accuracy``.
    """
    layer, node_index = y_pred._keras_history[:2]
    if isinstance(layer, CRF):
        return crf_marginal_accuracy(y_true, y_pred)

    source_model = layer._inbound_nodes[node_index].inbound_layers[0]
    inner_crf = source_model._output_layers[0]

    first_call = inner_crf._inbound_nodes[0]
    crf_input = first_call.input_tensors[0]
    crf_mask = first_call.input_masks[0]

    marginals = inner_crf.get_marginal_prob(crf_input, crf_mask)
    return _get_accuracy(y_true, marginals, crf_mask, inner_crf.sparse_target)

def crf_accuracy_multi(y_true, y_pred):
    """Pick the accuracy metric that matches the CRF's ``test_mode``.

    If the layer recorded in ``y_pred._keras_history`` is a ``CRF``, the
    stock ``crf_accuracy`` is used. Otherwise the CRF is looked up as the
    first output layer of that layer's first inbound layer, and the call is
    forwarded to the Viterbi variant when ``test_mode == 'viterbi'`` and to
    the marginal variant for any other value.

    Args:
        y_true: Target tensor.
        y_pred: Prediction tensor carrying ``_keras_history``.

    Returns:
        The result of the selected accuracy function.
    """
    layer, node_index = y_pred._keras_history[:2]
    if isinstance(layer, CRF):
        return crf_accuracy(y_true, y_pred)

    source_model = layer._inbound_nodes[node_index].inbound_layers[0]
    inner_crf = source_model._output_layers[0]

    metric = (
        crf_viterbi_accuracy_multi
        if inner_crf.test_mode == 'viterbi'
        else crf_marginal_accuracy_multi
    )
    return metric(y_true, y_pred)

keras-team / keras-contrib

CRF layer loss with multi_gpu_model #453