lvapeab / nmt-keras

Neural Machine Translation with Keras
http://nmt-keras.readthedocs.io
MIT License

Using Tensorboard with multi_gpu gives Error #96

Closed: KushalDave closed this issue 5 years ago

KushalDave commented 5 years ago

I put together a model that builds on the TranslationModel you have written. The code below stops working if I uncomment the following line:

callbacks.append(callback_tensorboard)

I looked at https://github.com/keras-team/keras/issues/6988, which says: "I've come across this issue and it was occurring because my combined models were cascaded, thus the Tensorboard callback has a hard time finding given layers (you might be able to give it a proper input like "model2/layer3" but my attempts at that have failed, I'm quite new at this)." Do you have a solution for this?

```python
if __name__ == '__main__':
    import string
    import re
    import numpy as np
    import tensorflow as tf
    import cloudpickle as cloudpk
    import os
    from DataGeneratorAttention import DataGeneratorAttention
    from DataGenerator import DataGenerator
    from keras.callbacks import ModelCheckpoint
    from keras.utils import Sequence
    from keras.models import Sequential
    from keras.layers import LSTM
    from keras.layers import Dense
    from keras.layers import Embedding
    from keras.layers import RepeatVector
    from keras.layers import TimeDistributed

from custom_recurrents import AttentionDecoder

from keras.callbacks import Callback
from keras.utils import multi_gpu_model
from keras.layers.core import *
import time
from config import load_parameters
from nmt_keras.model_zoo import TranslationModel
import logging
from keras_wrapper.extra.regularize import Regularize
from keras.callbacks import EarlyStopping, TensorBoard
from keras import backend as K  # used by the K.backend() check below
from keras_wrapper.extra.callbacks import *

def saveModel(model_wrapper, update_num, path=None, full_path=False, store_iter=False):
    """
    Saves a backup of the current Model_Wrapper object after being trained for 'update_num' iterations/updates/epochs.

    :param model_wrapper: object to save
    :param update_num: identifier of the number of iterations/updates/epochs elapsed
    :param path: path where the model will be saved
    :param full_path: Whether we save to `path` or to path + '/epoch_' + update_num
    :param store_iter: Whether we store the current update_num
    :return: None
    """
    if not path:
        path = '/Model'

    iteration = str(update_num)

    if full_path:
        if store_iter:
            model_name = path + '_' + iteration
        else:
            model_name = path
    else:
        if store_iter:
            model_name = path + '/update_' + iteration
        else:
            model_name = path + '/epoch_' + iteration

    if not model_wrapper.silence:
        logging.info("<<< Saving model to " + model_name + " ... >>>")

    # Create models dir
    if not os.path.isdir(path):
        if not os.path.isdir(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))

    try:  # Try to save model at one time
        model_wrapper.model.save(model_name + '.h5')
    except Exception as e:  # Split saving in model structure / weights
        logging.info(str(e))
        # Save model structure
        json_string = model_wrapper.model.to_json()
        open(model_name + '_structure.json', 'w').write(json_string)
        # Save model weights
        model_wrapper.model.save_weights(model_name + '_weights.h5', overwrite=True)

    # Save auxiliary models for optimized search
    if model_wrapper.model_init is not None:
        try:  # Try to save model at one time
            model_wrapper.model_init.save(model_name + '_init.h5')
        except Exception as e:  # Split saving in model structure / weights
            logging.info(str(e))
            # Save model structure
            logging.info("<<< Saving model_init to " + model_name + "_structure_init.json... >>>")
            json_string = model_wrapper.model_init.to_json()
            open(model_name + '_structure_init.json', 'w').write(json_string)
            # Save model weights
            model_wrapper.model_init.save_weights(model_name + '_weights_init.h5', overwrite=True)

    if model_wrapper.model_next is not None:
        try:  # Try to save model at one time
            model_wrapper.model_next.save(model_name + '_next.h5')
        except Exception as e:  # Split saving in model structure / weights
            logging.info(str(e))
            # Save model structure
            logging.info("<<< Saving model_next to " + model_name + "_structure_next.json... >>>")
            json_string = model_wrapper.model_next.to_json()
            open(model_name + '_structure_next.json', 'w').write(json_string)
            # Save model weights
            model_wrapper.model_next.save_weights(model_name + '_weights_next.h5', overwrite=True)

    # Save additional information
    backup_multi_gpu_model = None
    if hasattr(model_wrapper, 'multi_gpu_model'):
        backup_multi_gpu_model = model_wrapper.multi_gpu_model
        setattr(model_wrapper, 'multi_gpu_model', None)

    cloudpk.dump(model_wrapper, open(model_name + '_Model_Wrapper.pkl', 'wb'))
    setattr(model_wrapper, 'multi_gpu_model', backup_multi_gpu_model)

    if not model_wrapper.silence:
        logging.info("<<< Model saved >>>")

def load_file(fn, batch_count, xlength,ylength, batch_size):
    i=0
    X = np.zeros(shape=(batch_count*batch_size,xlength), dtype=np.uint16)
    XMask = np.zeros(shape=(batch_count*batch_size,xlength), dtype=np.uint8)
    Y = np.zeros(shape=(batch_count*batch_size,ylength), dtype=np.uint16)
    YMask = np.zeros(shape=(batch_count*batch_size,ylength), dtype=np.uint8)
    with open(fn) as f:
        for line in f:
            couple=line.split('\t')
            if(len(couple) != 2):
                continue
            tempx = np.fromstring(couple[0], dtype=np.uint16, sep=" ")
            X[i,0:len(tempx)] = tempx
            XMask[i, 0:len(tempx)]=np.where(tempx > 0, 1, 0)
            tempy = np.fromstring(couple[1], dtype=np.uint16, sep=" ")
            Y[i,0:len(tempy)] = tempy
            YMask[i, 0:len(tempy)]=np.where(tempy > 0, 1, 0)
            i+=1
    Xr = (np.asarray(X, dtype=np.uint16), np.asarray(XMask, dtype='int8'))
    Yr = (np.asarray(Y, dtype=np.uint16), np.asarray(YMask, dtype='int8'))
    return Xr, Yr

class TimeHistory(Callback):
    """Callback that prints the wall-clock time spent on each training batch."""

    def on_batch_begin(self, batch, logs=None):
        self.batch_time_start = time.time()

    def on_batch_end(self, batch, logs=None):
        print('training for batch ' + str(batch) + ' in ' + str(time.time() - self.batch_time_start) + ' secs')

params = {
        'src_max_length':9,
        'src_vocab_size':65532,
        'tar_max_length':9,
        'tar_vocab_size':65532}

parameters = load_parameters()

parameters['INPUT_VOCABULARY_SIZE'] = params['src_vocab_size']
parameters['OUTPUT_VOCABULARY_SIZE'] = params['tar_vocab_size']
parameters['MAX_INPUT_TEXT_LEN'] = params['src_max_length']
parameters['MAX_OUTPUT_TEXT_LEN'] = params['tar_max_length']
parameters['SOURCE_TEXT_EMBEDDING_SIZE'] = 64
parameters['TARGET_TEXT_EMBEDDING_SIZE'] = 64
trainfn = "TrainIds.txt"
testfn = "TestIds.txt"

train_batch_count = 4687
test_batch_count = 1640 
batch_size=256
TrainX, TrainY = load_file(trainfn, train_batch_count, params['src_max_length'], params['tar_max_length'], batch_size)
TestX, TestY = load_file(testfn, test_batch_count, params['src_max_length'], params['tar_max_length'], batch_size)
print((TrainX[0]).shape)

print((TestX[0]).shape)
time_callback = TimeHistory()

set_optimizer = True if parameters['RELOAD'] == 0 else False
clear_dirs = True if parameters['RELOAD'] == 0 else False
training_generator = DataGeneratorAttention(TrainX,TrainY, train_batch_count,batch_size, **params)
validation_generator = DataGeneratorAttention(TestX, TestY, test_batch_count,batch_size, **params)

# build new model
nmt_model = TranslationModel(parameters,
                             model_type=parameters['MODEL_TYPE'],
                             verbose=parameters['VERBOSE'],
                             model_name=parameters['MODEL_NAME'],
                             store_path=parameters['STORE_PATH'],
                             set_optimizer=set_optimizer,
                             clear_dirs=clear_dirs)

logging.debug('Starting training!')
training_params = {'n_epochs': parameters['MAX_EPOCH'],
                   'batch_size': parameters['BATCH_SIZE'],
                   'homogeneous_batches': parameters['HOMOGENEOUS_BATCHES'],
                   'maxlen': parameters['MAX_OUTPUT_TEXT_LEN'],
                   'joint_batches': parameters['JOINT_BATCHES'],
                   'lr_decay': parameters.get('LR_DECAY', None),  # LR decay parameters
                   'initial_lr': parameters.get('LR', 1.0),
                   'reduce_each_epochs': parameters.get('LR_REDUCE_EACH_EPOCHS', True),
                   'start_reduction_on_epoch': parameters.get('LR_START_REDUCTION_ON_EPOCH', 0),
                   'lr_gamma': parameters.get('LR_GAMMA', 0.9),
                   'lr_reducer_type': parameters.get('LR_REDUCER_TYPE', 'linear'),
                   'lr_reducer_exp_base': parameters.get('LR_REDUCER_EXP_BASE', 0),
                   'lr_half_life': parameters.get('LR_HALF_LIFE', 50000),
                   'lr_warmup_exp': parameters.get('WARMUP_EXP', -1.5),
                   'epochs_for_save': parameters['EPOCHS_FOR_SAVE'],
                   'verbose': parameters['VERBOSE'],
                   'eval_on_sets': parameters['EVAL_ON_SETS_KERAS'],
                   'n_parallel_loaders': parameters['PARALLEL_LOADERS'],
                   #'extra_callbacks': callbacks,
                   'reload_epoch': parameters['RELOAD'],
                   'epoch_offset': parameters.get('EPOCH_OFFSET', 0),
                   'data_augmentation': parameters['DATA_AUGMENTATION'],
                   'patience': parameters.get('PATIENCE', 0),  # early stopping parameters
                   'patience_check_split': 'val',
                   'metric_check': parameters.get('STOP_METRIC', None) if parameters.get('EARLY_STOP', False) else None,
                   'eval_on_epochs': parameters.get('EVAL_EACH_EPOCHS', True),
                   'each_n_epochs': parameters.get('EVAL_EACH', 1),
                   'start_eval_on_epoch': parameters.get('START_EVAL_ON_EPOCH', 0),
                   'tensorboard': parameters.get('TENSORBOARD', False),
                   'n_gpus': parameters.get('N_GPUS', 1),
                   'tensorboard_params': {'log_dir': parameters.get('LOG_DIR', 'tensorboard_logs'),
                                          'histogram_freq': parameters.get('HISTOGRAM_FREQ', 0),
                                          'batch_size': parameters.get('TENSORBOARD_BATCH_SIZE', parameters['BATCH_SIZE']),
                                          'write_graph': parameters.get('WRITE_GRAPH', True),
                                          'write_grads': parameters.get('WRITE_GRADS', False),
                                          'write_images': parameters.get('WRITE_IMAGES', False),
                                          'embeddings_freq': parameters.get('EMBEDDINGS_FREQ', 0),
                                          'embeddings_layer_names': parameters.get('EMBEDDINGS_LAYER_NAMES', None),
                                          'embeddings_metadata': parameters.get('EMBEDDINGS_METADATA', None),
                                          'label_word_embeddings_with_vocab': parameters.get(
                                              'LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False),
                                          'word_embeddings_labels': parameters.get('WORD_EMBEDDINGS_LABELS', None),
                                          }
                   }

callbacks = []
if training_params.get('metric_check') is not None:
    callback_early_stop = EarlyStopping(nmt_model,patience=training_params['patience'],
                                        metric_check=training_params['metric_check'],
                                        want_to_minimize=True if 'TER' in training_params['metric_check'] else False,
                                        check_split=training_params['patience_check_split'],
                                        eval_on_epochs=training_params['eval_on_epochs'],
                                        each_n_epochs=training_params['each_n_epochs'],
                                        start_eval_on_epoch=training_params['start_eval_on_epoch'])
    callbacks.append(callback_early_stop)

if training_params['epochs_for_save'] >= 0:
    callback_store_model = StoreModelWeightsOnEpochEnd(nmt_model, saveModel, training_params['epochs_for_save'])
    callbacks.insert(0, callback_store_model)
callbacks.append(time_callback)

if training_params['tensorboard'] and K.backend() == 'tensorflow':
    callback_tensorboard = TensorBoard(
            log_dir= 'Data/' + training_params['tensorboard_params']['log_dir'],
            histogram_freq=training_params['tensorboard_params']['histogram_freq'],
            batch_size=training_params['tensorboard_params']['batch_size'],
            write_graph=training_params['tensorboard_params']['write_graph'],
            write_grads=training_params['tensorboard_params']['write_grads'],
            write_images=training_params['tensorboard_params']['write_images'],
            embeddings_freq=training_params['tensorboard_params']['embeddings_freq'],
            embeddings_layer_names=training_params['tensorboard_params']['embeddings_layer_names'],
            embeddings_metadata=training_params['tensorboard_params']['embeddings_metadata'])
    #callbacks.append(callback_tensorboard) ###### TENSORBOARD DOES NOT SEEM TO WORK WITH MULTI_GPU
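    # Likely cause: multi_gpu_model() below returns a new parallel model, and
    # fit_generator() binds callbacks to that parallel model, in which the original
    # network appears as a single nested layer, so the TensorBoard callback cannot
    # resolve the embedding layers it needs (see the traceback in the next comment).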

mymod = multi_gpu_model(nmt_model.model)

mymod.compile(optimizer='adam', loss='categorical_crossentropy')
filename = 'modelfull_attention.h5'
checkpoint = ModelCheckpoint(filename, monitor='loss', verbose=1, save_best_only=True, mode='min')
mymod.fit_generator(generator=training_generator, workers=4, verbose=2, validation_data=validation_generator, epochs=30, use_multiprocessing=True, callbacks=callbacks)
```
KushalDave commented 5 years ago

@lvapeab Here is the stack trace I get if I uncomment the line:

```
Traceback (most recent call last):
  File "NMTKeras\TrainGen_Attention.py", line 351, in <module>
    mymod.fit_generator(generator=training_generator, workers=4, verbose=2, validation_data=validation_generator, epochs=30, use_multiprocessing=True, callbacks=callbacks)
  File "venv\Scripts\keras\keras\legacy\interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "venv\Scripts\keras\keras\engine\training.py", line 1470, in fit_generator
    initial_epoch=initial_epoch)
  File "venv\Scripts\keras\keras\engine\training_generator.py", line 95, in fit_generator
    callbacks.set_model(callback_model)
  File "venv\Scripts\keras\keras\callbacks.py", line 53, in set_model
    callback.set_model(model)
  File "venv\Scripts\keras\keras\callbacks.py", line 889, in set_model
    self.saver = tf.train.Saver(list(embeddings_vars.values()))
  File "venv\lib\site-packages\tensorflow\python\training\saver.py", line 1338, in __init__
    self.build()
  File "venv\lib\site-packages\tensorflow\python\training\saver.py", line 1347, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "venv\lib\site-packages\tensorflow\python\training\saver.py", line 1372, in _build
    raise ValueError("No variables to save")
ValueError: No variables to save
```
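One workaround along the lines of the keras-team thread linked above is to pin the TensorBoard callback to the single-GPU template model instead of the parallel model returned by `multi_gpu_model`. A minimal sketch, assuming the standard Keras 2.x `TensorBoard` API; the subclass name and the usage lines are illustrative, not part of this repo:

```python
from keras.callbacks import TensorBoard

class TemplateModelTensorBoard(TensorBoard):
    """TensorBoard callback that always logs from the template (single-GPU) model."""

    def __init__(self, template_model, **kwargs):
        super(TemplateModelTensorBoard, self).__init__(**kwargs)
        self.template_model = template_model

    def set_model(self, model):
        # Ignore the parallel model passed in by fit_generator() and attach to the
        # template model, whose layers and embeddings TensorBoard can resolve.
        super(TemplateModelTensorBoard, self).set_model(self.template_model)

# Illustrative usage with the script above:
# callback_tensorboard = TemplateModelTensorBoard(nmt_model.model,
#                                                 log_dir='Data/tensorboard_logs')
# callbacks.append(callback_tensorboard)
```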

lvapeab commented 5 years ago

Hi @KushalDave ,

It seems that with multi_gpu_model, Keras generates two Savers for TensorBoard. I've just uploaded a (dirty) fix for that (https://github.com/MarcBS/keras/commit/382e69c6eb732cb13b69bd7c408fdfc4248c67fc, https://github.com/lvapeab/multimodal_keras_wrapper/commit/14547a179f0ff4a99285f4807c1ae13d23c88b5c). I've tested it and it seems to work properly. You should update the Keras and Multimodal Keras Wrapper repositories.

Cheers.

lvapeab commented 5 years ago

Hi again,

we've decided to temporarily disable the embedding data options in the TensorBoard callback and rely on the one from the original Keras repo. I'm therefore closing this issue. Cheers.
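For reference, with the stock Keras callback the error can also be sidestepped by leaving the embeddings options off, since the failing Saver is only built when embeddings logging is enabled. A minimal sketch, reusing the `callbacks` list and log directory from the script above (the parameter values are assumptions, not this repo's defaults):

```python
from keras.callbacks import TensorBoard

# Assumption: with embeddings_freq=0 (and histogram_freq=0), set_model() never
# builds the tf.train.Saver that raised "No variables to save" above.
callback_tensorboard = TensorBoard(log_dir='Data/tensorboard_logs',
                                   histogram_freq=0,
                                   write_graph=True,
                                   embeddings_freq=0)
callbacks.append(callback_tensorboard)
```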