lvapeab / nmt-keras

Neural Machine Translation with Keras
http://nmt-keras.readthedocs.io
MIT License

Getting error when using Tensorboard #128

Closed NamTran838P closed 4 years ago

NamTran838P commented 4 years ago

I am trying to train a seq2seq attention model on 2 GPUs and would like to use TensorBoard for visualization. I followed the nmt-keras TensorBoard tutorial but still got an error. Here is the code I used:

def start_training(use_gpu):

    ds = Dataset('tutorial_dataset', 'tutorial', silence=False)
    PATH = ""
    ds.setOutput(PATH + "train_correct.txt",
                 'train',
                 type='text',
                 id='target_text',
                 tokenization='tokenize_basic',
                 build_vocabulary=True,
                 pad_on_batch=True,
                 sample_weights=True,
                 max_text_len=100,
                 max_words=50000,
                 min_occ=1)

    ds.setOutput(PATH + "validation_correct.txt",
                 'val',
                 type='text',
                 id='target_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 sample_weights=True,
                 max_text_len=100,
                 max_words=0)

    ds.setInput(PATH + "train_error.txt",
                'train',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                build_vocabulary=True,
                fill='end',
                max_text_len=100,
                max_words=50000,
                min_occ=1)

    ds.setInput(PATH + "validation_error.txt",
                'val',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                fill='end',
                max_text_len=100,
                min_occ=1)

    """...and for the 'state_below' data. Note that: 1) The offset flat is set to 1, which means that the text will be shifted to the right 1 position. 2) During sampling time, we won't have this input. Hence, we 'hack' the dataset model by inserting an artificial input, of type 'ghost' for the validation split."""

    ds.setInput(PATH + "train_correct.txt",
                'train',
                type='text',
                id='state_below',
                required=False,
                tokenization='tokenize_basic',
                pad_on_batch=True,
                build_vocabulary='target_text',
                offset=1,
                fill='end',
                max_text_len=100,
                max_words=50000)
    ds.setInput(None,
                'val',
                type='ghost',
                id='state_below',
                required=False)

    """We can also keep the literal source words (for replacing unknown words)."""

    for split, input_text_filename in zip(['train', 'val'], [PATH + "train_error.txt", PATH + "validation_error.txt"]):
        ds.setRawInput(input_text_filename,
                       split,
                       type='file-name',
                       id='raw_source_text',
                       overwrite_split=True)

    """We also need to match the references with the inputs. Since we only have one reference per input sample, we set `repeat=1`."""

    keep_n_captions(ds, repeat=1, n=1, set_names=['val'])

    """Finally, we can save our dataset instance for using in other experiments:"""

    saveDataset(ds, PATH + "dataset")

    """## 2. Creating and training a Neural Translation Model
    Now, we'll create and train a Neural Machine Translation (NMT) model. Since there is a significant number of hyperparameters, we'll use the default ones specified in the `config.py` file. Note that almost every hardcoded parameter is automatically set from the config if we run `main.py`.

    We'll create an `'AttentionRNNEncoderDecoder'` (an LSTM encoder-decoder with an attention mechanism). Refer to the [`model_zoo.py`](https://github.com/lvapeab/nmt-keras/blob/master/nmt_keras/model_zoo.py) file for other models (e.g. the Transformer).

    So first, let's import the model and the hyperparameters. We'll also load the dataset we stored in the previous section (not strictly necessary, since it is already in memory, but done here as a demonstration):
    """

    params = load_parameters()
    dataset = loadDataset(PATH + "dataset/Dataset_tutorial_dataset.pkl")

    """Since the number of words in the dataset may be unknown beforehand, we must update the params information according to the dataset instance:"""

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['source_text']
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['target_text']
    params['USE_CUDNN'] = use_gpu
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = PATH + "model/"
    params['ATTENTION_MODE'] = "add"
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 128
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 128
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 128
    params['ATTENTION_SIZE'] = 128
    params['ENCODER_HIDDEN_SIZE'] = 128
    params['DECODER_HIDDEN_SIZE'] = 128
    params['ENCODER_RNN_TYPE'] = "GRU"
    params['DECODER_RNN_TYPE'] = "ConditionalGRU"
    params['METRICS'] = ['sacrebleu']
    params['STOP_METRIC'] = 'sacrebleu'
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 1.0
    params['TENSORBOARD'] = True
    params['LOG_DIR'] = 'tensorboard_logs'
    params['EMBEDDING_FREQ'] = 1
    params['WRITE_GRAPH'] = True
    params['WRITE_GRADS'] = True
    params['WRITE_IMAGES'] = True
    params['EMBEDDING_LAYER_NAMES'] = ["source_word_embedding", "target_word_embedding"]
    params['LABEL_WORD_EMBEDDINGS_WITH_VOCAB'] = True
    params['WORD_EMBEDDINGS_LABELS'] = ['source_text', 'target_text']
    nmt_model = TranslationModel(params,
                                 model_type='AttentionRNNEncoderDecoder',
                                 model_name='tutorial_model',
                                 vocabularies=dataset.vocabulary,
                                 store_path=params['STORE_PATH'],
                                 verbose=True)

    inputMapping = dict()
    for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
        pos_source = dataset.ids_inputs.index(id_in)
        id_dest = nmt_model.ids_inputs[i]
        inputMapping[id_dest] = pos_source
    nmt_model.setInputsMapping(inputMapping)

    outputMapping = dict()
    for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
        pos_target = dataset.ids_outputs.index(id_out)
        id_dest = nmt_model.ids_outputs[i]
        outputMapping[id_dest] = pos_target
    nmt_model.setOutputsMapping(outputMapping)

    """We can add some callbacks for controlling the training (e.g. Sampling each N updates, early stop, learning rate annealing...). For instance, let's build a sampling callback. After each epoch, it will compute the BLEU scores on the development set using the sacreBLEU package. We need to pass some configuration variables to the callback (in the extra_vars dictionary):"""

    search_params = {
        'language': 'en',
        'tokenize_f': eval('dataset.' + 'tokenize_basic'),
        'beam_size': 1,
        'optimized_search': True,
        'n_gpus': 2,
        'model_inputs': params['INPUTS_IDS_MODEL'],
        'model_outputs': params['OUTPUTS_IDS_MODEL'],
        'dataset_inputs': params['INPUTS_IDS_DATASET'],
        'dataset_outputs': params['OUTPUTS_IDS_DATASET'],
        'n_parallel_loaders': 1,
        'maxlen': 100,
        'normalize': True,
        'pos_unk': True,
        'heuristic': 0,
        'state_below_maxlen': 1,
        'val': {'references': dataset.extra_variables['val']['target_text']}
    }

    vocab = dataset.vocabulary['target_text']['idx2words']
    callbacks = []
    input_text_id = params['INPUTS_IDS_DATASET'][0]

    callbacks.append(PrintPerformanceMetricOnEpochEndOrEachNUpdates(nmt_model,
                                                                    dataset,
                                                                    gt_id='target_text',
                                                                    metric_name=['sacrebleu'],
                                                                    set_name=['val'],
                                                                    batch_size=256,
                                                                    each_n_epochs=1,
                                                                    extra_vars=search_params,
                                                                    reload_epoch=0,
                                                                    is_text=True,
                                                                    input_text_id=input_text_id,
                                                                    index2word_y=vocab,
                                                                    sampling_type='max_likelihood',
                                                                    beam_search=True,
                                                                    save_path=nmt_model.model_path,
                                                                    start_eval_on_epoch=0,
                                                                    write_samples=True,
                                                                    write_type='list',
                                                                    verbose=True))

    """Now we are ready to train. Let's set up some training parameters..."""

    training_params = {'n_epochs': 500,
                       'batch_size': 256,
                       'maxlen': 50,
                       'epochs_for_save': 1,
                       'verbose': 1,
                       'eval_on_sets': [], 
                       'n_parallel_loaders': 1,
                       'extra_callbacks': callbacks,
                       'reload_epoch': 0,
                       'epoch_offset': 0,
                       'n_gpus': 2,
                       'tensorboard': True,
                       'tensorboard_params': {'log_dir': 'tensorboard_logs',
                                              'embeddings_freq': 1,
                                              'embeddings_metadata': None,
                                              'word_embedding_labels': ['source_text', 'target_text'],
                                              'label_word_embeddings_with_vocab': True,
                                              'embeddings_layer_names': ['source_word_embedding', 'target_word_embedding'],
                                              'histogram_freq': 1,
                                              'batch_size': 100,
                                              'write_graph': True,
                                              'write_grads': True,
                                              'write_images': True}}

    nmt_model.trainNet(dataset, training_params)
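
For context, start_training is invoked from a small driver script; a hypothetical equivalent (the actual train_model.py wraps the call in a main(), as the traceback below shows) would be:

if __name__ == "__main__":
    # Hypothetical driver: the real train_model.py calls start_training() from main().
    start_training(use_gpu=True)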

Here is the full log of the error I got (note that if I disable TensorBoard, everything trains perfectly):

Traceback (most recent call last):
  File "train_model.py", line 384, in <module>
    main()
  File "train_model.py", line 373, in main
    start_training(use_gpu)
  File "train_model.py", line 235, in start_training
    nmt_model.trainNet(dataset, training_params)
  File "/WAVE/users/unix/nvtran/.local/lib/python3.7/site-packages/keras_wrapper/cnn_model.py", line 923, in trainNet
    self.__train(ds, params)
  File "/WAVE/users/unix/nvtran/.local/lib/python3.7/site-packages/keras_wrapper/cnn_model.py", line 1040, in __train
    callback_tensorboard.set_model(self.model)
  File "/WAVE/users/unix/nvtran/keras/keras/callbacks/tensorboard_v1.py", line 199, in set_model
    layer.output)
  File "/WAVE/apps/eb/software/TensorFlow/1.14.0-fosscuda-2019a-Python-3.7.2/lib/python3.7/site-packages/tensorflow/python/summary/summary.py", line 179, in histogram
    tag=tag, values=values, name=scope)
  File "/WAVE/apps/eb/software/TensorFlow/1.14.0-fosscuda-2019a-Python-3.7.2/lib/python3.7/site-packages/tensorflow/python/ops/gen_logging_ops.py", line 329, in histogram_summary
    "HistogramSummary", tag=tag, values=values, name=name)
  File "/WAVE/apps/eb/software/TensorFlow/1.14.0-fosscuda-2019a-Python-3.7.2/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py", line 626, in _apply_op_helper
    param_name=input_name)
  File "/WAVE/apps/eb/software/TensorFlow/1.14.0-fosscuda-2019a-Python-3.7.2/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py", line 60, in _SatisfiesTypeConstraint
    ", ".join(dtypes.as_dtype(x).name for x in allowed_list)))
TypeError: Value passed to parameter 'values' has DataType bool not in list of allowed values: float32, float64, int32, uint8, int16, int8, int64, bfloat16, uint16, float16, uint32, uint64
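
The traceback shows the old TensorBoard callback (keras/callbacks/tensorboard_v1.py) passing layer.output straight to tf.summary.histogram, so at least one layer output in this model is a boolean tensor (likely a mask), which HistogramSummary does not accept. A minimal standalone sketch of the same failure under TensorFlow 1.14, where the boolean tensor is only an illustrative stand-in for the offending layer output:

import tensorflow as tf  # TF 1.14, graph mode, as in the log above

bool_output = tf.cast(tf.ones([3, 5]), tf.bool)  # stand-in for a boolean layer output (e.g. a mask)
tf.summary.histogram('layer_out', bool_output)   # raises the same TypeError: DataType bool not allowed
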
lvapeab commented 4 years ago

This was due to an old TensorBoard callback. For the moment, it works if you set the following parameters:

https://github.com/lvapeab/nmt-keras/blob/5a29099098e47ff84579b5e8c28c7769c952918c/nmt_keras/training.py#L159-L172

You may also need to update the dependencies.
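
For anyone reading this before updating: as an illustrative, unverified sketch based on the error above (per-layer histogram and gradient summaries are the calls that hit the boolean layer outputs), a tensorboard_params configuration along the following lines avoids the crash. The authoritative values are in the lines linked above; everything here reuses keys already present in the script, and the two commented values are assumptions:

training_params['tensorboard'] = True
training_params['tensorboard_params'] = {
    'log_dir': 'tensorboard_logs',
    'histogram_freq': 0,   # assumption: skip per-layer histograms, the op that rejects bool outputs
    'write_grads': False,  # assumption: gradient histograms would hit the same dtype check
    'write_graph': True,
    'write_images': True,
    'embeddings_freq': 1,
    'embeddings_layer_names': ['source_word_embedding', 'target_word_embedding'],
    'embeddings_metadata': None,
    'word_embedding_labels': ['source_text', 'target_text'],
    'label_word_embeddings_with_vocab': True,
    'batch_size': 100}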

NamTran838P commented 4 years ago

It works now. Thanks a lot.