tensorflow / tensor2tensor

Library of deep learning models and datasets designed to make deep learning more accessible and accelerate ML research.
Apache License 2.0
15.5k stars 3.49k forks source link

transformer bfloat16 error #932

Open eyaler opened 6 years ago

eyaler commented 6 years ago

Traceback (most recent call last): File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 510, in _apply_op_helper preferred_dtype=default_dtype) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1107, in internal_convert_to_tensor ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 944, in _TensorTensorConversionFunction (dtype.name, t.dtype.name, str(t))) ValueError: Tensor conversion requested dtype bfloat16 for Tensor with dtype float32: 'Tensor("transformer/parallel_0_5/transformer/transformer/body/add_timing_signal_1d/get_timing_signal_1d/Reshape:0", shape=(1, 64, 512), dtype=float32)'

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/usr/local/bin/t2t-trainer", line 32, in tf.app.run() File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 125, in run _sys.exit(main(argv)) File "/usr/local/bin/t2t-trainer", line 28, in main t2t_trainer.main(argv) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/bin/t2t_trainer.py", line 358, in main execute_schedule(exp) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/bin/t2t_trainer.py", line 306, in execute_schedule getattr(exp, FLAGS.schedule)() File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/utils/trainer_lib.py", line 289, in continuous_train_and_eval self._eval_spec) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/training.py", line 447, in train_and_evaluate return executor.run() File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/training.py", line 531, in run return self.run_local() File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/training.py", line 669, in run_local hooks=train_hooks) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 366, in train loss = self._train_model(input_fn, hooks, saving_listeners) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 1119, in _train_model return self._train_model_default(input_fn, hooks, saving_listeners) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 1132, in _train_model_default features, labels, model_fn_lib.ModeKeys.TRAIN, self.config) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1992, in _call_model_fn features, labels, mode, config) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 1107, in _call_model_fn model_fn_results = self._model_fn(features=features, **kwargs) File 
"/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2223, in _model_fn _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn)) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2537, in _train_on_tpu_system device_assignment=ctx.device_assignment) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 733, in shard name=name) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 394, in replicate device_assignment, name)[1] File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 546, in split_compile_and_replicate outputs = computation(*computation_inputs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2530, in multi_tpu_train_steps_on_single_shard single_tpu_train_step, [_INITIAL_LOSS]) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 207, in repeat cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 169, in while_loop name="") File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 3209, in while_loop result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2941, in BuildLoop pred, body, original_loop_vars, loop_vars, shape_invariants) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2878, in _BuildLoop body_result = body(*packed_vars_for_body) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 120, in body_wrapper outputs = body(*(inputs + dequeue_ops)) File 
"/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 203, in body_wrapper return [i + 1] + _convert_to_list(body(*args)) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1166, in train_step self._call_model_fn(features, labels)) File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1337, in _call_model_fn estimator_spec = self._model_fn(features=features, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/utils/t2t_model.py", line 1155, in wrapping_model_fn use_tpu=use_tpu) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/utils/t2t_model.py", line 1206, in estimator_model_fn logits, losses_dict = model(features) # pylint: disable=not-callable File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 329, in __call__ outputs = super(Layer, self).__call__(inputs, *args, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 703, in __call__ outputs = self.call(inputs, *args, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/utils/t2t_model.py", line 176, in call sharded_logits, losses = self.model_fn_sharded(sharded_features) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/utils/t2t_model.py", line 231, in model_fn_sharded sharded_logits, sharded_losses = dp(self.model_fn, datashard_to_features) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/utils/expert_utils.py", line 231, in __call__ outputs.append(fns[i](*my_args[i], **my_kwargs[i])) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/utils/t2t_model.py", line 265, in model_fn body_out = self.body(transformed_features) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/models/transformer.py", line 167, in body inputs, target_space, hparams, features=features, losses=losses) File 
"/usr/local/lib/python3.5/dist-packages/tensor2tensor/models/transformer.py", line 75, in encode inputs, target_space, hparams, features=features)) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/models/transformer.py", line 1085, in transformer_prepare_encoder encoder_input = common_attention.add_timing_signal_1d(encoder_input) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/utils/expert_utils.py", line 58, in decorated return f(*args, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensor2tensor/layers/common_attention.py", line 473, in add_timing_signal_1d return x + signal File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/math_ops.py", line 847, in binary_op_wrapper return func(x, y, name=name) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 297, in add "Add", x=x, y=y, name=name) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 546, in _apply_op_helper inferred_from[input_arg.type_attr])) TypeError: Input 'y' of 'Add' Op has type float32 that does not match type bfloat16 of argument 'x'.

MarvinLong commented 5 years ago

I got the same error. I am using a problem whose name ends in `_rev`, i.e. the reversed (target-to-source) variant of the problem.