Training is slow - Githubissues

Thank you for the open source code. In the process of using the open source dataset to train the model, I used 8 x 3090 for model training, and the batch size was adjusted to 16. But the overall training process is very slow, and the utilization of GPU is also very low. I am not very familiar with the Tensorflow framework training model. Are there any specific parameters that need to be adjusted? Do you have any suggestions? Thank you very much Below is the training log.
************************************
ExperimentConfig(task=OcrTaskConfig(init_checkpoint='', model=None, train_data=DataConfig(input_path='', tfds_name='', tfds_split='', global_batch_size=0, is_training=True, drop_remainder=True, shuffle_buffer_size=100, cache=False, cycle_length=None, block_length=1, deterministic=None, sharding=True, enable_tf_data_service=False, tf_data_service_address=None, tf_data_service_job_name=None, tfds_data_dir='', tfds_as_supervised=False, tfds_skip_decoding_feature='', enable_shared_tf_data_service_between_parallel_trainers=False, apply_tf_data_service_before_batching=False, trainer_id=None, seed=None, prefetch_buffer_size=None), validation_data=DataConfig(input_path='', tfds_name='', tfds_split='', global_batch_size=0, is_training=None, drop_remainder=True, shuffle_buffer_size=100, cache=False, cycle_length=None, block_length=1, deterministic=None, sharding=True, enable_tf_data_service=False, tf_data_service_address=None, tf_data_service_job_name=None, tfds_data_dir='', tfds_as_supervised=False, tfds_skip_decoding_feature='', enable_shared_tf_data_service_between_parallel_trainers=False, apply_tf_data_service_before_batching=False, trainer_id=None, seed=None, prefetch_buffer_size=None), name=None, differential_privacy_config=None, model_call_needs_labels=False), trainer=TrainerConfig(optimizer_config=OptimizationConfig(optimizer=OptimizerConfig(type='adamw', sgd=SGDConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='SGD', decay=0.0, nesterov=False, momentum=0.0), sgd_experimental=SGDExperimentalConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='SGD', nesterov=False, momentum=0.0, jit_compile=False), adam=AdamConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='Adam', beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False), adam_experimental=AdamExperimentalConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='Adam', beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, jit_compile=False), adamw=AdamWeightDecayConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='AdamWeightDecay', beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, weight_decay_rate=0.05, include_in_weight_decay=['^((?!depthwise).)*(kernel|weights):0$'], exclude_from_weight_decay=['(^((?!kernel).)*:0)|(depthwise_kernel)'], gradient_clip_norm=10.0), adamw_experimental=AdamWeightDecayExperimentalConfig(clipnorm=None, clipvalue=None, global_clipnorm=1.0, name='AdamWeightDecayExperimental', beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, weight_decay=0.0, jit_compile=False), lamb=LAMBConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='LAMB', beta_1=0.9, beta_2=0.999, epsilon=1e-06, weight_decay_rate=0.0, exclude_from_weight_decay=None, exclude_from_layer_adaptation=None), rmsprop=RMSPropConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='RMSprop', rho=0.9, momentum=0.0, epsilon=1e-07, centered=False), lars=LARSConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='LARS', momentum=0.9, eeta=0.001, weight_decay_rate=0.0, nesterov=False, classic_momentum=True, exclude_from_weight_decay=None, exclude_from_layer_adaptation=None), adagrad=AdagradConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='Adagrad', initial_accumulator_value=0.1, epsilon=1e-07), slide=SLIDEConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='SLIDE', beta_1=0.9, beta_2=0.999, epsilon=1e-06, weight_decay_rate=0.0, weight_decay_type='inner', exclude_from_weight_decay=None, exclude_from_layer_adaptation=None, include_in_sparse_layer_adaptation=None, sparse_layer_learning_rate=0.1, do_gradient_rescaling=True, norm_type='layer', ratio_clip_norm=100000.0), adafactor=AdafactorConfig(clipnorm=None, clipvalue=None, global_clipnorm=None, name='Adafactor', factored=True, multiply_by_parameter_scale=True, beta1=None, decay_rate=0.8, step_offset=0, clipping_threshold=1.0, min_dim_size_to_factor=128, epsilon1=1e-30, epsilon2=0.001, weight_decay=None, include_in_weight_decay=None)), ema=None, learning_rate=LrConfig(type='cosine', constant=ConstantLrConfig(name='Constant', learning_rate=0.1), stepwise=StepwiseLrConfig(name='PiecewiseConstantDecay', boundaries=None, values=None, offset=0), exponential=ExponentialLrConfig(name='ExponentialDecay', initial_learning_rate=None, decay_steps=None, decay_rate=None, staircase=None, offset=0), polynomial=PolynomialLrConfig(name='PolynomialDecay', initial_learning_rate=None, decay_steps=None, end_learning_rate=0.0001, power=1.0, cycle=False, offset=0), cosine=CosineLrConfig(name='CosineDecay', initial_learning_rate=0.001, decay_steps=99000, alpha=0.01, offset=1000), power=DirectPowerLrConfig(name='DirectPowerDecay', initial_learning_rate=None, power=-0.5), power_linear=PowerAndLinearDecayLrConfig(name='PowerAndLinearDecay', initial_learning_rate=None, total_decay_steps=None, power=-0.5, linear_decay_fraction=0.1, offset=0), power_with_offset=PowerDecayWithOffsetLrConfig(name='PowerDecayWithOffset', initial_learning_rate=None, power=-0.5, offset=0, pre_offset_learning_rate=1000000.0), step_cosine_with_offset=StepCosineLrConfig(name='StepCosineDecayWithOffset', boundaries=None, values=None, offset=0)), warmup=WarmupConfig(type='linear', linear=LinearWarmupConfig(name='linear', warmup_learning_rate=1e-05, warmup_steps=1000), polynomial=PolynomialWarmupConfig(name='polynomial', power=1, warmup_steps=None))), train_tf_while_loop=True, train_tf_function=True, eval_tf_function=True, eval_tf_while_loop=False, allow_tpu_summary=False, steps_per_loop=20, summary_interval=20, checkpoint_interval=200, max_to_keep=1, continuous_eval_timeout=3600, train_steps=100000, validation_steps=-1, validation_interval=1000, best_checkpoint_export_subdir='', best_checkpoint_eval_metric='', best_checkpoint_metric_comp='higher', loss_upper_bound=1000000.0, recovery_begin_steps=0, recovery_max_trials=0, validation_summary_subdir='validation'), runtime=RuntimeConfig(distribution_strategy='mirrored', enable_xla=False, gpu_thread_mode=None, dataset_num_private_threads=None, per_gpu_thread_count=8, tpu=None, num_gpus=8, worker_hosts=None, task_index=-1, all_reduce_alg=None, num_packs=1, mixed_precision_dtype=None, loss_scale=None, run_eagerly=False, batchnorm_spatial_persistent=False, tpu_enable_xla_dynamic_padder=None, num_cores_per_replica=1, default_shard_dim=-1))
************************************
I0912 21:44:51.021377 140146006726464 train_utils.py:371] Saving experiment configuration to /ssd8/exec/code/tensorflow_models/unified_detector/ckpt_2/params.yaml
2022-09-12 21:44:51.046800: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-12 21:45:01.295635: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22309 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:1a:00.0, compute capability: 8.6
2022-09-12 21:45:01.298819: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 22309 MB memory:  -> device: 1, name: GeForce RTX 3090, pci bus id: 0000:1b:00.0, compute capability: 8.6
2022-09-12 21:45:01.301255: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 22309 MB memory:  -> device: 2, name: GeForce RTX 3090, pci bus id: 0000:3d:00.0, compute capability: 8.6
2022-09-12 21:45:01.303570: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 22309 MB memory:  -> device: 3, name: GeForce RTX 3090, pci bus id: 0000:3e:00.0, compute capability: 8.6
2022-09-12 21:45:01.306081: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:4 with 22309 MB memory:  -> device: 4, name: GeForce RTX 3090, pci bus id: 0000:88:00.0, compute capability: 8.6
2022-09-12 21:45:01.309360: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:5 with 22309 MB memory:  -> device: 5, name: GeForce RTX 3090, pci bus id: 0000:89:00.0, compute capability: 8.6
2022-09-12 21:45:01.311707: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:6 with 22309 MB memory:  -> device: 6, name: GeForce RTX 3090, pci bus id: 0000:b1:00.0, compute capability: 8.6
2022-09-12 21:45:01.313972: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:7 with 22309 MB memory:  -> device: 7, name: GeForce RTX 3090, pci bus id: 0000:b2:00.0, compute capability: 8.6
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
I0912 21:45:04.401717 140146006726464 mirrored_strategy.py:374] Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
I0912 21:45:04.404101 140146006726464 train_utils.py:245] Running default trainer.
I0912 21:45:04.406337 140146006726464 axial_resnet_instances.py:144] Axial-ResNet final config: {'num_blocks': [3, 4, 6, 3], 'backbone_layer_multiplier': 1.0, 'width_multiplier': 1.0, 'stem_width_multiplier': 1.0, 'output_stride': 16, 'classification_mode': False, 'backbone_type': 'resnet_beta', 'use_axial_beyond_stride': 16, 'backbone_use_transformer_beyond_stride': 16, 'extra_decoder_use_transformer_beyond_stride': 16, 'backbone_decoder_num_stacks': 0, 'backbone_decoder_blocks_per_stage': 1, 'extra_decoder_num_stacks': 0, 'extra_decoder_blocks_per_stage': 1, 'max_num_mask_slots': 384, 'num_mask_slots': 384, 'memory_channels': 256, 'base_transformer_expansion': 1.0, 'global_feed_forward_network_channels': 256, 'high_resolution_output_stride': 4, 'activation': 'relu', 'block_group_config': {'attention_bottleneck_expansion': 2, 'drop_path_keep_prob': 0.8, 'drop_path_beyond_stride': 16, 'drop_path_schedule': 'linear', 'positional_encoding_type': None, 'use_global_beyond_stride': 0, 'use_sac_beyond_stride': 0, 'use_squeeze_and_excite': False, 'conv_use_recompute_grad': False, 'axial_use_recompute_grad': False, 'recompute_within_stride': 0, 'transformer_use_recompute_grad': False, 'axial_layer_config': {'query_shape': (129, 129), 'key_expansion': 1, 'value_expansion': 2, 'memory_flange': (32, 32), 'double_global_attention': False, 'num_heads': 8, 'use_query_rpe_similarity': True, 'use_key_rpe_similarity': True, 'use_content_similarity': True, 'retrieve_value_rpe': True, 'retrieve_value_content': True, 'initialization_std_for_query_key_rpe': 1.0, 'initialization_std_for_value_rpe': 1.0, 'self_attention_activation': 'softmax'}, 'dual_path_transformer_layer_config': {'num_heads': 8, 'bottleneck_expansion': 2, 'key_expansion': 1, 'value_expansion': 2, 'feed_forward_network_channels': 2048, 'use_memory_self_attention': True, 'use_pixel2memory_feedback_attention': True, 'transformer_activation': 'softmax'}}, 'bn_layer': <class 'official.projects.unified_detector.modeling.universal_detector.LayerNorm'>, 'conv_kernel_weight_decay': 0.0}
I0912 21:45:04.946798 140146006726464 legacy_adamw.py:56] AdamWeightDecay gradient_clip_norm=10.000000
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:04.972194 140146006726464 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:04.976447 140146006726464 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:04.988127 140146006726464 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:04.990544 140146006726464 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:05.002947 140146006726464 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:05.005227 140146006726464 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:05.014395 140146006726464 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:05.016551 140146006726464 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:05.032051 140146006726464 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:05.034090 140146006726464 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0912 21:45:10.043252 140146006726464 controller.py:393] restoring or initializing model...
restoring or initializing model...
I0912 21:45:10.043418 140146006726464 base_task.py:124] Trying to load pretrained checkpoint from
I0912 21:45:10.043476 140146006726464 base_task.py:129] No checkpoint file found from . Will not load.
I0912 21:45:10.043531 140146006726464 controller.py:399] initialized model.
initialized model.
I0912 21:45:10.043581 140146006726464 train_lib.py:212] Starts to execute mode: train
I0912 21:45:10.044729 140146006726464 controller.py:241] train | step:      0 | training until step 100000...
train | step:      0 | training until step 100000...
INFO:tensorflow:batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
I0912 21:46:19.261677 140146006726464 cross_device_ops.py:900] batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
I0912 21:46:19.365479 140146006726464 cross_device_ops.py:900] batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
I0912 21:46:19.464072 140146006726464 cross_device_ops.py:900] batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
I0912 21:46:20.720240 140146006726464 cross_device_ops.py:900] batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
I0912 21:46:20.895978 140146006726464 cross_device_ops.py:900] batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
I0912 21:46:21.045560 140146006726464 cross_device_ops.py:900] batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
I0912 21:46:22.125288 140146006726464 cross_device_ops.py:900] batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
I0912 21:46:22.298850 140146006726464 cross_device_ops.py:900] batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
I0912 21:46:22.448728 140146006726464 cross_device_ops.py:900] batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
I0912 21:46:23.512996 140146006726464 cross_device_ops.py:900] batch_all_reduce: 1 all-reduces with algorithm = nccl, num_packs = 1
WARNING:tensorflow:From /ssd8/exec/miniconda3/envs/tf/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:1082: calling foldl_v2 (from tensorflow.python.ops.functional_ops) with back_prop=False is deprecated and will be removed in a future version.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldl(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldl(fn, elems))
W0912 21:46:47.816133 139915463485184 deprecation.py:628] From /ssd8/exec/miniconda3/envs/tf/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:1082: calling foldl_v2 (from tensorflow.python.ops.functional_ops) with back_prop=False is deprecated and will be removed in a future version.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldl(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldl(fn, elems))
WARNING:tensorflow:From /ssd8/exec/miniconda3/envs/tf/lib/python3.7/site-packages/tensorflow/python/autograph/impl/api.py:458: calling while_loop_v2 (from tensorflow.python.ops.control_flow_ops) with back_prop=False is deprecated and will be removed in a future version.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))
W0912 21:46:48.055929 139915463485184 deprecation.py:628] From /ssd8/exec/miniconda3/envs/tf/lib/python3.7/site-packages/tensorflow/python/autograph/impl/api.py:458: calling while_loop_v2 (from tensorflow.python.ops.control_flow_ops) with back_prop=False is deprecated and will be removed in a future version.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))
2022-09-12 21:57:46.533575: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-09-12 21:57:48.794548: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-09-12 21:57:50.562443: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-09-12 21:57:58.537018: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-09-12 21:57:59.074211: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-09-12 21:58:05.047480: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-09-12 21:58:13.330499: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-09-12 21:58:21.847865: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-09-12 21:58:27.431636: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-09-12 21:58:39.283256: W tensorflow/core/common_runtime/bfc_allocator.cc:290] Allocator (GPU_5_bfc) ran out of memory trying to allocate 2.88GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2022-09-12 21:58:39.283431: W tensorflow/core/common_runtime/bfc_allocator.cc:290] Allocator (GPU_5_bfc) ran out of memory trying to allocate 2.88GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2022-09-12 21:58:39.292711: W tensorflow/core/common_runtime/bfc_allocator.cc:290] Allocator (GPU_4_bfc) ran out of memory trying to allocate 2.81GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2022-09-12 21:58:39.292775: W tensorflow/core/common_runtime/bfc_allocator.cc:290] Allocator (GPU_4_bfc) ran out of memory trying to allocate 2.81GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2022-09-12 21:58:39.297796: W tensorflow/core/common_runtime/bfc_allocator.cc:290] Allocator (GPU_1_bfc) ran out of memory trying to allocate 2.81GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2022-09-12 21:58:39.297847: W tensorflow/core/common_runtime/bfc_allocator.cc:290] Allocator (GPU_1_bfc) ran out of memory trying to allocate 2.81GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
I0912 22:04:27.680823 140146006726464 controller.py:466] train | step:     20 | steps/sec:    0.0 | output:
    {'learning_rate': 2.9800001e-05,
     'loss_inst_dist': 2.9058683,
     'loss_mask_id': 5.888561,
     'loss_para': 1.4637133,
     'loss_pq': 0.026584303,
     'loss_segmentation_word': 0.7653519,
     'training_loss': 0.65190935}
train | step:     20 | steps/sec:    0.0 | output:
    {'learning_rate': 2.9800001e-05,
     'loss_inst_dist': 2.9058683,
     'loss_mask_id': 5.888561,
     'loss_para': 1.4637133,
     'loss_pq': 0.026584303,
     'loss_segmentation_word': 0.7653519,
     'training_loss': 0.65190935}
I0912 22:04:30.862984 140146006726464 controller.py:495] saved checkpoint to /ssd8/exec/code/tensorflow_models/unified_detector/ckpt_2/ckpt-20.
saved checkpoint to /ssd8/exec/code/tensorflow_models/unified_detector/ckpt_2/ckpt-20.
I0912 22:09:23.639716 140146006726464 controller.py:466] train | step:     40 | steps/sec:    0.1 | output:
    {'learning_rate': 4.9600003e-05,
     'loss_inst_dist': 2.8294528,
     'loss_mask_id': 5.8853784,
     'loss_para': 0.7080146,
     'loss_pq': 0.022374222,
     'loss_segmentation_word': 0.769679,
     'training_loss': 0.5468571}
train | step:     40 | steps/sec:    0.1 | output:
    {'learning_rate': 4.9600003e-05,
     'loss_inst_dist': 2.8294528,
     'loss_mask_id': 5.8853784,
     'loss_para': 0.7080146,
     'loss_pq': 0.022374222,
     'loss_segmentation_word': 0.769679,
     'training_loss': 0.5468571}
I0912 22:15:05.995907 140146006726464 controller.py:466] train | step:     60 | steps/sec:    0.1 | output:
    {'learning_rate': 6.9400005e-05,
     'loss_inst_dist': 2.7996793,
     'loss_mask_id': 5.8745537,
     'loss_para': 0.6333524,
     'loss_pq': 0.017164173,
     'loss_segmentation_word': 0.7499579,
     'training_loss': 0.5293837}
train | step:     60 | steps/sec:    0.1 | output:
    {'learning_rate': 6.9400005e-05,
     'loss_inst_dist': 2.7996793,
     'loss_mask_id': 5.8745537,
     'loss_para': 0.6333524,
     'loss_pq': 0.017164173,
     'loss_segmentation_word': 0.7499579,
     'training_loss': 0.5293837}
I0912 22:20:47.657936 140146006726464 controller.py:466] train | step:     80 | steps/sec:    0.1 | output:
    {'learning_rate': 8.920001e-05,
     'loss_inst_dist': 2.7199156,
     'loss_mask_id': 5.8525662,
     'loss_para': 0.62838936,
     'loss_pq': 0.012765506,
     'loss_segmentation_word': 0.7151447,
     'training_loss': 0.51279145}
train | step:     80 | steps/sec:    0.1 | output:
    {'learning_rate': 8.920001e-05,
     'loss_inst_dist': 2.7199156,
     'loss_mask_id': 5.8525662,
     'loss_para': 0.62838936,
     'loss_pq': 0.012765506,
     'loss_segmentation_word': 0.7151447,
     'training_loss': 0.51279145}
I0912 22:26:03.238194 140146006726464 controller.py:466] train | step:    100 | steps/sec:    0.1 | output:
    {'learning_rate': 0.000109000015,
     'loss_inst_dist': 2.6556196,
     'loss_mask_id': 5.8127174,
     'loss_para': 0.62452364,
     'loss_pq': 0.0095191505,
     'loss_segmentation_word': 0.6863466,
     'training_loss': 0.4994536}
train | step:    100 | steps/sec:    0.1 | output:
    {'learning_rate': 0.000109000015,
     'loss_inst_dist': 2.6556196,
     'loss_mask_id': 5.8127174,
     'loss_para': 0.62452364,
     'loss_pq': 0.0095191505,
     'loss_segmentation_word': 0.6863466,
     'training_loss': 0.4994536}
I0912 22:32:35.755482 140146006726464 controller.py:466] train | step:    120 | steps/sec:    0.1 | output:
    {'learning_rate': 0.0001288,
     'loss_inst_dist': 2.595471,
     'loss_mask_id': 5.7648664,
     'loss_para': 0.6137519,
     'loss_pq': 0.007362091,
     'loss_segmentation_word': 0.65818495,
     'training_loss': 0.4862589}
train | step:    120 | steps/sec:    0.1 | output:
    {'learning_rate': 0.0001288,
     'loss_inst_dist': 2.595471,
     'loss_mask_id': 5.7648664,
     'loss_para': 0.6137519,
     'loss_pq': 0.007362091,
     'loss_segmentation_word': 0.65818495,
     'training_loss': 0.4862589}
I0912 22:38:52.027837 140146006726464 controller.py:466] train | step:    140 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00014860001,
     'loss_inst_dist': 2.5431335,
     'loss_mask_id': 5.674978,
     'loss_para': 0.62526786,
     'loss_pq': 0.0059654233,
     'loss_segmentation_word': 0.6171484,
     'training_loss': 0.4755017}
train | step:    140 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00014860001,
     'loss_inst_dist': 2.5431335,
     'loss_mask_id': 5.674978,
     'loss_para': 0.62526786,
     'loss_pq': 0.0059654233,
     'loss_segmentation_word': 0.6171484,
     'training_loss': 0.4755017}
I0912 22:45:08.585236 140146006726464 controller.py:466] train | step:    160 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00016840002,
     'loss_inst_dist': 2.5124724,
     'loss_mask_id': 5.5821342,
     'loss_para': 0.6277096,
     'loss_pq': 0.005056196,
     'loss_segmentation_word': 0.6002871,
     'training_loss': 0.46952447}
train | step:    160 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00016840002,
     'loss_inst_dist': 2.5124724,
     'loss_mask_id': 5.5821342,
     'loss_para': 0.6277096,
     'loss_pq': 0.005056196,
     'loss_segmentation_word': 0.6002871,
     'training_loss': 0.46952447}
I0912 22:51:21.874272 140146006726464 controller.py:466] train | step:    180 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00018820002,
     'loss_inst_dist': 2.357241,
     'loss_mask_id': 5.433732,
     'loss_para': 0.62334794,
     'loss_pq': 0.0044937204,
     'loss_segmentation_word': 0.56981546,
     'training_loss': 0.4455536}
train | step:    180 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00018820002,
     'loss_inst_dist': 2.357241,
     'loss_mask_id': 5.433732,
     'loss_para': 0.62334794,
     'loss_pq': 0.0044937204,
     'loss_segmentation_word': 0.56981546,
     'training_loss': 0.4455536}
I0912 22:57:55.508177 140146006726464 controller.py:466] train | step:    200 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00020800003,
     'loss_inst_dist': 2.429025,
     'loss_mask_id': 5.3139553,
     'loss_para': 0.619613,
     'loss_pq': 0.0046887393,
     'loss_segmentation_word': 0.565227,
     'training_loss': 0.45355788}
train | step:    200 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00020800003,
     'loss_inst_dist': 2.429025,
     'loss_mask_id': 5.3139553,
     'loss_para': 0.619613,
     'loss_pq': 0.0046887393,
     'loss_segmentation_word': 0.565227,
     'training_loss': 0.45355788}
I0912 23:04:04.186987 140146006726464 controller.py:466] train | step:    220 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00022780002,
     'loss_inst_dist': 2.3147283,
     'loss_mask_id': 5.140997,
     'loss_para': 0.63087285,
     'loss_pq': 0.004355564,
     'loss_segmentation_word': 0.54870176,
     'training_loss': 0.43848544}
train | step:    220 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00022780002,
     'loss_inst_dist': 2.3147283,
     'loss_mask_id': 5.140997,
     'loss_para': 0.63087285,
     'loss_pq': 0.004355564,
     'loss_segmentation_word': 0.54870176,
     'training_loss': 0.43848544}
I0912 23:04:07.480966 140146006726464 controller.py:495] saved checkpoint to /ssd8/exec/code/tensorflow_models/unified_detector/ckpt_2/ckpt-220.
saved checkpoint to /ssd8/exec/code/tensorflow_models/unified_detector/ckpt_2/ckpt-220.
bdI0912 23:10:15.393596 140146006726464 controller.py:466] train | step:    240 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00024760002,
     'loss_inst_dist': 2.3051333,
     'loss_mask_id': 4.862335,
     'loss_para': 0.62140954,
     'loss_pq': 0.004350334,
     'loss_segmentation_word': 0.55073804,
     'training_loss': 0.43635225}
train | step:    240 | steps/sec:    0.1 | output:
    {'learning_rate': 0.00024760002,
     'loss_inst_dist': 2.3051333,
     'loss_mask_id': 4.862335,
     'loss_para': 0.62140954,
     'loss_pq': 0.004350334,
     'loss_segmentation_word': 0.55073804,
     'training_loss': 0.43635225}
I0912 23:16:55.314216 140146006726464 controller.py:466] train | step:    260 | steps/sec:    0.1 | output:
    {'learning_rate': 0.0002674,
     'loss_inst_dist': 2.2381146,
     'loss_mask_id': 4.522301,
     'loss_para': 0.62252784,
     'loss_pq': 0.004659121,
     'loss_segmentation_word': 0.53306484,
     'training_loss': 0.42601705}
train | step:    260 | steps/sec:    0.1 | output:
    {'learning_rate': 0.0002674,
     'loss_inst_dist': 2.2381146,
     'loss_mask_id': 4.522301,
     'loss_para': 0.62252784,
     'loss_pq': 0.004659121,
     'loss_segmentation_word': 0.53306484,
     'training_loss': 0.42601705}
I0912 23:24:11.048573 140146006726464 controller.py:466] train | step:    280 | steps/sec:    0.0 | output:
    {'learning_rate': 0.0002872,
     'loss_inst_dist': 2.1775403,
     'loss_mask_id': 4.101429,
     'loss_para': 0.6138663,
     'loss_pq': 0.004860651,
     'loss_segmentation_word': 0.5418912,
     'training_loss': 0.41853625}
train | step:    280 | steps/sec:    0.0 | output:
    {'learning_rate': 0.0002872,
     'loss_inst_dist': 2.1775403,
     'loss_mask_id': 4.101429,
     'loss_para': 0.6138663,
     'loss_pq': 0.004860651,
     'loss_segmentation_word': 0.5418912,
     'training_loss': 0.41853625}
I0912 23:31:26.761047 140146006726464 controller.py:466] train | step:    300 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00030700004,
     'loss_inst_dist': 2.1884832,
     'loss_mask_id': 3.8205318,
     'loss_para': 0.6208272,
     'loss_pq': 0.004824784,
     'loss_segmentation_word': 0.54650056,
     'training_loss': 0.42133346}
train | step:    300 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00030700004,
     'loss_inst_dist': 2.1884832,
     'loss_mask_id': 3.8205318,
     'loss_para': 0.6208272,
     'loss_pq': 0.004824784,
     'loss_segmentation_word': 0.54650056,
     'training_loss': 0.42133346}
I0912 23:42:14.331473 140146006726464 controller.py:466] train | step:    320 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00032680001,
     'loss_inst_dist': 2.1451507,
     'loss_mask_id': 3.5037308,
     'loss_para': 0.61766475,
     'loss_pq': 0.004801019,
     'loss_segmentation_word': 0.51266253,
     'training_loss': 0.4112789}
train | step:    320 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00032680001,
     'loss_inst_dist': 2.1451507,
     'loss_mask_id': 3.5037308,
     'loss_para': 0.61766475,
     'loss_pq': 0.004801019,
     'loss_segmentation_word': 0.51266253,
     'training_loss': 0.4112789}
I0912 23:53:44.713991 140146006726464 controller.py:466] train | step:    340 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00034660002,
     'loss_inst_dist': 2.0532985,
     'loss_mask_id': 3.2056527,
     'loss_para': 0.62935,
     'loss_pq': 0.003703497,
     'loss_segmentation_word': 0.5110657,
     'training_loss': 0.40064317}
train | step:    340 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00034660002,
     'loss_inst_dist': 2.0532985,
     'loss_mask_id': 3.2056527,
     'loss_para': 0.62935,
     'loss_pq': 0.003703497,
     'loss_segmentation_word': 0.5110657,
     'training_loss': 0.40064317}
I0913 00:07:16.606030 140146006726464 controller.py:466] train | step:    360 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00036640003,
     'loss_inst_dist': 2.0203195,
     'loss_mask_id': 2.9283638,
     'loss_para': 0.6039487,
     'loss_pq': 0.0027603046,
     'loss_segmentation_word': 0.48058254,
     'training_loss': 0.38917798}
train | step:    360 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00036640003,
     'loss_inst_dist': 2.0203195,
     'loss_mask_id': 2.9283638,
     'loss_para': 0.6039487,
     'loss_pq': 0.0027603046,
     'loss_segmentation_word': 0.48058254,
     'training_loss': 0.38917798}
I0913 00:26:37.299056 140146006726464 controller.py:466] train | step:    380 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00038620003,
     'loss_inst_dist': 1.9350437,
     'loss_mask_id': 2.57786,
     'loss_para': 0.60475326,
     'loss_pq': 0.0019505906,
     'loss_segmentation_word': 0.4727227,
     'training_loss': 0.3773287}
train | step:    380 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00038620003,
     'loss_inst_dist': 1.9350437,
     'loss_mask_id': 2.57786,
     'loss_para': 0.60475326,
     'loss_pq': 0.0019505906,
     'loss_segmentation_word': 0.4727227,
     'training_loss': 0.3773287}
I0913 00:47:26.199129 140146006726464 controller.py:466] train | step:    400 | steps/sec:    0.0 | output:
    {'learning_rate': 0.00040600004,
     'loss_inst_dist': 1.9235398,
     'loss_mask_id': 2.33565,
     'loss_para': 0.6091674,
     'loss_pq': 0.0013025634,
     'loss_segmentation_word': 0.4900714,
     'training_loss': 0.37836498}
train | step:    400 | steps/sec:    0.0 | output:
    {'learning_rate'
google-research-datasets / hiertext

Training is slow #6