aws-samples / amazon-sagemaker-tensorflow-object-detection-api

Train and deploy models using TensorFlow 2 with the Object Detection API on Amazon SageMaker
MIT No Attribution
42 stars 34 forks source link

Not able to train the model in a different region using ml.g4dn.2xlarge for training #35

Open rhitwijweldxit opened 1 year ago

rhitwijweldxit commented 1 year ago

INFO:sagemaker:Creating training-job with name: tf2-object-detection-2023-04-15-10-29-55-436 2023-04-15 10:29:57 Starting - Starting the training job... 2023-04-15 10:30:13 Starting - Preparing the instances for training... 2023-04-15 10:30:55 Downloading - Downloading input data... 2023-04-15 10:31:20 Training - Downloading the training image......... 2023-04-15 10:32:51 Training - Training image download completed. Training in progress....2023-04-15 10:33:19,147 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed) 2023-04-15 10:33:19,183 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed) 2023-04-15 10:33:19,220 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed) 2023-04-15 10:33:19,232 sagemaker-training-toolkit INFO Invoking user script Training Env: { "additional_framework_parameters": {}, "channel_input_dirs": { "train": "/opt/ml/input/data/train" }, "current_host": "algo-1", "current_instance_group": "homogeneousCluster", "current_instance_group_hosts": [ "algo-1" ], "current_instance_type": "ml.g4dn.2xlarge", "distribution_hosts": [], "distribution_instance_groups": [], "framework_module": null, "hosts": [ "algo-1" ], "hyperparameters": { "model_dir": "/opt/training", "num_train_steps": "1000", "pipeline_config_path": "pipeline.config", "sample_1_of_n_eval_examples": "1" }, "input_config_dir": "/opt/ml/input/config", "input_data_config": { "train": { "TrainingInputMode": "File", "S3DistributionType": "FullyReplicated", "RecordWrapperType": "None" } }, "input_dir": "/opt/ml/input", "instance_groups": [ "homogeneousCluster" ], "instance_groups_dict": { "homogeneousCluster": { "instance_group_name": "homogeneousCluster", "instance_type": "ml.g4dn.2xlarge", "hosts": [ "algo-1" ] } }, "is_hetero": false, "is_master": true, "is_modelparallel_enabled": null, "is_smddpmprun_installed": false, "job_name": "tf2-object-detection-2023-04-15-10-29-55-436", "log_level": 20, 
"master_hostname": "algo-1", "model_dir": "/opt/ml/model", "module_dir": "s3://sagemaker-ap-south-1-657101763531/tf2-object-detection-2023-04-15-10-29-55-436/source/sourcedir.tar.gz", "module_name": "run_training.sh", "network_interface_name": "eth0", "num_cpus": 8, "num_gpus": 1, "num_neurons": 0, "output_data_dir": "/opt/ml/output/data", "output_dir": "/opt/ml/output", "output_intermediate_dir": "/opt/ml/output/intermediate", "resource_config": { "current_host": "algo-1", "current_instance_type": "ml.g4dn.2xlarge", "current_group_name": "homogeneousCluster", "hosts": [ "algo-1" ], "instance_groups": [ { "instance_group_name": "homogeneousCluster", "instance_type": "ml.g4dn.2xlarge", "hosts": [ "algo-1" ] } ], "network_interface_name": "eth0" }, "user_entry_point": "run_training.sh" } Environment variables: SM_HOSTS=["algo-1"] SM_NETWORK_INTERFACE_NAME=eth0 SM_HPS={"model_dir":"/opt/training","num_train_steps":"1000","pipeline_config_path":"pipeline.config","sample_1_of_n_eval_examples":"1"} SM_USER_ENTRY_POINT=run_training.sh SM_FRAMEWORK_PARAMS={} SM_RESOURCE_CONFIG={"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g4dn.2xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}],"network_interface_name":"eth0"} SM_INPUT_DATA_CONFIG={"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}} SM_OUTPUT_DATA_DIR=/opt/ml/output/data SM_CHANNELS=["train"] SM_CURRENT_HOST=algo-1 SM_CURRENT_INSTANCE_TYPE=ml.g4dn.2xlarge SM_CURRENT_INSTANCE_GROUP=homogeneousCluster SM_CURRENT_INSTANCE_GROUP_HOSTS=["algo-1"] SM_INSTANCE_GROUPS=["homogeneousCluster"] SM_INSTANCE_GROUPS_DICT={"homogeneousCluster":{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}} SM_DISTRIBUTION_INSTANCE_GROUPS=[] SM_IS_HETERO=false SM_MODULE_NAME=run_training.sh SM_LOG_LEVEL=20 
SM_FRAMEWORK_MODULE= SM_INPUT_DIR=/opt/ml/input SM_INPUT_CONFIG_DIR=/opt/ml/input/config SM_OUTPUT_DIR=/opt/ml/output SM_NUM_CPUS=8 SM_NUM_GPUS=1 SM_NUM_NEURONS=0 SM_MODEL_DIR=/opt/ml/model SM_MODULE_DIR=s3://sagemaker-ap-south-1-657101763531/tf2-object-detection-2023-04-15-10-29-55-436/source/sourcedir.tar.gz SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"train":"/opt/ml/input/data/train"},"current_host":"algo-1","current_instance_group":"homogeneousCluster","current_instance_group_hosts":["algo-1"],"current_instance_type":"ml.g4dn.2xlarge","distribution_hosts":[],"distribution_instance_groups":[],"framework_module":null,"hosts":["algo-1"],"hyperparameters":{"model_dir":"/opt/training","num_train_steps":"1000","pipeline_config_path":"pipeline.config","sample_1_of_n_eval_examples":"1"},"input_config_dir":"/opt/ml/input/config","input_data_config":{"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","instance_groups":["homogeneousCluster"],"instance_groups_dict":{"homogeneousCluster":{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}},"is_hetero":false,"is_master":true,"is_modelparallel_enabled":null,"is_smddpmprun_installed":false,"job_name":"tf2-object-detection-2023-04-15-10-29-55-436","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-ap-south-1-657101763531/tf2-object-detection-2023-04-15-10-29-55-436/source/sourcedir.tar.gz","module_name":"run_training.sh","network_interface_name":"eth0","num_cpus":8,"num_gpus":1,"num_neurons":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g4dn.2xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"hom
ogeneousCluster","instance_type":"ml.g4dn.2xlarge"}],"network_interface_name":"eth0"},"user_entry_point":"run_training.sh"} SM_USER_ARGS=["--model_dir","/opt/training","--num_train_steps","1000","--pipeline_config_path","pipeline.config","--sample_1_of_n_eval_examples","1"] SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate SM_CHANNEL_TRAIN=/opt/ml/input/data/train SM_HP_MODEL_DIR=/opt/training SM_HP_NUM_TRAIN_STEPS=1000 SM_HP_PIPELINE_CONFIG_PATH=pipeline.config SM_HP_SAMPLE_1_OF_N_EVAL_EXAMPLES=1 PYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/lib/python38.zip:/usr/lib/python3.8:/usr/lib/python3.8/lib-dynload:/usr/local/lib/python3.8/dist-packages:/usr/lib/python3/dist-packages Invoking script with the following command: /bin/sh -c "./run_training.sh --model_dir /opt/training --num_train_steps 1000 --pipeline_config_path pipeline.config --sample_1_of_n_eval_examples 1" 2023-04-15 10:33:19,232 sagemaker-training-toolkit INFO Exceptions not imported for SageMaker Debugger as it is not installed. ===TRAINING THE MODEL== /usr/local/lib/python3.8/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning: TensorFlow Addons (TFA) has ended development and introduction of new features. TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024. Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 
For more information see: https://github.com/tensorflow/addons/issues/2807 warnings.warn( INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',) I0415 10:33:26.837829 140166345439040 mirrored_strategy.py:374] Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',) INFO:tensorflow:Maybe overwriting train_steps: 1000 I0415 10:33:26.841578 140166345439040 config_util.py:552] Maybe overwriting train_steps: 1000 INFO:tensorflow:Maybe overwriting use_bfloat16: False I0415 10:33:26.841733 140166345439040 config_util.py:552] Maybe overwriting use_bfloat16: False I0415 10:33:26.853476 140166345439040 ssd_efficientnet_bifpn_feature_extractor.py:150] EfficientDet EfficientNet backbone version: efficientnet-b1 I0415 10:33:26.853606 140166345439040 ssd_efficientnet_bifpn_feature_extractor.py:152] EfficientDet BiFPN num filters: 88 I0415 10:33:26.853662 140166345439040 ssd_efficientnet_bifpn_feature_extractor.py:153] EfficientDet BiFPN num iterations: 4 I0415 10:33:26.857728 140166345439040 efficientnet_model.py:143] round_filter input=32 output=32 INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). I0415 10:33:26.903624 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). I0415 10:33:26.907572 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). 
I0415 10:33:26.914127 140166345439040 efficientnet_model.py:143] round_filter input=32 output=32 I0415 10:33:26.914241 140166345439040 efficientnet_model.py:143] round_filter input=16 output=16 INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). I0415 10:33:26.936754 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). I0415 10:33:26.939711 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). I0415 10:33:27.006097 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). I0415 10:33:27.009125 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). I0415 10:33:27.034329 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). 
I0415 10:33:27.037334 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). I0415 10:33:27.100979 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). I0415 10:33:27.103941 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',). I0415 10:33:27.114144 140166345439040 efficientnet_model.py:143] round_filter input=16 output=16 I0415 10:33:27.114249 140166345439040 efficientnet_model.py:143] round_filter input=24 output=24 I0415 10:33:27.485301 140166345439040 efficientnet_model.py:143] round_filter input=24 output=24 I0415 10:33:27.485446 140166345439040 efficientnet_model.py:143] round_filter input=40 output=40 I0415 10:33:27.854993 140166345439040 efficientnet_model.py:143] round_filter input=40 output=40 I0415 10:33:27.855136 140166345439040 efficientnet_model.py:143] round_filter input=80 output=80 I0415 10:33:28.334065 140166345439040 efficientnet_model.py:143] round_filter input=80 output=80 I0415 10:33:28.334210 140166345439040 efficientnet_model.py:143] round_filter input=112 output=112 I0415 10:33:28.843916 140166345439040 efficientnet_model.py:143] round_filter input=112 output=112 I0415 10:33:28.844068 140166345439040 efficientnet_model.py:143] round_filter input=192 output=192 I0415 10:33:29.612781 140166345439040 efficientnet_model.py:143] round_filter input=192 output=192 I0415 10:33:29.612927 140166345439040 efficientnet_model.py:143] round_filter 
input=320 output=320 I0415 10:33:29.859471 140166345439040 efficientnet_model.py:143] round_filter input=1280 output=1280 I0415 10:33:29.914009 140166345439040 efficientnet_model.py:453] Building model efficientnet with params ModelConfig(width_coefficient=1.0, depth_coefficient=1.1, resolution=240, dropout_rate=0.2, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32') WARNING:tensorflow:From 
/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py:563: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version. Instructions for updating: rename to distribute_datasets_from_function W0415 10:33:29.959440 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py:563: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version. Instructions for updating: rename to distribute_datasets_from_function INFO:tensorflow:Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records'] I0415 10:33:30.210091 140166345439040 dataset_builder.py:162] Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records'] INFO:tensorflow:Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records'] I0415 10:33:30.228283 140166345439040 dataset_builder.py:79] Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records'] INFO:tensorflow:Number of filenames to read: 1 I0415 10:33:30.228426 140166345439040 dataset_builder.py:80] Number of filenames to read: 1 WARNING:tensorflow:num_readers has been reduced to 1 to match input file shards. W0415 10:33:30.228483 140166345439040 dataset_builder.py:86] num_readers has been reduced to 1 to match input file shards. WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version. 
Instructions for updating: Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic. W0415 10:33:30.238938 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic. WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.data.Dataset.map() W0415 10:33:30.255320 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version. Instructions for updating: Usetf.data.Dataset.map() WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version. Instructions for updating: Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead. W0415 10:33:36.161500 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version. 
Instructions for updating: Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead. WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.cast instead. W0415 10:33:39.770519 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.cast instead. WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead. W0415 10:33:41.886407 140166345439040 module_wrapper.py:149] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead. /usr/local/lib/python3.8/dist-packages/keras/backend.py:452: UserWarning: tf.keras.backend.set_learning_phase is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the training argument of the __call__ method of your layer or model. 
warnings.warn( I0415 10:33:56.750935 140143522457344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)] I0415 10:34:07.238219 140143522457344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)] Traceback (most recent call last): File "model_main_tf2.py", line 114, in tf.compat.v1.app.run() File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/platform/app.py", line 36, in run _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef) File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 308, in run _run_main(main, args) File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 254, in _run_main sys.exit(main(argv)) File "model_main_tf2.py", line 105, in main model_lib_v2.train_loop( File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 605, in train_loop load_fine_tune_checkpoint( File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 401, in load_fine_tune_checkpoint _ensure_model_is_built(model, input_dataset, unpad_groundtruth_tensors) File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 176, in _ensure_model_is_built strategy.run( File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/distribute_lib.py", line 1316, in run return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs) File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/distribute_lib.py", line 2895, in call_for_each_replica return self._call_for_each_replica(fn, args, kwargs) File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/mirrored_strategy.py", line 696, in _call_for_each_replica return mirrored_run.call_for_each_replica( File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/mirrored_run.py", line 84, in call_for_each_replica return wrapped(*args, kwargs) File 
"/usr/local/lib/python3.8/dist-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler raise e.with_traceback(filtered_tb) from None File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/execute.py", line 52, in quick_execute tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name, tensorflow.python.framework.errors_impl.UnimplementedError: Graph execution error: Detected at node 'EfficientDet-D1/model/stem_conv2d/Conv2D' defined at (most recent call last): File "/usr/lib/python3.8/threading.py", line 890, in _bootstrap self._bootstrap_inner() File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner self.run() File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 171, in _dummy_computation_fn return _compute_losses_and_predictions_dicts(model, features, labels, File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 124, in _compute_losses_and_predictions_dicts prediction_dict = model.predict( File "/usr/local/lib/python3.8/dist-packages/object_detection/meta_architectures/ssd_meta_arch.py", line 570, in predict if self._feature_extractor.is_keras_model: File "/usr/local/lib/python3.8/dist-packages/object_detection/meta_architectures/ssd_meta_arch.py", line 571, in predict feature_maps = self._feature_extractor(preprocessed_inputs) File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler return fn(*args, *kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 558, in call return super().call(args, kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler return fn(*args, kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1145, in call outputs = call_fn(inputs, *args, *kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler 
return fn(args, kwargs) File "/usr/local/lib/python3.8/dist-packages/object_detection/meta_architectures/ssd_meta_arch.py", line 252, in call return self._extract_features(inputs) File "/usr/local/lib/python3.8/dist-packages/object_detection/models/ssd_efficientnet_bifpn_feature_extractor.py", line 234, in _extract_features base_feature_maps = self._efficientnet( File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler return fn(*args, kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 558, in call return super().call(*args, *kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler return fn(args, kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1145, in call outputs = call_fn(inputs, *args, kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler return fn(*args, *kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/engine/functional.py", line 512, in call return self._run_internal_graph(inputs, training=training, mask=mask) File "/usr/local/lib/python3.8/dist-packages/keras/engine/functional.py", line 669, in _run_internal_graph outputs = node.layer(args, kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler return fn(*args, kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1145, in call outputs = call_fn(inputs, *args, *kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler return fn(args, kwargs) File "/usr/local/lib/python3.8/dist-packages/keras/layers/convolutional/base_conv.py", line 290, in call outputs = self.convolution_op(inputs, self.kernel) File "/usr/local/lib/python3.8/dist-packages/keras/layers/convolutional/base_conv.py", line 262, in convolution_op return 
tf.nn.convolution( Node: 'EfficientDet-D1/model/stem_conv2d/Conv2D' DNN library is not found.

[[{{node EfficientDet-D1/model/stem_conv2d/Conv2D}}]] [Op:__inference__dummy_computation_fn_30818]

==EVALUATING THE MODEL== /usr/local/lib/python3.8/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning: TensorFlow Addons (TFA) has ended development and introduction of new features. TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024. Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). For more information see: https://github.com/tensorflow/addons/issues/2807 warnings.warn( WARNING:tensorflow:Forced number of epochs for all eval validations to be 1. W0415 10:34:20.197691 139818454525760 model_lib_v2.py:1089] Forced number of epochs for all eval validations to be 1. INFO:tensorflow:Maybe overwriting sample_1_of_n_eval_examples: None I0415 10:34:20.197891 139818454525760 config_util.py:552] Maybe overwriting sample_1_of_n_eval_examples: None INFO:tensorflow:Maybe overwriting use_bfloat16: False I0415 10:34:20.197963 139818454525760 config_util.py:552] Maybe overwriting use_bfloat16: False INFO:tensorflow:Maybe overwriting eval_num_epochs: 1 I0415 10:34:20.198034 139818454525760 config_util.py:552] Maybe overwriting eval_num_epochs: 1 WARNING:tensorflow:Expected number of evaluation epochs is 1, but instead encountered eval_on_train_input_config.num_epochs = 0. Overwriting num_epochs to 1. W0415 10:34:20.198128 139818454525760 model_lib_v2.py:1106] Expected number of evaluation epochs is 1, but instead encountered eval_on_train_input_config.num_epochs = 0. Overwriting num_epochs to 1. 
I0415 10:34:21.334374 139818454525760 ssd_efficientnet_bifpn_feature_extractor.py:150] EfficientDet EfficientNet backbone version: efficientnet-b1 I0415 10:34:21.334518 139818454525760 ssd_efficientnet_bifpn_feature_extractor.py:152] EfficientDet BiFPN num filters: 88 I0415 10:34:21.334567 139818454525760 ssd_efficientnet_bifpn_feature_extractor.py:153] EfficientDet BiFPN num iterations: 4 I0415 10:34:21.338575 139818454525760 efficientnet_model.py:143] round_filter input=32 output=32 I0415 10:34:21.377283 139818454525760 efficientnet_model.py:143] round_filter input=32 output=32 I0415 10:34:21.377415 139818454525760 efficientnet_model.py:143] round_filter input=16 output=16 I0415 10:34:21.544433 139818454525760 efficientnet_model.py:143] round_filter input=16 output=16 I0415 10:34:21.544581 139818454525760 efficientnet_model.py:143] round_filter input=24 output=24 I0415 10:34:21.815359 139818454525760 efficientnet_model.py:143] round_filter input=24 output=24 I0415 10:34:21.815515 139818454525760 efficientnet_model.py:143] round_filter input=40 output=40 I0415 10:34:22.077441 139818454525760 efficientnet_model.py:143] round_filter input=40 output=40 I0415 10:34:22.077582 139818454525760 efficientnet_model.py:143] round_filter input=80 output=80 I0415 10:34:22.423800 139818454525760 efficientnet_model.py:143] round_filter input=80 output=80 I0415 10:34:22.423947 139818454525760 efficientnet_model.py:143] round_filter input=112 output=112 I0415 10:34:22.765852 139818454525760 efficientnet_model.py:143] round_filter input=112 output=112 I0415 10:34:22.766001 139818454525760 efficientnet_model.py:143] round_filter input=192 output=192 I0415 10:34:23.356628 139818454525760 efficientnet_model.py:143] round_filter input=192 output=192 I0415 10:34:23.356780 139818454525760 efficientnet_model.py:143] round_filter input=320 output=320 I0415 10:34:23.542802 139818454525760 efficientnet_model.py:143] round_filter input=1280 output=1280 I0415 10:34:23.585031 139818454525760 
efficientnet_model.py:453] Building model efficientnet with params ModelConfig(width_coefficient=1.0, depth_coefficient=1.1, resolution=240, dropout_rate=0.2, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32') INFO:tensorflow:Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records'] I0415 10:34:23.855814 139818454525760 dataset_builder.py:162] Reading unweighted 
datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records'] INFO:tensorflow:Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records'] I0415 10:34:23.877571 139818454525760 dataset_builder.py:79] Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records'] INFO:tensorflow:Number of filenames to read: 1 I0415 10:34:23.877725 139818454525760 dataset_builder.py:80] Number of filenames to read: 1 WARNING:tensorflow:num_readers has been reduced to 1 to match input file shards. W0415 10:34:23.877783 139818454525760 dataset_builder.py:86] num_readers has been reduced to 1 to match input file shards. WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic. W0415 10:34:23.885428 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic. WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version. 
Instructions for updating: Use tf.data.Dataset.map() W0415 10:34:23.902438 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version. Instructions for updating: Usetf.data.Dataset.map() WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version. Instructions for updating: Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead. W0415 10:34:27.488649 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version. Instructions for updating: Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead. WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.cast instead. W0415 10:34:28.549533 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.cast instead. WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead. 
W0415 10:34:31.064076 139818454525760 module_wrapper.py:149] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead. INFO:tensorflow:Waiting for new checkpoint at /opt/training I0415 10:34:31.064445 139818454525760 checkpoint_utils.py:168] Waiting for new checkpoint at /opt/training INFO:tensorflow:Timed-out waiting for a checkpoint. I0415 10:34:40.073957 139818454525760 checkpoint_utils.py:231] Timed-out waiting for a checkpoint. ==EXPORTING THE MODEL== /usr/local/lib/python3.8/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning: TensorFlow Addons (TFA) has ended development and introduction of new features. TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024. Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 
For more information see: https://github.com/tensorflow/addons/issues/2807 warnings.warn( I0415 10:34:45.487078 140492014913344 ssd_efficientnet_bifpn_feature_extractor.py:150] EfficientDet EfficientNet backbone version: efficientnet-b1 I0415 10:34:45.487244 140492014913344 ssd_efficientnet_bifpn_feature_extractor.py:152] EfficientDet BiFPN num filters: 88 I0415 10:34:45.487298 140492014913344 ssd_efficientnet_bifpn_feature_extractor.py:153] EfficientDet BiFPN num iterations: 4 I0415 10:34:45.491338 140492014913344 efficientnet_model.py:143] round_filter input=32 output=32 I0415 10:34:45.531586 140492014913344 efficientnet_model.py:143] round_filter input=32 output=32 I0415 10:34:45.531744 140492014913344 efficientnet_model.py:143] round_filter input=16 output=16 I0415 10:34:45.684208 140492014913344 efficientnet_model.py:143] round_filter input=16 output=16 I0415 10:34:45.684357 140492014913344 efficientnet_model.py:143] round_filter input=24 output=24 I0415 10:34:45.971665 140492014913344 efficientnet_model.py:143] round_filter input=24 output=24 I0415 10:34:45.971818 140492014913344 efficientnet_model.py:143] round_filter input=40 output=40 I0415 10:34:46.260079 140492014913344 efficientnet_model.py:143] round_filter input=40 output=40 I0415 10:34:46.260246 140492014913344 efficientnet_model.py:143] round_filter input=80 output=80 I0415 10:34:46.636266 140492014913344 efficientnet_model.py:143] round_filter input=80 output=80 I0415 10:34:46.636410 140492014913344 efficientnet_model.py:143] round_filter input=112 output=112 I0415 10:34:47.010365 140492014913344 efficientnet_model.py:143] round_filter input=112 output=112 I0415 10:34:47.010508 140492014913344 efficientnet_model.py:143] round_filter input=192 output=192 I0415 10:34:47.476083 140492014913344 efficientnet_model.py:143] round_filter input=192 output=192 I0415 10:34:47.476248 140492014913344 efficientnet_model.py:143] round_filter input=320 output=320 I0415 10:34:47.672554 140492014913344 
efficientnet_model.py:143] round_filter input=1280 output=1280 I0415 10:34:47.720135 140492014913344 efficientnet_model.py:453] Building model efficientnet with params ModelConfig(width_coefficient=1.0, depth_coefficient=1.1, resolution=240, dropout_rate=0.2, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32') WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/impl/api.py:458: 
calling map_fn_v2 (from tensorflow.python.ops.map_fn) with back_prop=False is deprecated and will be removed in a future version. Instructions for updating: back_prop=False is deprecated. Consider using tf.stop_gradient instead. Instead of: results = tf.map_fn(fn, elems, back_prop=False) Use: results = tf.nest.map_structure(tf.stop_gradient, tf.map_fn(fn, elems)) W0415 10:34:48.031827 140492014913344 deprecation.py:641] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/impl/api.py:458: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with back_prop=False is deprecated and will be removed in a future version. Instructions for updating: back_prop=False is deprecated. Consider using tf.stop_gradient instead. Instead of: results = tf.map_fn(fn, elems, back_prop=False) Use: results = tf.nest.map_structure(tf.stop_gradient, tf.map_fn(fn, elems)) I0415 10:34:52.011857 140492014913344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)] I0415 10:35:01.855062 140492014913344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)] Traceback (most recent call last): File "exporter_main_v2.py", line 164, in app.run(main) File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 308, in run _run_main(main, args) File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 254, in _run_main sys.exit(main(argv)) File "exporter_main_v2.py", line 157, in main exporter_lib_v2.export_inference_graph( File "/usr/local/lib/python3.8/dist-packages/object_detection/exporter_lib_v2.py", line 271, in export_inference_graph status.assert_existing_objects_matched() File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/checkpoint/checkpoint.py", line 955, in assert_existing_objects_matched raise AssertionError( AssertionError: No checkpoint specified (save_path=None); nothing is being restored. 
mv: cannot stat '/tmp/exported/saved_model': No such file or directory 2023-04-15 10:35:05,585 sagemaker-training-toolkit ERROR Reporting training FAILURE 2023-04-15 10:35:05,585 sagemaker-training-toolkit ERROR ExecuteUserScriptError: ExitCode 1 ErrorMessage "" Command "/bin/sh -c ./run_training.sh --model_dir /opt/training --num_train_steps 1000 --pipeline_config_path pipeline.config --sample_1_of_n_eval_examples 1" 2023-04-15 10:35:05,585 sagemaker-training-toolkit ERROR Encountered exit_code 1

2023-04-15 10:35:22 Uploading - Uploading generated training model 2023-04-15 10:35:22 Failed - Training job failed

UnexpectedStatusException Traceback (most recent call last) /tmp/ipykernel_10180/1459479382.py in <cell line: 1>() ----> 1 estimator.fit(inputs)

~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/workflow/pipeline_context.py in wrapper(*args, **kwargs) 270 return _StepArguments(retrieve_caller_name(self_instance), run_func, *args, **kwargs) 271 --> 272 return run_func(*args, **kwargs) 273 274 return wrapper

~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config) 1161 self.jobs.append(self.latest_training_job) 1162 if wait: -> 1163 self.latest_training_job.wait(logs=logs) 1164 1165 def _compilation_job_name(self):

~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/estimator.py in wait(self, logs) 2309 # If logs are requested, call logs_for_jobs. 2310 if logs != "None": -> 2311 self.sagemaker_session.logs_for_job(self.job_name, wait=True, log_type=logs) 2312 else: 2313 self.sagemaker_session.wait_for_job(self.job_name)

~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/session.py in logs_for_job(self, job_name, wait, poll, log_type) 4174 4175 if wait: -> 4176 self._check_job_status(job_name, description, "TrainingJobStatus") 4177 if dot: 4178 print()

~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/session.py in _check_job_status(self, job, desc, status_key_name) 3705 actual_status=status, 3706 ) -> 3707 raise exceptions.UnexpectedStatusException( 3708 message=message, 3709 allowed_statuses=["Completed", "Stopped"],

UnexpectedStatusException: Error for Training job tf2-object-detection-2023-04-15-10-29-55-436: Failed. Reason: AlgorithmError: ExecuteUserScriptError: ExitCode 1 ErrorMessage "" Command "/bin/sh -c ./run_training.sh --model_dir /opt/training --num_train_steps 1000 --pipeline_config_path pipeline.config --sample_1_of_n_eval_examples 1", exit code: 1

I am not able to understand this error. Please help me figure it out.

QuanNguyenAUT commented 1 year ago

Hi team, I have the same issue. I ran it successfully before, but it no longer works. Thanks.

danilpolkov commented 1 year ago

Me too. I guess it is caused by this error: "Node: 'EfficientDet-D1/model/stem_conv2d/Conv2D' DNN library is not found."

I have the same issue with ssd_mobilenet.

I even tried building my own Docker image from a CUDA and cuDNN base image, but I got the same issue.