Open victormico opened 2 years ago
I am trying to run the tensorflow_script_mode_california_housing_local_training_and_serving.py file, following the instructions in the readme.md.
tensorflow_script_mode_california_housing_local_training_and_serving.py
I am running this on Ubuntu 20.04 using WSL.
With
python --version Python 3.8.10 pip --version pip 20.0.2
python --version Python 3.8.10
pip --version pip 20.0.2
However, I get the following error in the line tensorflow_serving_transformer.transform:
tensorflow_serving_transformer.transform
Training and evaluation datasets exist. Skipping Download Starting model training. Note: if launching for the first time in local mode, container image download might take a few minutes to complete. Creating vmvgprkqsv-algo-1-owkbo ... Creating vmvgprkqsv-algo-1-owkbo ... done Attaching to vmvgprkqsv-algo-1-owkbo vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:28.389076: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler. vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:28.389228: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped. vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:28.415723: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler. vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:30,302 sagemaker-training-toolkit INFO Imported framework sagemaker_tensorflow_container.training vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:30,312 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:30,345 botocore.credentials INFO Found credentials in environment variables. 
vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:31,715 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:31,736 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:31,758 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:31,771 sagemaker-training-toolkit INFO Invoking user script vmvgprkqsv-algo-1-owkbo | vmvgprkqsv-algo-1-owkbo | Training Env: vmvgprkqsv-algo-1-owkbo | vmvgprkqsv-algo-1-owkbo | { vmvgprkqsv-algo-1-owkbo | "additional_framework_parameters": {}, vmvgprkqsv-algo-1-owkbo | "channel_input_dirs": { vmvgprkqsv-algo-1-owkbo | "train": "/opt/ml/input/data/train", vmvgprkqsv-algo-1-owkbo | "test": "/opt/ml/input/data/test" vmvgprkqsv-algo-1-owkbo | }, vmvgprkqsv-algo-1-owkbo | "current_host": "algo-1-owkbo", vmvgprkqsv-algo-1-owkbo | "framework_module": "sagemaker_tensorflow_container.training:main", vmvgprkqsv-algo-1-owkbo | "hosts": [ vmvgprkqsv-algo-1-owkbo | "algo-1-owkbo" vmvgprkqsv-algo-1-owkbo | ], vmvgprkqsv-algo-1-owkbo | "hyperparameters": { vmvgprkqsv-algo-1-owkbo | "model_dir": "s3://sagemaker-eu-west-1-433829917051/tensorflow-training-2022-07-06-12-45-24-608/model" vmvgprkqsv-algo-1-owkbo | }, vmvgprkqsv-algo-1-owkbo | "input_config_dir": "/opt/ml/input/config", vmvgprkqsv-algo-1-owkbo | "input_data_config": { vmvgprkqsv-algo-1-owkbo | "train": { vmvgprkqsv-algo-1-owkbo | "TrainingInputMode": "File" vmvgprkqsv-algo-1-owkbo | }, vmvgprkqsv-algo-1-owkbo | "test": { vmvgprkqsv-algo-1-owkbo | "TrainingInputMode": "File" vmvgprkqsv-algo-1-owkbo | } vmvgprkqsv-algo-1-owkbo | }, vmvgprkqsv-algo-1-owkbo | "input_dir": "/opt/ml/input", vmvgprkqsv-algo-1-owkbo | "is_master": true, vmvgprkqsv-algo-1-owkbo | "is_modelparallel_enabled": null, vmvgprkqsv-algo-1-owkbo | "job_name": "tensorflow-training-2022-07-06-12-45-24-608", vmvgprkqsv-algo-1-owkbo | 
"log_level": 20, vmvgprkqsv-algo-1-owkbo | "master_hostname": "algo-1-owkbo", vmvgprkqsv-algo-1-owkbo | "model_dir": "/opt/ml/model", vmvgprkqsv-algo-1-owkbo | "module_dir": "s3://sagemaker-eu-west-1-433829917051/tensorflow-training-2022-07-06-12-45-24-608/source/sourcedir.tar.gz", vmvgprkqsv-algo-1-owkbo | "module_name": "california_housing_tf2", vmvgprkqsv-algo-1-owkbo | "network_interface_name": "eth0", vmvgprkqsv-algo-1-owkbo | "num_cpus": 12, vmvgprkqsv-algo-1-owkbo | "num_gpus": 0, vmvgprkqsv-algo-1-owkbo | "output_data_dir": "/opt/ml/output/data", vmvgprkqsv-algo-1-owkbo | "output_dir": "/opt/ml/output", vmvgprkqsv-algo-1-owkbo | "output_intermediate_dir": "/opt/ml/output/intermediate", vmvgprkqsv-algo-1-owkbo | "resource_config": { vmvgprkqsv-algo-1-owkbo | "current_host": "algo-1-owkbo", vmvgprkqsv-algo-1-owkbo | "hosts": [ vmvgprkqsv-algo-1-owkbo | "algo-1-owkbo" vmvgprkqsv-algo-1-owkbo | ] vmvgprkqsv-algo-1-owkbo | }, vmvgprkqsv-algo-1-owkbo | "user_entry_point": "california_housing_tf2.py" vmvgprkqsv-algo-1-owkbo | } vmvgprkqsv-algo-1-owkbo | vmvgprkqsv-algo-1-owkbo | Environment variables: vmvgprkqsv-algo-1-owkbo | vmvgprkqsv-algo-1-owkbo | SM_HOSTS=["algo-1-owkbo"] vmvgprkqsv-algo-1-owkbo | SM_NETWORK_INTERFACE_NAME=eth0 vmvgprkqsv-algo-1-owkbo | SM_HPS={"model_dir":"s3://sagemaker-eu-west-1-433829917051/tensorflow-training-2022-07-06-12-45-24-608/model"} vmvgprkqsv-algo-1-owkbo | SM_USER_ENTRY_POINT=california_housing_tf2.py vmvgprkqsv-algo-1-owkbo | SM_FRAMEWORK_PARAMS={} vmvgprkqsv-algo-1-owkbo | SM_RESOURCE_CONFIG={"current_host":"algo-1-owkbo","hosts":["algo-1-owkbo"]} vmvgprkqsv-algo-1-owkbo | SM_INPUT_DATA_CONFIG={"test":{"TrainingInputMode":"File"},"train":{"TrainingInputMode":"File"}} vmvgprkqsv-algo-1-owkbo | SM_OUTPUT_DATA_DIR=/opt/ml/output/data vmvgprkqsv-algo-1-owkbo | SM_CHANNELS=["test","train"] vmvgprkqsv-algo-1-owkbo | SM_CURRENT_HOST=algo-1-owkbo vmvgprkqsv-algo-1-owkbo | SM_MODULE_NAME=california_housing_tf2 vmvgprkqsv-algo-1-owkbo 
| SM_LOG_LEVEL=20 vmvgprkqsv-algo-1-owkbo | SM_FRAMEWORK_MODULE=sagemaker_tensorflow_container.training:main vmvgprkqsv-algo-1-owkbo | SM_INPUT_DIR=/opt/ml/input vmvgprkqsv-algo-1-owkbo | SM_INPUT_CONFIG_DIR=/opt/ml/input/config vmvgprkqsv-algo-1-owkbo | SM_OUTPUT_DIR=/opt/ml/output vmvgprkqsv-algo-1-owkbo | SM_NUM_CPUS=12 vmvgprkqsv-algo-1-owkbo | SM_NUM_GPUS=0 vmvgprkqsv-algo-1-owkbo | SM_MODEL_DIR=/opt/ml/model vmvgprkqsv-algo-1-owkbo | SM_MODULE_DIR=s3://sagemaker-eu-west-1-433829917051/tensorflow-training-2022-07-06-12-45-24-608/source/sourcedir.tar.gz vmvgprkqsv-algo-1-owkbo | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"test":"/opt/ml/input/data/test","train":"/opt/ml/input/data/train"},"current_host":"algo-1-owkbo","framework_module":"sagemaker_tensorflow_container.training:main","hosts":["algo-1-owkbo"],"hyperparameters":{"model_dir":"s3://sagemaker-eu-west-1-433829917051/tensorflow-training-2022-07-06-12-45-24-608/model"},"input_config_dir":"/opt/ml/input/config","input_data_config":{"test":{"TrainingInputMode":"File"},"train":{"TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":true,"is_modelparallel_enabled":null,"job_name":"tensorflow-training-2022-07-06-12-45-24-608","log_level":20,"master_hostname":"algo-1-owkbo","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-eu-west-1-433829917051/tensorflow-training-2022-07-06-12-45-24-608/source/sourcedir.tar.gz","module_name":"california_housing_tf2","network_interface_name":"eth0","num_cpus":12,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1-owkbo","hosts":["algo-1-owkbo"]},"user_entry_point":"california_housing_tf2.py"} vmvgprkqsv-algo-1-owkbo | SM_USER_ARGS=["--model_dir","s3://sagemaker-eu-west-1-433829917051/tensorflow-training-2022-07-06-12-45-24-608/model"] vmvgprkqsv-algo-1-owkbo | 
SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate vmvgprkqsv-algo-1-owkbo | SM_CHANNEL_TRAIN=/opt/ml/input/data/train vmvgprkqsv-algo-1-owkbo | SM_CHANNEL_TEST=/opt/ml/input/data/test vmvgprkqsv-algo-1-owkbo | SM_HP_MODEL_DIR=s3://sagemaker-eu-west-1-433829917051/tensorflow-training-2022-07-06-12-45-24-608/model vmvgprkqsv-algo-1-owkbo | PYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/local/lib/python39.zip:/usr/local/lib/python3.9:/usr/local/lib/python3.9/lib-dynload:/usr/local/lib/python3.9/site-packages:/usr/local/lib/python3.9/site-packages/smdebug-1.0.14b20220624-py3.9.egg:/usr/local/lib/python3.9/site-packages/pyinstrument-3.4.2-py3.9.egg:/usr/local/lib/python3.9/site-packages/pyinstrument_cext-0.2.4-py3.9-linux-x86_64.egg vmvgprkqsv-algo-1-owkbo | vmvgprkqsv-algo-1-owkbo | Invoking script with the following command: vmvgprkqsv-algo-1-owkbo | vmvgprkqsv-algo-1-owkbo | /usr/local/bin/python3.9 california_housing_tf2.py --model_dir s3://sagemaker-eu-west-1-433829917051/tensorflow-training-2022-07-06-12-45-24-608/model vmvgprkqsv-algo-1-owkbo | vmvgprkqsv-algo-1-owkbo | vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:32.483260: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler. vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:32.483393: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped. vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:32.511418: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler. 
vmvgprkqsv-algo-1-owkbo | Training data location: /opt/ml/input/data/train vmvgprkqsv-algo-1-owkbo | Test data location: /opt/ml/input/data/test vmvgprkqsv-algo-1-owkbo | x train (13827, 8) y train (13827, 1) vmvgprkqsv-algo-1-owkbo | x test (6811, 8) y test (6811, 1) vmvgprkqsv-algo-1-owkbo | batch_size = 64, epochs = 1, learning rate = 0.1 vmvgprkqsv-algo-1-owkbo | Extension horovod.torch has not been built: /usr/local/lib/python3.9/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-39-x86_64-linux-gnu.so not found vmvgprkqsv-algo-1-owkbo | If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error. vmvgprkqsv-algo-1-owkbo | Warning! MPI libs are missing, but python applications are still avaiable. vmvgprkqsv-algo-1-owkbo | [2022-07-06 12:45:34.423 05b1aaa32ec8:44 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None vmvgprkqsv-algo-1-owkbo | /usr/local/lib/python3.9/site-packages/smdebug-1.0.14b20220624-py3.9.egg/smdebug/profiler/system_metrics_reader.py:63: SyntaxWarning: "is not" with a literal. Did you mean "!="? vmvgprkqsv-algo-1-owkbo | /usr/local/lib/python3.9/site-packages/smdebug-1.0.14b20220624-py3.9.egg/smdebug/profiler/system_metrics_reader.py:63: SyntaxWarning: "is not" with a literal. Did you mean "!="? vmvgprkqsv-algo-1-owkbo | [2022-07-06 12:45:34.655 05b1aaa32ec8:44 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled. vmvgprkqsv-algo-1-owkbo | 1/217 [..............................] - ETA: 2:28 - loss: 3.5547 16/217 [=>............................] - ETA: 0s - loss: 1.4032 35/217 [===>..........................] - ETA: 0s - loss: 1.0644 53/217 [======>.......................] - ETA: 0s - loss: 0.9143 73/217 [=========>....................] - ETA: 0s - loss: 0.8041 92/217 [===========>..................] - ETA: 0s - loss: 0.7364 115/217 [==============>...............] - ETA: 0s - loss: 0.6733 131/217 [=================>............] 
- ETA: 0s - loss: 0.6425 150/217 [===================>..........] - ETA: 0s - loss: 0.6107 169/217 [======================>.......] - ETA: 0s - loss: 0.5877 187/217 [========================>.....] - ETA: 0s - loss: 0.5700 203/217 [===========================>..] - ETA: 0s - loss: 0.5574 217/217 [==============================] - 2s 4ms/step - loss: 0.5460 - val_loss: 0.4188 vmvgprkqsv-algo-1-owkbo | 107/107 - 0s - loss: 0.4188 - 176ms/epoch - 2ms/step vmvgprkqsv-algo-1-owkbo | Test MSE : 0.41876718401908875 vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:36.451985: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them. vmvgprkqsv-algo-1-owkbo | INFO:tensorflow:Assets written to: /opt/ml/model/1/assets vmvgprkqsv-algo-1-owkbo | INFO:tensorflow:Assets written to: /opt/ml/model/1/assets vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:37,095 sagemaker-training-toolkit INFO Waiting for the process to finish and give a return code. vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:37,095 sagemaker-training-toolkit INFO Done waiting for a return code. Received 0 from exiting process. vmvgprkqsv-algo-1-owkbo | 2022-07-06 12:45:37,096 sagemaker-training-toolkit INFO Reporting training SUCCESS vmvgprkqsv-algo-1-owkbo exited with code 0 Aborting on container exit... 
===== Job Complete ===== Completed model training Running Batch Transform in local mode Exception in thread Thread-1: Traceback (most recent call last): File "/home/victor/venvs/sagemaker/lib/python3.8/site-packages/sagemaker/local/image.py", line 852, in run _stream_output(self.process) File "/home/victor/venvs/sagemaker/lib/python3.8/site-packages/sagemaker/local/image.py", line 914, in _stream_output raise RuntimeError("Process exited with code: %s" % exit_code) RuntimeError: Process exited with code: 1 During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner self.run() File "/home/victor/venvs/sagemaker/lib/python3.8/site-packages/sagemaker/local/image.py", line 857, in run raise RuntimeError(msg) RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmpf2pz5wvj/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1
Any idea on how to deal with this issue?
Many thanks!
Hi @victormico, what is the SageMaker SDK version you are using?
I am using sagemaker 2.94.0.
I am trying to run the
tensorflow_script_mode_california_housing_local_training_and_serving.py
file, following the instructions in the readme.md. I am running this on Ubuntu 20.04 using WSL.
With
However, I get the following error in the line
tensorflow_serving_transformer.transform:
Any idea on how to deal with this issue?
Many thanks!