Describe the bug
When I run estimator.fit() in the "Local training: Fit" section, I get this error:
time="2023-10-11T09:42:45Z" level=warning msg="a network with name sagemaker-local exists but was not created by compose.\nSet `external: true` to use an existing network"
network sagemaker-local was found but has incorrect label com.docker.compose.network set to ""
To reproduce
Create a SageMaker notebook instance and run the cells in the example notebook up to the "Local training: Fit" section. FYI, I'm using the London region.
Logs
time="2023-10-11T09:42:45Z" level=warning msg="a network with name sagemaker-local exists but was not created by compose.\nSet `external: true` to use an existing network"
network sagemaker-local was found but has incorrect label com.docker.compose.network set to ""
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/local/image.py:255, in _SageMakerContainer.train(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
254 try:
--> 255 _stream_output(process)
256 except RuntimeError as e:
257 # _stream_output() doesn't have the command line. We will handle the exception
258 # which contains the exit code and append the command line to it.
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/local/image.py:928, in _stream_output(process)
927 if exit_code != 0:
--> 928 raise RuntimeError("Process exited with code: %s" % exit_code)
930 return exit_code
RuntimeError: Process exited with code: 1
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
Cell In[15], line 17
15 # launch training job
16 print(f"file://{training_file_path}")
---> 17 estimator.fit(f"file://{training_file_path}")
18 # estimator
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/workflow/pipeline_context.py:311, in runnable_by_pipeline.<locals>.wrapper(*args, **kwargs)
307 return context
309 return _StepArguments(retrieve_caller_name(self_instance), run_func, *args, **kwargs)
--> 311 return run_func(*args, **kwargs)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/estimator.py:1307, in EstimatorBase.fit(self, inputs, wait, logs, job_name, experiment_config)
1304 self._prepare_for_training(job_name=job_name)
1306 experiment_config = check_and_get_run_experiment_config(experiment_config)
-> 1307 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
1308 self.jobs.append(self.latest_training_job)
1309 if wait:
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/estimator.py:2361, in _TrainingJob.start_new(cls, estimator, inputs, experiment_config)
2336 """Create a new Amazon SageMaker training job from the estimator.
2337
2338 Args:
(...)
2357 all information about the started training job.
2358 """
2359 train_args = cls._get_train_args(estimator, inputs, experiment_config)
-> 2361 estimator.sagemaker_session.train(**train_args)
2363 return cls(estimator.sagemaker_session, estimator._current_job_name)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/session.py:892, in Session.train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image_uri, training_image_config, container_entry_point, container_arguments, algorithm_arn, encrypt_inter_container_traffic, use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics, profiler_rule_configs, profiler_config, environment, retry_strategy)
889 LOGGER.debug("train request: %s", json.dumps(request, indent=4))
890 self.sagemaker_client.create_training_job(**request)
--> 892 self._intercept_create_request(train_request, submit, self.train.__name__)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/session.py:5494, in Session._intercept_create_request(self, request, create, func_name)
5477 def _intercept_create_request(
5478 self,
5479 request: typing.Dict,
(...)
5482 # pylint: disable=unused-argument
5483 ):
5484 """This function intercepts the create job request.
5485
5486 PipelineSession inherits this Session class and will override
(...)
5492 func_name (str): the name of the function needed intercepting
5493 """
-> 5494 return create(request)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/session.py:890, in Session.train.<locals>.submit(request)
888 LOGGER.info("Creating training-job with name: %s", job_name)
889 LOGGER.debug("train request: %s", json.dumps(request, indent=4))
--> 890 self.sagemaker_client.create_training_job(**request)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/local/local_session.py:200, in LocalSagemakerClient.create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, Environment, **kwargs)
198 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
199 logger.info("Starting training job")
--> 200 training_job.start(
201 InputDataConfig, OutputDataConfig, hyperparameters, Environment, TrainingJobName
202 )
204 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/local/entities.py:243, in _LocalTrainingJob.start(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
240 self.state = self._TRAINING
241 self.environment = environment
--> 243 self.model_artifacts = self.container.train(
244 input_data_config, output_data_config, hyperparameters, environment, job_name
245 )
246 self.end_time = datetime.datetime.now()
247 self.state = self._COMPLETED
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/local/image.py:260, in _SageMakerContainer.train(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
256 except RuntimeError as e:
257 # _stream_output() doesn't have the command line. We will handle the exception
258 # which contains the exit code and append the command line to it.
259 msg = "Failed to run: %s, %s" % (compose_command, str(e))
--> 260 raise RuntimeError(msg)
261 finally:
262 artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)
RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmpqdq3z5i7/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1
Link to the notebook https://github.com/aws/amazon-sagemaker-examples/blob/main/advanced_functionality/pytorch_extend_container_train_deploy_bertopic/BERTtopic_extending_container.ipynb
Describe the bug
When I run estimator.fit() in the "Local training: Fit" section, I get this error:
time="2023-10-11T09:42:45Z" level=warning msg="a network with name sagemaker-local exists but was not created by compose.\nSet `external: true` to use an existing network"
network sagemaker-local was found but has incorrect label com.docker.compose.network set to ""
To reproduce
Create a SageMaker notebook instance and run the cells in the example notebook up to the "Local training: Fit" section. FYI, I'm using the London region.
Logs