aws / amazon-sagemaker-examples

Example 📓 Jupyter notebooks that demonstrate how to build, train, and deploy machine learning models using 🧠 Amazon SageMaker.
https://sagemaker-examples.readthedocs.io
Apache License 2.0
10.08k stars 6.76k forks source link

[Bug Report]: Bertopic example notebook unable to fit estimator #4453

Open noahberhe opened 1 year ago

noahberhe commented 1 year ago

Link to the notebook https://github.com/aws/amazon-sagemaker-examples/blob/main/advanced_functionality/pytorch_extend_container_train_deploy_bertopic/BERTtopic_extending_container.ipynb

Describe the bug: When I run estimator.fit() in the "Local training: Fit" section, I get this error: time="2023-10-11T09:42:45Z" level=warning msg="a network with name sagemaker-local exists but was not created by compose.\nSet `external: true` to use an existing network" network sagemaker-local was found but has incorrect label com.docker.compose.network set to ""

To reproduce: Create a SageMaker notebook instance and run the cells in the example up to this section. FYI, I'm using the London region.

Logs

time="2023-10-11T09:42:45Z" level=warning msg="a network with name sagemaker-local exists but was not created by compose.\nSet `external: true` to use an existing network"
network sagemaker-local was found but has incorrect label com.docker.compose.network set to ""
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/local/image.py:255, in _SageMakerContainer.train(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
    254 try:
--> 255     _stream_output(process)
    256 except RuntimeError as e:
    257     # _stream_output() doesn't have the command line. We will handle the exception
    258     # which contains the exit code and append the command line to it.

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/local/image.py:928, in _stream_output(process)
    927 if exit_code != 0:
--> 928     raise RuntimeError("Process exited with code: %s" % exit_code)
    930 return exit_code

RuntimeError: Process exited with code: 1

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
Cell In[15], line 17
     15 # launch training job
     16 print(f"file://{training_file_path}")
---> 17 estimator.fit(f"file://{training_file_path}")
     18 # estimator

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/workflow/pipeline_context.py:311, in runnable_by_pipeline.<locals>.wrapper(*args, **kwargs)
    307         return context
    309     return _StepArguments(retrieve_caller_name(self_instance), run_func, *args, **kwargs)
--> 311 return run_func(*args, **kwargs)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/estimator.py:1307, in EstimatorBase.fit(self, inputs, wait, logs, job_name, experiment_config)
   1304 self._prepare_for_training(job_name=job_name)
   1306 experiment_config = check_and_get_run_experiment_config(experiment_config)
-> 1307 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
   1308 self.jobs.append(self.latest_training_job)
   1309 if wait:

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/estimator.py:2361, in _TrainingJob.start_new(cls, estimator, inputs, experiment_config)
   2336 """Create a new Amazon SageMaker training job from the estimator.
   2337 
   2338 Args:
   (...)
   2357     all information about the started training job.
   2358 """
   2359 train_args = cls._get_train_args(estimator, inputs, experiment_config)
-> 2361 estimator.sagemaker_session.train(**train_args)
   2363 return cls(estimator.sagemaker_session, estimator._current_job_name)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/session.py:892, in Session.train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image_uri, training_image_config, container_entry_point, container_arguments, algorithm_arn, encrypt_inter_container_traffic, use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics, profiler_rule_configs, profiler_config, environment, retry_strategy)
    889     LOGGER.debug("train request: %s", json.dumps(request, indent=4))
    890     self.sagemaker_client.create_training_job(**request)
--> 892 self._intercept_create_request(train_request, submit, self.train.__name__)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/session.py:5494, in Session._intercept_create_request(self, request, create, func_name)
   5477 def _intercept_create_request(
   5478     self,
   5479     request: typing.Dict,
   (...)
   5482     # pylint: disable=unused-argument
   5483 ):
   5484     """This function intercepts the create job request.
   5485 
   5486     PipelineSession inherits this Session class and will override
   (...)
   5492         func_name (str): the name of the function needed intercepting
   5493     """
-> 5494     return create(request)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/session.py:890, in Session.train.<locals>.submit(request)
    888 LOGGER.info("Creating training-job with name: %s", job_name)
    889 LOGGER.debug("train request: %s", json.dumps(request, indent=4))
--> 890 self.sagemaker_client.create_training_job(**request)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/local/local_session.py:200, in LocalSagemakerClient.create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, Environment, **kwargs)
    198 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
    199 logger.info("Starting training job")
--> 200 training_job.start(
    201     InputDataConfig, OutputDataConfig, hyperparameters, Environment, TrainingJobName
    202 )
    204 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/local/entities.py:243, in _LocalTrainingJob.start(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
    240 self.state = self._TRAINING
    241 self.environment = environment
--> 243 self.model_artifacts = self.container.train(
    244     input_data_config, output_data_config, hyperparameters, environment, job_name
    245 )
    246 self.end_time = datetime.datetime.now()
    247 self.state = self._COMPLETED

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/local/image.py:260, in _SageMakerContainer.train(self, input_data_config, output_data_config, hyperparameters, environment, job_name)
    256 except RuntimeError as e:
    257     # _stream_output() doesn't have the command line. We will handle the exception
    258     # which contains the exit code and append the command line to it.
    259     msg = "Failed to run: %s, %s" % (compose_command, str(e))
--> 260     raise RuntimeError(msg)
    261 finally:
    262     artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name)

RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmpqdq3z5i7/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1
aaglietti-itsrizzoli commented 11 months ago

docker network prune in my case solved the issue!

ccaam1 commented 11 months ago

> docker network prune in my case solved the issue!

This worked for me too. Thanks!