anadeem445 opened this issue 1 month ago
@anadeem445 This issue seems to be due to an incompatibility or a change in the SageMaker SDK. You might follow these steps and check whether the issue persists:

1. Remove predictor_cls from the Model initialization: if the predictor_cls argument is not required during model creation, remove it from the Model class initialization and use it only during deployment.
2. predictor_cls usage: modify the code to ensure predictor_cls is only passed to the deploy method, if needed.

Sample snippet:
model = Model(
    image_uri=deploy_image_uri,
    source_dir=deploy_source_uri,
    model_data=base_model_uri,
    entry_point="inference.py",
    role=aws_role,
    name=endpoint_name,
)
base_model_predictor = model.deploy(
    initial_instance_count=1,
    instance_type=inference_instance_type,
    predictor_cls=Predictor,
    endpoint_name=endpoint_name,
)
Please let me know if the above helps.
Thanks
Hello 👋. I am sorry for hijacking the thread, but I found the same error as above. I've removed the predictor_cls from the model initialization and kept it in the deploy call (as in the snippet above).
The error remains the same.
@catica The error you're encountering, TypeError: Model._create_sagemaker_model() got an unexpected keyword argument 'predictor_cls', indicates that Model.deploy forwards unrecognized keyword arguments to Model._create_sagemaker_model, which does not accept predictor_cls.
Modify your code to exclude the predictor_cls argument from the model.deploy call. Instead, after deploying the model, you can instantiate the Predictor class separately.
Sample snippet:
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

# model_version="*" fetches the latest version of the model
infer_model_id, infer_model_version = infer_model_dropdown.value, "*"
endpoint_name = name_from_base(f"jumpstart-example-infer-{infer_model_id}")
inference_instance_type = "ml.p2.xlarge"

# Retrieve the inference docker container uri
deploy_image_uri = image_uris.retrieve(
    region=None,
    framework=None,  # automatically inferred from model_id
    image_scope="inference",
    model_id=infer_model_id,
    model_version=infer_model_version,
    instance_type=inference_instance_type,
)

# Retrieve the inference script uri. This includes scripts for model loading, inference handling, etc.
deploy_source_uri = script_uris.retrieve(
    model_id=infer_model_id, model_version=infer_model_version, script_scope="inference"
)

# Retrieve the base model uri
base_model_uri = model_uris.retrieve(
    model_id=infer_model_id, model_version=infer_model_version, model_scope="inference"
)

# Create the SageMaker model instance
model = Model(
    image_uri=deploy_image_uri,
    source_dir=deploy_source_uri,
    model_data=base_model_uri,
    entry_point="inference.py",  # entry point file in source_dir and present in deploy_source_uri
    role=aws_role,
    name=endpoint_name,
)

# Deploy the Model without predictor_cls argument
base_model_predictor = model.deploy(
    initial_instance_count=1,
    instance_type=inference_instance_type,
    endpoint_name=endpoint_name,
)

# Instantiate the Predictor class separately
predictor = Predictor(endpoint_name=endpoint_name)
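If your inference handler consumes and produces JSON, you may also want to attach a serializer and deserializer when instantiating the Predictor. A minimal sketch (whether your inference.py actually uses application/json is an assumption here):

from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Assumption: the endpoint's handler accepts and returns application/json.
predictor = Predictor(
    endpoint_name=endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)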
Let me know if it works. Thanks
Thanks for your quick reply, @Siddharth-Latthe-07.
I've got a different error:
---------------------------------------------------------------------------
ClientError Traceback (most recent call last)
Cell In[13], line 2
1 # Deploy the Model without predictor_cls argument
----> 2 base_model_predictor = model.deploy(
3 initial_instance_count=1,
4 instance_type=inference_instance_type,
5 endpoint_name=endpoint_name,
6 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/model.py:1692, in Model.deploy(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, tags, kms_key, wait, data_capture_config, async_inference_config, serverless_inference_config, volume_size, model_data_download_timeout, container_startup_health_check_timeout, inference_recommendation_id, explainer_config, accept_eula, endpoint_logging, resources, endpoint_type, managed_instance_scaling, inference_component_name, routing_config, **kwargs)
1689 return None
1691 else: # existing single model endpoint path
-> 1692 self._create_sagemaker_model(
1693 instance_type=instance_type,
1694 accelerator_type=accelerator_type,
1695 tags=tags,
1696 serverless_inference_config=serverless_inference_config,
1697 **kwargs,
1698 )
1699 serverless_inference_config_dict = (
1700 serverless_inference_config._to_request_dict() if is_serverless else None
1701 )
1702 production_variant = sagemaker.production_variant(
1703 self.name,
1704 instance_type,
(...)
1711 routing_config=routing_config,
1712 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/model.py:927, in Model._create_sagemaker_model(self, instance_type, accelerator_type, tags, serverless_inference_config, accept_eula, model_reference_arn)
925 self.name = model_package.name
926 else:
--> 927 container_def = self.prepare_container_def(
928 instance_type,
929 accelerator_type=accelerator_type,
930 serverless_inference_config=serverless_inference_config,
931 accept_eula=accept_eula,
932 model_reference_arn=model_reference_arn,
933 )
935 if not isinstance(self.sagemaker_session, PipelineSession):
936 # _base_name, model_name are not needed under PipelineSession.
937 # the model_data may be Pipeline variable
938 # which may break the _base_name generation
939 self._ensure_base_name_if_needed(
940 image_uri=container_def["Image"],
941 script_uri=self.source_dir,
942 model_uri=self._get_model_uri(),
943 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/model.py:715, in Model.prepare_container_def(self, instance_type, accelerator_type, serverless_inference_config, accept_eula, model_reference_arn)
713 deploy_env = copy.deepcopy(self.env)
714 if self.source_dir or self.dependencies or self.entry_point or self.git_config:
--> 715 self._upload_code(deploy_key_prefix, repack=self.is_repack())
716 deploy_env.update(self._script_mode_env_vars())
718 return sagemaker.container_def(
719 self.image_uri,
720 self.repacked_model_data or self.model_data,
(...)
731 ),
732 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/model.py:822, in Model._upload_code(self, key_prefix, repack)
806 self.uploaded_code = fw_utils.UploadedCode(
807 s3_prefix=repacked_model_data,
808 script_name=os.path.basename(self.entry_point),
809 )
811 logger.info(
812 "Repacking model artifact (%s), script artifact "
813 "(%s), and dependencies (%s) "
(...)
819 repacked_model_data,
820 )
--> 822 utils.repack_model(
823 inference_script=self.entry_point,
824 source_directory=self.source_dir,
825 dependencies=self.dependencies,
826 model_uri=self.model_data,
827 repacked_model_uri=repacked_model_data,
828 sagemaker_session=self.sagemaker_session,
829 kms_key=self.model_kms_key,
830 )
832 self.repacked_model_data = repacked_model_data
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/utils.py:551, in repack_model(inference_script, source_directory, dependencies, model_uri, repacked_model_uri, sagemaker_session, kms_key)
544 local_download_dir = (
545 None
546 if sagemaker_session.settings is None
547 or sagemaker_session.settings.local_download_dir is None
548 else sagemaker_session.settings.local_download_dir
549 )
550 with _tmpdir(directory=local_download_dir) as tmp:
--> 551 model_dir = _extract_model(model_uri, sagemaker_session, tmp)
553 _create_or_update_code_dir(
554 model_dir,
555 inference_script,
(...)
559 tmp,
560 )
562 tmp_model_path = os.path.join(tmp, "temp-model.tar.gz")
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/utils.py:637, in _extract_model(model_uri, sagemaker_session, tmp)
635 if model_uri.lower().startswith("s3://"):
636 local_model_path = os.path.join(tmp, "tar_file")
--> 637 download_file_from_url(model_uri, local_model_path, sagemaker_session)
638 else:
639 local_model_path = model_uri.replace("file://", "")
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/utils.py:650, in download_file_from_url(url, dst, sagemaker_session)
647 url = parse.urlparse(url)
648 bucket, key = url.netloc, url.path.lstrip("/")
--> 650 download_file(bucket, key, dst, sagemaker_session)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/utils.py:668, in download_file(bucket_name, path, target, sagemaker_session)
666 s3 = boto_session.resource("s3", region_name=sagemaker_session.boto_region_name)
667 bucket = s3.Bucket(bucket_name)
--> 668 bucket.download_file(path, target)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/boto3/s3/inject.py:279, in bucket_download_file(self, Key, Filename, ExtraArgs, Callback, Config)
245 def bucket_download_file(
246 self, Key, Filename, ExtraArgs=None, Callback=None, Config=None
247 ):
248 """Download an S3 object to a file.
249
250 Usage::
(...)
277 transfer.
278 """
--> 279 return self.meta.client.download_file(
280 Bucket=self.name,
281 Key=Key,
282 Filename=Filename,
283 ExtraArgs=ExtraArgs,
284 Callback=Callback,
285 Config=Config,
286 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/boto3/s3/inject.py:192, in download_file(self, Bucket, Key, Filename, ExtraArgs, Callback, Config)
157 """Download an S3 object to a file.
158
159 Usage::
(...)
189 transfer.
190 """
191 with S3Transfer(self, Config) as transfer:
--> 192 return transfer.download_file(
193 bucket=Bucket,
194 key=Key,
195 filename=Filename,
196 extra_args=ExtraArgs,
197 callback=Callback,
198 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/boto3/s3/transfer.py:406, in S3Transfer.download_file(self, bucket, key, filename, extra_args, callback)
402 future = self._manager.download(
403 bucket, key, filename, extra_args, subscribers
404 )
405 try:
--> 406 future.result()
407 # This is for backwards compatibility where when retries are
408 # exceeded we need to throw the same error from boto3 instead of
409 # s3transfer's built in RetriesExceededError as current users are
410 # catching the boto3 one instead of the s3transfer exception to do
411 # their own retries.
412 except S3TransferRetriesExceededError as e:
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/s3transfer/futures.py:103, in TransferFuture.result(self)
98 def result(self):
99 try:
100 # Usually the result() method blocks until the transfer is done,
101 # however if a KeyboardInterrupt is raised we want want to exit
102 # out of this and propagate the exception.
--> 103 return self._coordinator.result()
104 except KeyboardInterrupt as e:
105 self.cancel()
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/s3transfer/futures.py:266, in TransferCoordinator.result(self)
263 # Once done waiting, raise an exception if present or return the
264 # final result.
265 if self._exception:
--> 266 raise self._exception
267 return self._result
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/s3transfer/tasks.py:269, in SubmissionTask._main(self, transfer_future, **kwargs)
265 self._transfer_coordinator.set_status_to_running()
267 # Call the submit method to start submitting tasks to execute the
268 # transfer.
--> 269 self._submit(transfer_future=transfer_future, **kwargs)
270 except BaseException as e:
271 # If there was an exception raised during the submission of task
272 # there is a chance that the final task that signals if a transfer
(...)
281
282 # Set the exception, that caused the process to fail.
283 self._log_and_set_exception(e)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/s3transfer/download.py:354, in DownloadSubmissionTask._submit(self, client, config, osutil, request_executor, io_executor, transfer_future, bandwidth_limiter)
325 """
326 :param client: The client associated with the transfer manager
327
(...)
349 downloading streams
350 """
351 if transfer_future.meta.size is None:
352 # If a size was not provided figure out the size for the
353 # user.
--> 354 response = client.head_object(
355 Bucket=transfer_future.meta.call_args.bucket,
356 Key=transfer_future.meta.call_args.key,
357 **transfer_future.meta.call_args.extra_args,
358 )
359 transfer_future.meta.provide_transfer_size(
360 response['ContentLength']
361 )
363 download_output_manager = self._get_download_output_manager_cls(
364 transfer_future, osutil
365 )(osutil, self._transfer_coordinator, io_executor)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/botocore/client.py:565, in ClientCreator._create_api_method.<locals>._api_call(self, *args, **kwargs)
561 raise TypeError(
562 f"{py_operation_name}() only accepts keyword arguments."
563 )
564 # The "self" in this scope is referring to the BaseClient.
--> 565 return self._make_api_call(operation_name, kwargs)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/botocore/client.py:1021, in BaseClient._make_api_call(self, operation_name, api_params)
1017 error_code = error_info.get("QueryErrorCode") or error_info.get(
1018 "Code"
1019 )
1020 error_class = self.exceptions.from_code(error_code)
-> 1021 raise error_class(parsed_response, operation_name)
1022 else:
1023 return parsed_response
ClientError: An error occurred (404) when calling the HeadObject operation: Not Found
@catica The new error, ClientError: An error occurred (404) when calling the HeadObject operation: Not Found, indicates that SageMaker is unable to locate the specified model data on S3. This could be due to an incorrect S3 URI for the model data, or the model data may not exist at the specified location.
Possible solutions:
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base
infer_model_id, infer_model_version = infer_model_dropdown.value, "*"
endpoint_name = name_from_base(f"jumpstart-example-infer-{infer_model_id}")
inference_instance_type = "ml.p2.xlarge"
deploy_image_uri = image_uris.retrieve(
    region=None,
    framework=None,  # automatically inferred from model_id
    image_scope="inference",
    model_id=infer_model_id,
    model_version=infer_model_version,
    instance_type=inference_instance_type,
)
deploy_source_uri = script_uris.retrieve(
    model_id=infer_model_id, model_version=infer_model_version, script_scope="inference"
)
base_model_uri = model_uris.retrieve(
    model_id=infer_model_id, model_version=infer_model_version, model_scope="inference"
)
print(f"Deploy Image URI: {deploy_image_uri}")
print(f"Deploy Source URI: {deploy_source_uri}")
print(f"Base Model URI: {base_model_uri}")
model = Model(
    image_uri=deploy_image_uri,
    source_dir=deploy_source_uri,
    model_data=base_model_uri,
    entry_point="inference.py",  # entry point file in source_dir and present in deploy_source_uri
    role=aws_role,
    name=endpoint_name,
)
base_model_predictor = model.deploy(
    initial_instance_count=1,
    instance_type=inference_instance_type,
    endpoint_name=endpoint_name,
)
predictor = Predictor(endpoint_name=endpoint_name)
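Before re-running the deployment, it may also help to confirm that the resolved model artifact actually exists at that URI. A minimal sketch using boto3 (assumes base_model_uri from the snippet above; note that HeadObject only succeeds on a full object key, not on a folder-style prefix):

from urllib.parse import urlparse

import boto3
from botocore.exceptions import ClientError

# Split the s3:// URI into bucket and key.
parsed = urlparse(base_model_uri)
bucket, key = parsed.netloc, parsed.path.lstrip("/")

s3 = boto3.client("s3")
try:
    s3.head_object(Bucket=bucket, Key=key)
    print(f"Found object: s3://{bucket}/{key}")
except ClientError:
    # A 404 here reproduces the deploy-time error; list the prefix to see
    # what actually exists under it.
    print(f"HeadObject failed for s3://{bucket}/{key}; listing the prefix instead:")
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=key, MaxKeys=10)
    for obj in resp.get("Contents", []):
        print(obj["Key"])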
Please let me know if you find any error logs.
Thanks
I am not sure I understand these problems with S3 access, as I am trying to use the provided pytorch-SSD model (not my own model), and I don't believe I am trying to access any of my own S3 buckets.
I've followed your debugging suggestion and changed the region to us-west-2,
as described at the beginning of the notebook (https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart_object_detection/Amazon_JumpStart_Object_Detection.ipynb), but I still get the same error.
P.S. The CI/CD checks are failing in the notebook above.
Using model 'pytorch-od-nvidia-ssd' with wildcard version identifier '*'. You can pin to version '2.0.0' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
Deploy Image URI: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:1.10.0-gpu-py38
Deploy Source URI: s3://jumpstart-cache-prod-us-west-2/source-directory-tarballs/pytorch/inference/od/v1.2.3/sourcedir.tar.gz
Base Model URI: s3://jumpstart-cache-prod-us-west-2/pytorch-od/pytorch-od-nvidia-ssd/artifacts/inference-prepack/v1.0.0/
---------------------------------------------------------------------------
ClientError Traceback (most recent call last)
Cell In[44], line 39
30 model = Model(
31 image_uri=deploy_image_uri,
32 source_dir=deploy_source_uri,
(...)
35 role=aws_role,
36 name=endpoint_name)
38 # Deploy the Model without predictor_cls argument
---> 39 base_model_predictor = model.deploy(
40 initial_instance_count=1,
41 instance_type=inference_instance_type,
42 endpoint_name=endpoint_name
43 )
45 # Instantiate the Predictor class separately
46 predictor = Predictor(endpoint_name=endpoint_name)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/model.py:1692, in Model.deploy(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, tags, kms_key, wait, data_capture_config, async_inference_config, serverless_inference_config, volume_size, model_data_download_timeout, container_startup_health_check_timeout, inference_recommendation_id, explainer_config, accept_eula, endpoint_logging, resources, endpoint_type, managed_instance_scaling, inference_component_name, routing_config, **kwargs)
1689 return None
1691 else: # existing single model endpoint path
-> 1692 self._create_sagemaker_model(
1693 instance_type=instance_type,
1694 accelerator_type=accelerator_type,
1695 tags=tags,
1696 serverless_inference_config=serverless_inference_config,
1697 **kwargs,
1698 )
1699 serverless_inference_config_dict = (
1700 serverless_inference_config._to_request_dict() if is_serverless else None
1701 )
1702 production_variant = sagemaker.production_variant(
1703 self.name,
1704 instance_type,
(...)
1711 routing_config=routing_config,
1712 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/model.py:927, in Model._create_sagemaker_model(self, instance_type, accelerator_type, tags, serverless_inference_config, accept_eula, model_reference_arn)
925 self.name = model_package.name
926 else:
--> 927 container_def = self.prepare_container_def(
928 instance_type,
929 accelerator_type=accelerator_type,
930 serverless_inference_config=serverless_inference_config,
931 accept_eula=accept_eula,
932 model_reference_arn=model_reference_arn,
933 )
935 if not isinstance(self.sagemaker_session, PipelineSession):
936 # _base_name, model_name are not needed under PipelineSession.
937 # the model_data may be Pipeline variable
938 # which may break the _base_name generation
939 self._ensure_base_name_if_needed(
940 image_uri=container_def["Image"],
941 script_uri=self.source_dir,
942 model_uri=self._get_model_uri(),
943 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/model.py:715, in Model.prepare_container_def(self, instance_type, accelerator_type, serverless_inference_config, accept_eula, model_reference_arn)
713 deploy_env = copy.deepcopy(self.env)
714 if self.source_dir or self.dependencies or self.entry_point or self.git_config:
--> 715 self._upload_code(deploy_key_prefix, repack=self.is_repack())
716 deploy_env.update(self._script_mode_env_vars())
718 return sagemaker.container_def(
719 self.image_uri,
720 self.repacked_model_data or self.model_data,
(...)
731 ),
732 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/model.py:822, in Model._upload_code(self, key_prefix, repack)
806 self.uploaded_code = fw_utils.UploadedCode(
807 s3_prefix=repacked_model_data,
808 script_name=os.path.basename(self.entry_point),
809 )
811 logger.info(
812 "Repacking model artifact (%s), script artifact "
813 "(%s), and dependencies (%s) "
(...)
819 repacked_model_data,
820 )
--> 822 utils.repack_model(
823 inference_script=self.entry_point,
824 source_directory=self.source_dir,
825 dependencies=self.dependencies,
826 model_uri=self.model_data,
827 repacked_model_uri=repacked_model_data,
828 sagemaker_session=self.sagemaker_session,
829 kms_key=self.model_kms_key,
830 )
832 self.repacked_model_data = repacked_model_data
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/utils.py:551, in repack_model(inference_script, source_directory, dependencies, model_uri, repacked_model_uri, sagemaker_session, kms_key)
544 local_download_dir = (
545 None
546 if sagemaker_session.settings is None
547 or sagemaker_session.settings.local_download_dir is None
548 else sagemaker_session.settings.local_download_dir
549 )
550 with _tmpdir(directory=local_download_dir) as tmp:
--> 551 model_dir = _extract_model(model_uri, sagemaker_session, tmp)
553 _create_or_update_code_dir(
554 model_dir,
555 inference_script,
(...)
559 tmp,
560 )
562 tmp_model_path = os.path.join(tmp, "temp-model.tar.gz")
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/utils.py:637, in _extract_model(model_uri, sagemaker_session, tmp)
635 if model_uri.lower().startswith("s3://"):
636 local_model_path = os.path.join(tmp, "tar_file")
--> 637 download_file_from_url(model_uri, local_model_path, sagemaker_session)
638 else:
639 local_model_path = model_uri.replace("file://", "")
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/utils.py:650, in download_file_from_url(url, dst, sagemaker_session)
647 url = parse.urlparse(url)
648 bucket, key = url.netloc, url.path.lstrip("/")
--> 650 download_file(bucket, key, dst, sagemaker_session)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/sagemaker/utils.py:668, in download_file(bucket_name, path, target, sagemaker_session)
666 s3 = boto_session.resource("s3", region_name=sagemaker_session.boto_region_name)
667 bucket = s3.Bucket(bucket_name)
--> 668 bucket.download_file(path, target)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/boto3/s3/inject.py:279, in bucket_download_file(self, Key, Filename, ExtraArgs, Callback, Config)
245 def bucket_download_file(
246 self, Key, Filename, ExtraArgs=None, Callback=None, Config=None
247 ):
248 """Download an S3 object to a file.
249
250 Usage::
(...)
277 transfer.
278 """
--> 279 return self.meta.client.download_file(
280 Bucket=self.name,
281 Key=Key,
282 Filename=Filename,
283 ExtraArgs=ExtraArgs,
284 Callback=Callback,
285 Config=Config,
286 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/boto3/s3/inject.py:192, in download_file(self, Bucket, Key, Filename, ExtraArgs, Callback, Config)
157 """Download an S3 object to a file.
158
159 Usage::
(...)
189 transfer.
190 """
191 with S3Transfer(self, Config) as transfer:
--> 192 return transfer.download_file(
193 bucket=Bucket,
194 key=Key,
195 filename=Filename,
196 extra_args=ExtraArgs,
197 callback=Callback,
198 )
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/boto3/s3/transfer.py:406, in S3Transfer.download_file(self, bucket, key, filename, extra_args, callback)
402 future = self._manager.download(
403 bucket, key, filename, extra_args, subscribers
404 )
405 try:
--> 406 future.result()
407 # This is for backwards compatibility where when retries are
408 # exceeded we need to throw the same error from boto3 instead of
409 # s3transfer's built in RetriesExceededError as current users are
410 # catching the boto3 one instead of the s3transfer exception to do
411 # their own retries.
412 except S3TransferRetriesExceededError as e:
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/s3transfer/futures.py:103, in TransferFuture.result(self)
98 def result(self):
99 try:
100 # Usually the result() method blocks until the transfer is done,
101 # however if a KeyboardInterrupt is raised we want want to exit
102 # out of this and propagate the exception.
--> 103 return self._coordinator.result()
104 except KeyboardInterrupt as e:
105 self.cancel()
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/s3transfer/futures.py:266, in TransferCoordinator.result(self)
263 # Once done waiting, raise an exception if present or return the
264 # final result.
265 if self._exception:
--> 266 raise self._exception
267 return self._result
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/s3transfer/tasks.py:269, in SubmissionTask._main(self, transfer_future, **kwargs)
265 self._transfer_coordinator.set_status_to_running()
267 # Call the submit method to start submitting tasks to execute the
268 # transfer.
--> 269 self._submit(transfer_future=transfer_future, **kwargs)
270 except BaseException as e:
271 # If there was an exception raised during the submission of task
272 # there is a chance that the final task that signals if a transfer
(...)
281
282 # Set the exception, that caused the process to fail.
283 self._log_and_set_exception(e)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/s3transfer/download.py:354, in DownloadSubmissionTask._submit(self, client, config, osutil, request_executor, io_executor, transfer_future, bandwidth_limiter)
325 """
326 :param client: The client associated with the transfer manager
327
(...)
349 downloading streams
350 """
351 if transfer_future.meta.size is None:
352 # If a size was not provided figure out the size for the
353 # user.
--> 354 response = client.head_object(
355 Bucket=transfer_future.meta.call_args.bucket,
356 Key=transfer_future.meta.call_args.key,
357 **transfer_future.meta.call_args.extra_args,
358 )
359 transfer_future.meta.provide_transfer_size(
360 response['ContentLength']
361 )
363 download_output_manager = self._get_download_output_manager_cls(
364 transfer_future, osutil
365 )(osutil, self._transfer_coordinator, io_executor)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/botocore/client.py:565, in ClientCreator._create_api_method.<locals>._api_call(self, *args, **kwargs)
561 raise TypeError(
562 f"{py_operation_name}() only accepts keyword arguments."
563 )
564 # The "self" in this scope is referring to the BaseClient.
--> 565 return self._make_api_call(operation_name, kwargs)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/botocore/client.py:1021, in BaseClient._make_api_call(self, operation_name, api_params)
1017 error_code = error_info.get("QueryErrorCode") or error_info.get(
1018 "Code"
1019 )
1020 error_class = self.exceptions.from_code(error_code)
-> 1021 raise error_class(parsed_response, operation_name)
1022 else:
1023 return parsed_response
ClientError: An error occurred (404) when calling the HeadObject operation: Not Found
I have the same problem as @catica. I can download the tar file from the S3 bucket and can read the files in the base_model_uri folder (in the terminal, as user ec2-user), so the permissions seem to be set correctly. @Siddharth-Latthe-07, do you have more tips? Thanks!
@pietervosnl Yeah, sure. Here are some additional steps and suggestions to help you troubleshoot and resolve this issue:
Permissions and IAM Roles: Ensure that the IAM role used by SageMaker (aws_role) has the necessary permissions to access the S3 buckets and objects. This typically means the s3:GetObject and s3:ListBucket actions (note that S3 HeadObject calls are authorized through s3:GetObject). Verify the policy attached to the IAM role and make sure it grants access to the correct S3 resources.
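One way to check this from the notebook, as a sketch, is IAM's policy simulator (assumes aws_role holds the role ARN used for deployment and that your credentials may call iam:SimulatePrincipalPolicy; the bucket ARNs below are the us-west-2 JumpStart cache seen in the logs above):

import boto3

iam = boto3.client("iam")

# Assumption: aws_role is the ARN of the deployment role.
resp = iam.simulate_principal_policy(
    PolicySourceArn=aws_role,
    ActionNames=["s3:GetObject", "s3:ListBucket"],
    ResourceArns=[
        "arn:aws:s3:::jumpstart-cache-prod-us-west-2",
        "arn:aws:s3:::jumpstart-cache-prod-us-west-2/*",
    ],
)
for result in resp["EvaluationResults"]:
    print(result["EvalActionName"], "->", result["EvalDecision"])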
Network Configuration: If your SageMaker notebook or training job is running in a VPC, ensure that the VPC has the necessary internet access or a VPC endpoint configured to access S3.
Model and Script Versions: When using wildcard versions for models (infer_model_version = "*") and scripts, ensure that the correct and compatible versions are being used. Try specifying an exact version to see if it resolves the issue.
Temporary S3 Bucket: As a workaround, try copying the model data to a new S3 bucket in the same region and updating the URI accordingly (see the sketch below). This can help identify whether the issue is specific to the current bucket or URI.
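A minimal sketch of that copy workaround (the destination bucket name is a hypothetical placeholder; replace it with a bucket you own in the same region):

from urllib.parse import urlparse

import boto3

# Hypothetical destination bucket; replace with your own bucket in the same region.
dest_bucket = "my-sagemaker-debug-bucket"

parsed = urlparse(base_model_uri)
src_bucket, src_key = parsed.netloc, parsed.path.lstrip("/")

s3 = boto3.client("s3")
s3.copy({"Bucket": src_bucket, "Key": src_key}, dest_bucket, src_key)

# Point the deployment at the copied artifact.
base_model_uri = f"s3://{dest_bucket}/{src_key}"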
Sample snippet for checking the URIs and region consistency:
from sagemaker import image_uris, model_uris, script_uris
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

# Set model ID and version
infer_model_id, infer_model_version = "pytorch-od-nvidia-ssd", "2.0.0"

# Generate endpoint name
endpoint_name = name_from_base(f"jumpstart-example-infer-{infer_model_id}")

# Define instance type
inference_instance_type = "ml.p2.xlarge"

# Retrieve URIs
deploy_image_uri = image_uris.retrieve(
    region="us-west-2",
    framework=None,
    image_scope="inference",
    model_id=infer_model_id,
    model_version=infer_model_version,
    instance_type=inference_instance_type,
)
deploy_source_uri = script_uris.retrieve(
    model_id=infer_model_id, model_version=infer_model_version, script_scope="inference"
)
base_model_uri = model_uris.retrieve(
    model_id=infer_model_id, model_version=infer_model_version, model_scope="inference"
)

# Print URIs for verification
print(f"Deploy Image URI: {deploy_image_uri}")
print(f"Deploy Source URI: {deploy_source_uri}")
print(f"Base Model URI: {base_model_uri}")

# Create the SageMaker model instance
model = Model(
    image_uri=deploy_image_uri,
    source_dir=deploy_source_uri,
    model_data=base_model_uri,
    entry_point="inference.py",
    role=aws_role,
    name=endpoint_name,
)

# Deploy the model
base_model_predictor = model.deploy(
    initial_instance_count=1,
    instance_type=inference_instance_type,
    endpoint_name=endpoint_name,
)

# Instantiate the Predictor class
predictor = Predictor(endpoint_name=endpoint_name)
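To go with the printed URIs, a quick region-consistency check, as a sketch (it relies on the heuristic that JumpStart cache buckets embed their region in the bucket name, e.g. jumpstart-cache-prod-us-west-2):

from urllib.parse import urlparse

import sagemaker

session_region = sagemaker.Session().boto_region_name
bucket = urlparse(base_model_uri).netloc

# Heuristic: JumpStart cache buckets are named per region.
if session_region not in bucket:
    print(f"Possible region mismatch: session is in {session_region}, "
          f"but the model artifact lives in bucket {bucket}.")
else:
    print(f"Region looks consistent: {session_region}")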
Hope this helps. Thanks
Link to the notebook: Example Notebook
Describe the bug: We were trying to test Amazon SageMaker's semantic segmentation feature, so we used this example notebook from SageMaker JumpStart and tried to run it in an AWS SageMaker notebook instance.
But we received a weird error:
TypeError: Model._create_sagemaker_model() got an unexpected keyword argument 'predictor_cls'