GoogleCloudPlatform / vertex-ai-samples

Notebooks, code samples, sample apps, and other resources that demonstrate how to use, develop and manage machine learning and generative AI workflows using Google Cloud Vertex AI.
https://cloud.google.com/vertex-ai
Apache License 2.0

Customer states that the notebook does not work because the container never becomes ready #3485

Open. aalferez123 opened this issue 2 weeks ago.

aalferez123 commented 2 weeks ago

notebooks/community/vertex_endpoints/torchserve/dreambooth_stablediffusion.ipynb

Expected Behavior

The model deploys successfully and the endpoint running Stable Diffusion can be hit with prediction requests.
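
For context, once the deployment succeeds the endpoint would be exercised with a predict call along these lines (a minimal sketch; the endpoint resource name and the instance payload expected by the custom TorchServe handler are assumptions, not copied from the notebook):

from google.cloud import aiplatform

aiplatform.init(project="YOUR_PROJECT_ID", location="us-central1")

# Hypothetical endpoint resource name; substitute the endpoint the notebook creates.
endpoint = aiplatform.Endpoint(
    "projects/YOUR_PROJECT_ID/locations/us-central1/endpoints/ENDPOINT_ID"
)

# The {"prompt": ...} instance schema is an assumption about the custom
# TorchServe handler; the actual handler may expect a different shape.
response = endpoint.predict(instances=[{"prompt": "a photo of a corgi on a beach"}])
print(response.predictions)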

Actual Behavior

FailedPrecondition: 400 Model server never became ready. Please validate that your model file or container configuration are valid. Model server logs can be found at

Steps to Reproduce the Problem

1. Run the notebook through the model deployment cell.

2. Wait for the deployment to complete; it fails with the FailedPrecondition error above.

Specifications

notebooks/community/vertex_endpoints/torchserve/dreambooth_stablediffusion.ipynb
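
A note on the failure mode: "Model server never became ready" generally means Vertex AI's health checks against the container failed before the deployment timeout. With custom TorchServe images, a common cause is a mismatch between the health/predict routes and port registered on the Model resource and the ones TorchServe actually listens on. A minimal sketch of making those explicit at upload time (the image URI, predict route, and port values here are placeholders, not the notebook's actual configuration):

from google.cloud import aiplatform

aiplatform.init(project="YOUR_PROJECT_ID", location="us-central1")

model = aiplatform.Model.upload(
    display_name="dreambooth-stablediffusion",
    # Placeholder image URI; use the TorchServe image the notebook builds.
    serving_container_image_uri=(
        "us-central1-docker.pkg.dev/YOUR_PROJECT/YOUR_REPO/torchserve-sd:latest"
    ),
    # Vertex AI probes the health route to decide the server is ready;
    # /ping is TorchServe's standard health endpoint.
    serving_container_health_route="/ping",
    # Placeholder model name in the predict route.
    serving_container_predict_route="/predictions/stable_diffusion",
    # Must match the port in TorchServe's inference_address setting.
    serving_container_ports=[8080],
)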

aalferez123 commented 2 weeks ago

Here is a short snapshot of the error:

---------------------------------------------------------------------------
FailedPrecondition                        Traceback (most recent call last)
Cell In[19], line 1
----> 1 model.deploy(
      2     endpoint=endpoint,
      3     deployed_model_display_name=MODEL_DISPLAY_NAME,
      4     machine_type="n1-standard-8",
      5     accelerator_type="NVIDIA_TESLA_P100",
      6     accelerator_count=1,
      7     traffic_percentage=100,
      8     deploy_request_timeout=1200,
      9     sync=True,
     10 )

File /opt/conda/lib/python3.10/site-packages/google/cloud/aiplatform/models.py:4876, in Model.deploy(self, endpoint, deployed_model_display_name, traffic_percentage, traffic_split, machine_type, min_replica_count, max_replica_count, accelerator_type, accelerator_count, tpu_topology, service_account, explanation_metadata, explanation_parameters, metadata, encryption_spec_key_name, network, sync, deploy_request_timeout, autoscaling_target_cpu_utilization, autoscaling_target_accelerator_duty_cycle, enable_access_logging, disable_container_logging, private_service_connect_config, deployment_resource_pool)
   4865         raise ValueError(
   4866             "Traffic splitting is not yet supported for PSA based PrivateEndpoint. "
   4867             "Try calling deploy() without providing `traffic_split`. "
   4868             "A maximum of one model can be deployed to each private Endpoint."
   4869         )
   4871 explanation_spec = _explanation_utils.create_and_validate_explanation_spec(
   4872     explanation_metadata=explanation_metadata,
   4873     explanation_parameters=explanation_parameters,
   4874 )
-> 4876 return self._deploy(
   4877     endpoint=endpoint,
   4878     deployed_model_display_name=deployed_model_display_name,
   4879     traffic_percentage=traffic_percentage,
   4880     traffic_split=traffic_split,
   4881     machine_type=machine_type,
   4882     min_replica_count=min_replica_count,
   4883     max_replica_count=max_replica_count,
   4884     accelerator_type=accelerator_type,
   4885     accelerator_count=accelerator_count,
   4886     tpu_topology=tpu_topology,
   4887     service_account=service_account,
   4888     explanation_spec=explanation_spec,
   4889     metadata=metadata,
   4890     encryption_spec_key_name=encryption_spec_key_name
   4891     or initializer.global_config.encryption_spec_key_name,
   4892     network=network,
   4893     sync=sync,
   4894     deploy_request_timeout=deploy_request_timeout,
   4895     autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
   4896     autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
   4897     enable_access_logging=enable_access_logging,
   4898     disable_container_logging=disable_container_logging,
   4899     private_service_connect_config=private_service_connect_config,
   4900     deployment_resource_pool=deployment_resource_pool,
   4901 )

File /opt/conda/lib/python3.10/site-packages/google/cloud/aiplatform/base.py:863, in optional_sync.<locals>.optional_run_in_thread.<locals>.wrapper(*args, **kwargs)
    861     if self:
    862         VertexAiResourceNounWithFutureManager.wait(self)
--> 863     return method(*args, **kwargs)
    865 # callbacks to call within the Future (in same Thread)
    866 internal_callbacks = []

File /opt/conda/lib/python3.10/site-packages/google/cloud/aiplatform/models.py:5069, in Model._deploy(self, endpoint, deployed_model_display_name, traffic_percentage, traffic_split, machine_type, min_replica_count, max_replica_count, accelerator_type, accelerator_count, tpu_topology, service_account, explanation_spec, metadata, encryption_spec_key_name, network, sync, deploy_request_timeout, autoscaling_target_cpu_utilization, autoscaling_target_accelerator_duty_cycle, enable_access_logging, disable_container_logging, private_service_connect_config, deployment_resource_pool)
   5057         endpoint = PrivateEndpoint.create(
   5058             display_name=display_name,
   5059             network=network,
   (...)
   5064             private_service_connect_config=private_service_connect_config,
   5065         )
   5067 _LOGGER.log_action_start_against_resource("Deploying model to", "", endpoint)
-> 5069 endpoint._deploy_call(
   5070     endpoint.api_client,
   5071     endpoint.resource_name,
   5072     self,
   5073     endpoint._gca_resource.traffic_split,
   5074     network=network or endpoint.network,
   5075     deployed_model_display_name=deployed_model_display_name,
   5076     traffic_percentage=traffic_percentage,
   5077     traffic_split=traffic_split,
   5078     machine_type=machine_type,
   5079     min_replica_count=min_replica_count,
   5080     max_replica_count=max_replica_count,
   5081     accelerator_type=accelerator_type,
   5082     accelerator_count=accelerator_count,
   5083     tpu_topology=tpu_topology,
   5084     service_account=service_account,
   5085     explanation_spec=explanation_spec,
   5086     metadata=metadata,
   5087     deploy_request_timeout=deploy_request_timeout,
   5088     autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
   5089     autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
   5090     enable_access_logging=enable_access_logging,
   5091     disable_container_logging=disable_container_logging,
   5092     deployment_resource_pool=deployment_resource_pool,
   5093 )
   5095 _LOGGER.log_action_completed_against_resource("model", "deployed", endpoint)
   5097 endpoint._sync_gca_resource()

File /opt/conda/lib/python3.10/site-packages/google/cloud/aiplatform/models.py:1827, in Endpoint._deploy_call(cls, api_client, endpoint_resource_name, model, endpoint_resource_traffic_split, network, deployed_model_display_name, traffic_percentage, traffic_split, machine_type, min_replica_count, max_replica_count, accelerator_type, accelerator_count, tpu_topology, service_account, explanation_spec, metadata, deploy_request_timeout, autoscaling_target_cpu_utilization, autoscaling_target_accelerator_duty_cycle, enable_access_logging, disable_container_logging, deployment_resource_pool)
   1815 operation_future = api_client.deploy_model(
   1816     endpoint=endpoint_resource_name,
   1817     deployed_model=deployed_model,
   (...)
   1820     timeout=deploy_request_timeout,
   1821 )
   1823 _LOGGER.log_action_started_against_resource_with_lro(
   1824     "Deploy", "model", cls, operation_future
   1825 )
-> 1827 operation_future.result(timeout=None)

File /opt/conda/lib/python3.10/site-packages/google/api_core/future/polling.py:261, in PollingFuture.result(self, timeout, retry, polling)
    256 self._blocking_poll(timeout=timeout, retry=retry, polling=polling)
    258 if self._exception is not None:
    259     # pylint: disable=raising-bad-type
    260     # Pylint doesn't recognize that this is valid in this case.
--> 261     raise self._exception
    263 return self._result

FailedPrecondition: 400 Model server never became ready. Please validate that your model file or container configuration are valid. Model server logs can be found at https://console.cloud.google.com/logs/viewer?project=934458780151&resource=aiplatform.googleapis.com%2FDeploymentResourcePool&advancedFilter=resource.type%3D%22aiplatform.googleapis.com%2FDeploymentResourcePool%22%0Aresource.labels.deployment_resource_pool_id%3D%22internal_756661912002887680%22%0Aresource.labels.location%3D%22us-central1%22.
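
The URL at the end of the error encodes a Cloud Logging filter for the deployment's resource pool. The same entries can be read programmatically, which is handy when the container is crash-looping at startup. A minimal sketch using the google-cloud-logging client, with the project number and resource pool ID copied from the URL above:

from google.cloud import logging as cloud_logging

client = cloud_logging.Client(project="934458780151")

# Mirrors the filter embedded in the error's log-viewer URL.
log_filter = (
    'resource.type="aiplatform.googleapis.com/DeploymentResourcePool" '
    'resource.labels.deployment_resource_pool_id="internal_756661912002887680" '
    'resource.labels.location="us-central1"'
)

# Newest entries first; the model server's stdout/stderr should show why it
# never passed the readiness check.
for entry in client.list_entries(filter_=log_filter, order_by=cloud_logging.DESCENDING):
    print(entry.timestamp, entry.payload)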