dask / dask-cloudprovider

Cloud provider cluster managers for Dask. Supports AWS, Google Cloud, Azure, and more.
https://cloudprovider.dask.org
BSD 3-Clause "New" or "Revised" License

(GCP) `GCPCluster` Throws OAuth errors when attempting to create a new cluster. #281

Open drobison00 opened 3 years ago

drobison00 commented 3 years ago

What happened: RefreshError: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', '{"error":"invalid_scope","error_description":"Invalid OAuth scope or ID token audience provided."}')

What you expected to happen: Cluster creation to succeed

Minimal Complete Verifiable Example:

# Dask config (e.g. ~/.config/dask/cloudprovider.yaml)
cloudprovider:
   gcp:
      projectid: "<correct project>"

from dask.distributed import Client, wait, get_worker
from dask_cloudprovider.gcp import GCPCluster

cluster = GCPCluster(projectid="<correct project>",
                     machine_type="n1-standard-4",
                     zone="us-central1-c",
                     ngpus=1,
                     gpu_type="nvidia-tesla-t4",
                     n_workers=1)
client = Client(cluster)
---------------------------------------------------------------------------
RefreshError                              Traceback (most recent call last)
<ipython-input-6-032f333965aa> in <module>
----> 1 cluster = GCPCluster(projectid="<correct project>",
      2                      machine_type="n1-standard-4",
      3                      zone="us-central1-c",
      4                      ngpus=1,
      5                      gpu_type="nvidia-tesla-t4",

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/gcp/instances.py in __init__(self, projectid, zone, network, machine_type, on_host_maintenance, source_image, docker_image, ngpus, gpu_type, filesystem_size, disk_type, auto_shutdown, bootstrap, preemptible, debug, **kwargs)
    601         self.worker_options = {**self.options}
    602 
--> 603         super().__init__(debug=debug, **kwargs)
    604 
    605 

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/generic/vmcluster.py in __init__(self, n_workers, worker_class, worker_options, scheduler_options, docker_image, docker_args, env_vars, security, protocol, debug, **kwargs)
    287         self.uuid = str(uuid.uuid4())[:8]
    288 
--> 289         super().__init__(**kwargs, security=self.security)
    290 
    291     async def call_async(self, f, *args, **kwargs):

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/deploy/spec.py in __init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name, shutdown_on_close)
    279         if not self.asynchronous:
    280             self._loop_runner.start()
--> 281             self.sync(self._start)
    282             self.sync(self._correct_state)
    283 

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    187             return future
    188         else:
--> 189             return sync(self.loop, func, *args, **kwargs)
    190 
    191     def _log(self, log):

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    338     if error[0]:
    339         typ, exc, tb = error[0]
--> 340         raise exc.with_traceback(tb)
    341     else:
    342         return result[0]

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/utils.py in f()
    322             if callback_timeout is not None:
    323                 future = asyncio.wait_for(future, callback_timeout)
--> 324             result[0] = yield future
    325         except Exception as exc:
    326             error[0] = sys.exc_info()

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/tornado/gen.py in run(self)
    760 
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/generic/vmcluster.py in _start(self)
    327             "Hang tight! ",
    328         ):
--> 329             await super()._start()
    330 
    331     def render_process_cloud_init(self, process):

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/deploy/spec.py in _start(self)
    307 
    308         self.status = Status.starting
--> 309         self.scheduler = await self.scheduler
    310         self.scheduler_comm = rpc(
    311             getattr(self.scheduler, "external_address", None) or self.scheduler.address,

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/distributed/deploy/spec.py in _()
     69             async with self.lock:
     70                 if self.status == Status.created:
---> 71                     await self.start()
     72                     assert self.status == Status.running
     73             return self

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/gcp/instances.py in start(self)
    281 
    282     async def start(self):
--> 283         await self.start_scheduler()
    284         self.status = Status.running
    285 

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/gcp/instances.py in start_scheduler(self)
    296         )
    297         self.cluster._log("Creating scheduler instance")
--> 298         self.internal_ip, self.external_ip = await self.create_vm()
    299 
    300         if self.config.get("public_ingress", True) and not is_inside_gce():

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/gcp/instances.py in create_vm(self)
    196 
    197         try:
--> 198             inst = await self.cluster.call_async(
    199                 self.cluster.compute.instances()
    200                 .insert(project=self.projectid, zone=self.zone, body=self.gcp_config)

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/generic/vmcluster.py in call_async(self, f, *args, **kwargs)
    302             return_when=asyncio.ALL_COMPLETED,
    303         )
--> 304         return done.result()
    305 
    306     async def _start(

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/concurrent/futures/thread.py in run(self)
     55 
     56         try:
---> 57             result = self.fn(*self.args, **self.kwargs)
     58         except BaseException as exc:
     59             self.future.set_exception(exc)

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/dask_cloudprovider/generic/vmcluster.py in <lambda>()
    299         """
    300         [done], _ = await asyncio.wait(
--> 301             fs={self.loop.run_in_executor(None, lambda: f(*args, **kwargs))},
    302             return_when=asyncio.ALL_COMPLETED,
    303         )

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/googleapiclient/_helpers.py in positional_wrapper(*args, **kwargs)
    132                 elif positional_parameters_enforcement == POSITIONAL_WARNING:
    133                     logger.warning(message)
--> 134             return wrapped(*args, **kwargs)
    135 
    136         return positional_wrapper

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/googleapiclient/http.py in execute(self, http, num_retries)
    918 
    919         # Handle retries for server-side errors.
--> 920         resp, content = _retry_request(
    921             http,
    922             num_retries,

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/googleapiclient/http.py in _retry_request(http, num_retries, req_type, sleep, rand, uri, method, *args, **kwargs)
    189         try:
    190             exception = None
--> 191             resp, content = http.request(uri, method, *args, **kwargs)
    192         # Retry on SSL errors and socket timeout errors.
    193         except _ssl_SSLError as ssl_error:

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google_auth_httplib2.py in request(self, uri, method, body, headers, redirections, connection_type, **kwargs)
    207         request_headers = headers.copy() if headers is not None else {}
    208 
--> 209         self.credentials.before_request(self._request, method, uri, request_headers)
    210 
    211         # Check if the body is a file-like stream, and if so, save the body

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google/auth/credentials.py in before_request(self, request, method, url, headers)
    131         # the http request.)
    132         if not self.valid:
--> 133             self.refresh(request)
    134         self.apply(headers)
    135 

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google/oauth2/service_account.py in refresh(self, request)
    359     def refresh(self, request):
    360         assertion = self._make_authorization_grant_assertion()
--> 361         access_token, expiry, _ = _client.jwt_grant(request, self._token_uri, assertion)
    362         self.token = access_token
    363         self.expiry = expiry

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google/oauth2/_client.py in jwt_grant(request, token_uri, assertion)
    151     body = {"assertion": assertion, "grant_type": _JWT_GRANT_TYPE}
    152 
--> 153     response_data = _token_endpoint_request(request, token_uri, body)
    154 
    155     try:

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google/oauth2/_client.py in _token_endpoint_request(request, token_uri, body)
    122                 retry += 1
    123                 continue
--> 124             _handle_error_response(response_body)
    125 
    126     return response_data

~/anaconda3/envs/rapids-core-0.18/lib/python3.8/site-packages/google/oauth2/_client.py in _handle_error_response(response_body)
     58         error_details = response_body
     59 
---> 60     raise exceptions.RefreshError(error_details, response_body)
     61 
     62 

RefreshError: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', '{"error":"invalid_scope","error_description":"Invalid OAuth scope or ID token audience provided."}')

Environment: Conda environment
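
For what it's worth, a quick way to check which credentials the Google client libraries resolve (these are the credentials whose refresh fails in the traceback above) is something like the sketch below; google.auth.default() is the standard google-auth entry point, and the scope shown is only illustrative:

# Hedged diagnostic sketch: inspect the application default credentials that
# googleapiclient (and hence dask-cloudprovider) will end up using.
import google.auth

# Returns the resolved credentials object and the default project (may be None).
credentials, project = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]  # illustrative scope
)
print(type(credentials), project)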

quasiben commented 3 years ago

Hmm, I'm not sure what the issue is here. In addition to running gsutil commands, can you also try a few gcloud compute commands to create machines? Perhaps your account does not have the correct permissions to create compute instances?

drobison00 commented 3 years ago

@quasiben Just checked; something like this works fine with gcloud compute:

gcloud compute instances create drobisontest \
    --project "<correct-project>" \
    --machine-type "a2-highgpu-1g" \
    --zone "us-central1-c" \
    --image-family tf2-ent-2-3-cu110 \
    --image-project deeplearning-platform-release \
    --boot-disk-size 200GB \
    --metadata "install-nvidia-driver=True,proxy-mode=project_editors" \
    --scopes https://www.googleapis.com/auth/cloud-platform \
    --maintenance-policy TERMINATE \
    --restart-on-failure
NAME          ZONE           MACHINE_TYPE   PREEMPTIBLE  INTERNAL_IP  EXTERNAL_IP   STATUS
drobisontest  us-central1-c  a2-highgpu-1g              .....  .....  RUNNING
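
Note that gcloud authenticates with its own cached credentials, not the ones google-auth resolves, so a closer reproduction of what dask-cloudprovider does is to call the Compute API through googleapiclient directly. A minimal sketch, with placeholder project and zone:

# Hedged sketch: hit the Compute API via googleapiclient with default
# credentials, the same path GCPCluster takes in the traceback above.
import googleapiclient.discovery

compute = googleapiclient.discovery.build("compute", "v1")
# A read-only call is enough to trigger the same token refresh as instances().insert().
resp = compute.instances().list(project="<correct-project>", zone="us-central1-c").execute()
print([inst["name"] for inst in resp.get("items", [])])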
akaanirban commented 3 years ago

I ran this recently on GCP and was unable to reproduce the RefreshError: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', '{"error":"invalid_scope","error_description":"Invalid OAuth scope or ID token audience provided."}') error; something may have fixed it since this was reported.

However, I did run into similar issues as issue #292. Looking at cloud-init-output.log, it appears that the scheduler VM shuts down when trying to start the daskdev/dask:latest Docker image with the following error: docker: Error response from daemon: OCI runtime create failed: container_linux.go:380: starting container process caused: process_linux.go:545: container init caused: Running hook #0:: error running hook: exit status 1, stdout: , stderr: nvidia-container-cli: initialization error: nvml error: driver not loaded: unknown.

Further, I tried using an existing custom NGC image, like the following:

from dask.distributed import Client, wait, get_worker
from dask_cloudprovider.gcp import GCPCluster
cluster = GCPCluster(projectid="nv-ai-infra",
                     machine_type="n1-standard-4",
                     zone="us-central1-a",
                     ngpus=1,
                     gpu_type="nvidia-tesla-v100",
                     n_workers=1,
                     source_image="projects/nvidia-ngc-public/global/images/nvidia-gpu-cloud-image-pytorch-20210609",
                     debug=True,
                     bootstrap=False,
                     silence_logs=False)

This fails with the same error. I would imagine that passing a custom image with the NVIDIA drivers preinstalled would probably work. Is there such an image?

jacobtomlinson commented 3 years ago

Any of the RAPIDS images should be ok.
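
For example, something along these lines should work; the image tag below is illustrative, so pick a current one from Docker Hub:

# Hedged sketch: point docker_image at a RAPIDS container, which ships the
# CUDA userspace pieces the default daskdev/dask image lacks. Tag is illustrative.
from dask_cloudprovider.gcp import GCPCluster

cluster = GCPCluster(
    projectid="<your-project>",
    zone="us-central1-a",
    machine_type="n1-standard-4",
    ngpus=1,
    gpu_type="nvidia-tesla-t4",
    docker_image="rapidsai/rapidsai:21.06-cuda11.0-runtime-ubuntu18.04-py3.8",  # illustrative tag
    n_workers=1,
)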

tszumowski commented 2 years ago

Just wanted to mention that I ran into the same issue as @drobison00. No need for me to paste the output; it's exactly the same error.

RefreshError: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', '{"error":"invalid_scope","error_description":"Invalid OAuth scope or ID token audience provided."}')

This is with the example from the docs:

from dask.distributed import Client
from dask_cloudprovider.gcp import GCPCluster

cluster = GCPCluster(projectid=[PROJECT], machine_type="n1-standard-4", zone="us-east1-b")
client = Client(cluster)

The only way I got it to work was by:

  1. creating a service account,
  2. granting the Service Account User IAM role to both myself and the service account (not sure if both were needed), and
  3. using the service account key when calling the above (see the sketch at the end of this comment).

I don't know if this issue is unique to Dask, though. I generally have OAuth token issues with several Python libraries that try to use a subset of GCP services, particularly via the REST APIs, e.g. Google Sheets.
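
For anyone hitting the same thing, here is a minimal sketch of the service-account workaround from the list above. GOOGLE_APPLICATION_CREDENTIALS is the standard google-auth mechanism for pointing at a key file; the path and project are placeholders:

# Hedged sketch of the service-account workaround: export the key file path
# before creating the cluster so google-auth picks it up as the default credentials.
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account-key.json"  # placeholder

from dask.distributed import Client
from dask_cloudprovider.gcp import GCPCluster

cluster = GCPCluster(projectid="<your-project>", machine_type="n1-standard-4", zone="us-east1-b")
client = Client(cluster)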