dask / dask-cloudprovider

Cloud provider cluster managers for Dask. Supports AWS, Google Cloud, Azure and more...
https://cloudprovider.dask.org
BSD 3-Clause "New" or "Revised" License
130 stars 107 forks source link

Problem with cleanup and region #71

Open rsignell-usgs opened 4 years ago

rsignell-usgs commented 4 years ago

For some reason cleanup is not recognizing the region:

from dask_cloudprovider import FargateCluster
cluster = FargateCluster(n_workers=1, 
                         image='rsignell/pangeo-worker:2020-01-23c', region_name='us-west-2')

gives

---------------------------------------------------------------------------
NoRegionError                             Traceback (most recent call last)
<timed exec> in <module>

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/dask_cloudprovider/providers/aws/ecs.py in __init__(self, **kwargs)
   1099 
   1100     def __init__(self, **kwargs):
-> 1101         super().__init__(fargate_scheduler=True, fargate_workers=True, **kwargs)
   1102 
   1103 

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/dask_cloudprovider/providers/aws/ecs.py in __init__(self, fargate_scheduler, fargate_workers, image, scheduler_cpu, scheduler_mem, scheduler_timeout, worker_cpu, worker_mem, worker_gpu, n_workers, cluster_arn, cluster_name_template, execution_role_arn, task_role_arn, task_role_policies, cloudwatch_logs_group, cloudwatch_logs_stream_prefix, cloudwatch_logs_default_retention, vpc, subnets, security_groups, environment, tags, find_address_timeout, skip_cleanup, aws_access_key_id, aws_secret_access_key, region_name, **kwargs)
    593         self._region_name = region_name
    594         self._lock = asyncio.Lock()
--> 595         super().__init__(**kwargs)
    596 
    597     async def _start(self,):

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/distributed/deploy/spec.py in __init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name)
    254         if not self.asynchronous:
    255             self._loop_runner.start()
--> 256             self.sync(self._start)
    257             self.sync(self._correct_state)
    258 

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    158             return future
    159         else:
--> 160             return sync(self.loop, func, *args, **kwargs)
    161 
    162     async def _logs(self, scheduler=True, workers=True):

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    346     if error[0]:
    347         typ, exc, tb = error[0]
--> 348         raise exc.with_traceback(tb)
    349     else:
    350         return result[0]

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/distributed/utils.py in f()
    330             if callback_timeout is not None:
    331                 future = asyncio.wait_for(future, callback_timeout)
--> 332             result[0] = yield future
    333         except Exception as exc:
    334             error[0] = sys.exc_info()

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/tornado/gen.py in run(self)
    733 
    734                     try:
--> 735                         value = future.result()
    736                     except Exception:
    737                         exc_info = sys.exc_info()

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/dask_cloudprovider/providers/aws/ecs.py in _start(self)
    609             self._skip_cleanup = self.config.get("skip_cleanup")
    610         if not self._skip_cleanup:
--> 611             await _cleanup_stale_resources()
    612 
    613         self._clients = await self._get_clients(

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/dask_cloudprovider/providers/aws/ecs.py in _cleanup_stale_resources()
   1118     # Clean up clusters (clusters with no running tasks)
   1119     session = aiobotocore.get_session()
-> 1120     async with session.create_client("ecs") as ecs:
   1121         active_clusters = []
   1122         clusters_to_delete = []

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/aiobotocore/session.py in create_client(self, service_name, region_name, api_version, use_ssl, verify, endpoint_url, aws_access_key_id, aws_secret_access_key, aws_session_token, config)
     74             is_secure=use_ssl, endpoint_url=endpoint_url, verify=verify,
     75             credentials=credentials, scoped_config=self.get_scoped_config(),
---> 76             client_config=config, api_version=api_version)
     77         monitor = self._get_internal_component('monitor')
     78         if monitor is not None:

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/botocore/client.py in create_client(self, service_name, region_name, is_secure, endpoint_url, verify, credentials, scoped_config, api_version, client_config)
     83         client_args = self._get_client_args(
     84             service_model, region_name, is_secure, endpoint_url,
---> 85             verify, credentials, scoped_config, client_config, endpoint_bridge)
     86         service_client = cls(**client_args)
     87         self._register_retries(service_client)

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/aiobotocore/client.py in _get_client_args(self, service_model, region_name, is_secure, endpoint_url, verify, credentials, scoped_config, client_config, endpoint_bridge)
     39         return args_creator.get_client_args(
     40             service_model, region_name, is_secure, endpoint_url,
---> 41             verify, credentials, scoped_config, client_config, endpoint_bridge)
     42 
     43 

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/aiobotocore/args.py in get_client_args(self, service_model, region_name, is_secure, endpoint_url, verify, credentials, scoped_config, client_config, endpoint_bridge)
     18         final_args = self.compute_client_args(
     19             service_model, client_config, endpoint_bridge, region_name,
---> 20             endpoint_url, is_secure, scoped_config)
     21 
     22         # service_name = final_args['service_name']

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/botocore/args.py in compute_client_args(self, service_model, client_config, endpoint_bridge, region_name, endpoint_url, is_secure, scoped_config)
    151             is_secure=is_secure,
    152             endpoint_bridge=endpoint_bridge,
--> 153             s3_config=s3_config,
    154         )
    155         # Create a new client config to be passed to the client based

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/botocore/args.py in _compute_endpoint_config(self, service_name, region_name, endpoint_url, is_secure, endpoint_bridge, s3_config)
    216         if service_name == 'sts':
    217             return self._compute_sts_endpoint_config(**resolve_endpoint_kwargs)
--> 218         return self._resolve_endpoint(**resolve_endpoint_kwargs)
    219 
    220     def _compute_s3_endpoint_config(self, s3_config,

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/botocore/args.py in _resolve_endpoint(self, service_name, region_name, endpoint_url, is_secure, endpoint_bridge)
    299                           endpoint_url, is_secure, endpoint_bridge):
    300         return endpoint_bridge.resolve(
--> 301             service_name, region_name, endpoint_url, is_secure)
    302 
    303     def _compute_socket_options(self, scoped_config):

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/botocore/client.py in resolve(self, service_name, region_name, endpoint_url, is_secure)
    359         region_name = self._check_default_region(service_name, region_name)
    360         resolved = self.endpoint_resolver.construct_endpoint(
--> 361             service_name, region_name)
    362         if resolved:
    363             return self._create_endpoint(

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/botocore/regions.py in construct_endpoint(self, service_name, region_name)
    120         for partition in self._endpoint_data['partitions']:
    121             result = self._endpoint_for_partition(
--> 122                 partition, service_name, region_name)
    123             if result:
    124                 return result

~/SageMaker/myenvs/pangeo/lib/python3.7/site-packages/botocore/regions.py in _endpoint_for_partition(self, partition, service_name, region_name)
    133                 region_name = service_data['partitionEndpoint']
    134             else:
--> 135                 raise NoRegionError()
    136         # Attempt to resolve the exact region for this partition.
    137         if region_name in service_data['endpoints']:

NoRegionError: You must specify a region.

You can work around this by specifying:

import os
os.environ['AWS_DEFAULT_REGION'] = 'us-west-2'

or by setting skip_cleanup=True:

from dask_cloudprovider import FargateCluster
cluster = FargateCluster(n_workers=1, 
                         image='rsignell/pangeo-worker:2020-01-23c',
                         skip_cleanup=True)
jacobtomlinson commented 4 years ago

Thanks for raising this @rsignell-usgs.

It looks like when we clean up we create a new boto client, and we are not passing it the same kwargs that we pass when creating the regular ones.

Regular clients: https://github.com/dask/dask-cloudprovider/blob/c041218d43f64701469c8a9ba97f79f0c9761e6d/dask_cloudprovider/providers/aws/ecs.py#L778 and https://github.com/dask/dask-cloudprovider/blob/c041218d43f64701469c8a9ba97f79f0c9761e6d/dask_cloudprovider/providers/aws/ecs.py#L613-L617

Cleanup client: https://github.com/dask/dask-cloudprovider/blob/c041218d43f64701469c8a9ba97f79f0c9761e6d/dask_cloudprovider/providers/aws/ecs.py#L1120


So it looks like this would also be an issue if the keys were passed in programmatically or via config too. We should pass these options on to the cleanup function.