Closed RichardScottOZ closed 4 months ago
Hi @RichardScottOZ - right now, we’re fairly “hands off” as to how you configure your AWS credentials. This will likely change in the medium term but for now, you have a number of options, such as: AWS environment variables
The arraylake client uses AioBotoCore (similar to boto3) and picks up credentials in the same way.
Is there another way you were hoping to configure access to S3?
Just wondering about possibilities
e.g. if I have profile [default] profile [boringothercompayone] profile [arraylake]
that sort of thing
so if doing notebook type tests would pass it some config type setup read from config as such - can you do similar with AioBotoCore?
This sort of thing
def get_aws_credentials(credentials_path=None):
    """Read AWS credentials from a shared credentials file.

    Parameters
    ----------
    credentials_path : str, optional
        Path to an INI-style AWS credentials file. Defaults to the
        standard location, ``~/.aws/credentials`` (portable via
        ``os.path.expanduser`` rather than a hardcoded Windows path).

    Returns
    -------
    dict
        Keys from the ``[default]`` profile upper-cased to the
        environment-variable convention (e.g. ``AWS_ACCESS_KEY_ID``).
        A ``REGION`` entry, if present, is renamed to ``AWS_REGION``.

    Raises
    ------
    configparser.NoSectionError
        If the file has no ``[default]`` section (including when the
        file does not exist, since ``parser.read`` silently skips
        missing files).
    """
    if credentials_path is None:
        credentials_path = os.path.expanduser('~/.aws/credentials')
    parser = configparser.RawConfigParser()
    parser.read(credentials_path)
    credentials = parser.items('default')
    all_credentials = {key.upper(): value for key, value in credentials}
    # The credentials file spells it "region"; callers expect "AWS_REGION".
    with contextlib.suppress(KeyError):
        all_credentials["AWS_REGION"] = all_credentials.pop("REGION")
    return all_credentials
# Load credentials from the local AWS config/credentials files and pull out
# the key pair expected by s3fs.
creds = get_aws_credentials()
access_key=creds['AWS_ACCESS_KEY_ID']
secret_key=creds['AWS_SECRET_ACCESS_KEY']
# NOTE(review): `client_kwargs` and `s3_path` are not defined in this snippet —
# presumably set earlier in the notebook (e.g. region/endpoint kwargs and the
# bucket/prefix of the Zarr store); confirm before reusing.
s3 = s3fs.S3FileSystem(anon=False, key=access_key, secret=secret_key, client_kwargs=client_kwargs)
# Wrap the S3 location as a MutableMapping so Zarr/Xarray can read it directly.
store = s3fs.S3Map(root=s3_path, s3=s3, check=False)
modeldata = xr.open_zarr(store=store, mask_and_scale=True)
so having explicitly set environment variables for access at the start and using the emailed example
# Connect to Arraylake; credentials/config are picked up from the environment.
client = al.Client()
repo_name = "OZ-Minerals/test"
# Open your data using Xarray
# ds = xr.open_dataset(...)
# or use the tutorial dataset (requires the 'pooch' package)
ds = xr.tutorial.open_dataset("air_temperature")
# Open your existing repository
repo = client.get_or_create_repo(repo_name)
repo.checkout()
# Write your dataset to Arraylake
# NOTE(review): zarr_version=3 is where the 422 error below is raised — the
# server rejects the request ('base_commit' field required); looks like a
# client/server version mismatch, confirm with `arraylake diagnostics`.
ds.to_zarr(repo.store, group="mygroup", zarr_version=3, mode="w")
#ds.to_zarr(repo.store, group="mygroup", mode="w")
# Make your first commit.
repo.commit("my first commit!")
---------------------------------------------------------------------------
HTTPStatusError Traceback (most recent call last)
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\api_utils.py:267, in handle_response(response)
266 try:
--> 267 response.raise_for_status()
268 except httpx.RequestError as exc:
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\httpx\_models.py:749, in Response.raise_for_status(self)
748 message = message.format(self, error_type=error_type)
--> 749 raise HTTPStatusError(message, request=request, response=self)
HTTPStatusError: Client error '422 Unprocessable Entity' for url 'https://api.earthmover.io/repos/OZ-Minerals/test/contents/metadata/_bulk_get?session_id=94888c9cb6c24e7baf4d42ca42fe3dee&commit_id='
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/422
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[4], line 13
11 repo.checkout()
12 # Write your dataset to Arraylake
---> 13 ds.to_zarr(repo.store, group="mygroup", zarr_version=3, mode="w")
14 #ds.to_zarr(repo.store, group="mygroup", mode="w")
15 # Make your first commit.
16 repo.commit("my first commit!")
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\xarray\core\dataset.py:2099, in Dataset.to_zarr(self, store, chunk_store, mode, synchronizer, group, encoding, compute, consolidated, append_dim, region, safe_chunks, storage_options, zarr_version)
1982 """Write dataset contents to a zarr group.
1983
1984 Zarr chunks are determined in the following way:
(...)
2095 The I/O user guide, with more details and examples.
2096 """
2097 from xarray.backends.api import to_zarr
-> 2099 return to_zarr( # type: ignore
2100 self,
2101 store=store,
2102 chunk_store=chunk_store,
2103 storage_options=storage_options,
2104 mode=mode,
2105 synchronizer=synchronizer,
2106 group=group,
2107 encoding=encoding,
2108 compute=compute,
2109 consolidated=consolidated,
2110 append_dim=append_dim,
2111 region=region,
2112 safe_chunks=safe_chunks,
2113 zarr_version=zarr_version,
2114 )
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\xarray\backends\api.py:1629, in to_zarr(dataset, store, chunk_store, mode, synchronizer, group, encoding, compute, consolidated, append_dim, region, safe_chunks, storage_options, zarr_version)
1627 already_consolidated = False
1628 consolidate_on_close = consolidated or consolidated is None
-> 1629 zstore = backends.ZarrStore.open_group(
1630 store=mapper,
1631 mode=mode,
1632 synchronizer=synchronizer,
1633 group=group,
1634 consolidated=already_consolidated,
1635 consolidate_on_close=consolidate_on_close,
1636 chunk_store=chunk_mapper,
1637 append_dim=append_dim,
1638 write_region=region,
1639 safe_chunks=safe_chunks,
1640 stacklevel=4, # for Dataset.to_zarr()
1641 zarr_version=zarr_version,
1642 )
1644 if mode in ["a", "r+"]:
1645 _validate_datatypes_for_zarr_append(zstore, dataset)
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\xarray\backends\zarr.py:425, in ZarrStore.open_group(cls, store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, append_dim, write_region, safe_chunks, stacklevel, zarr_version)
423 zarr_group = zarr.open_consolidated(store, **open_kwargs)
424 else:
--> 425 zarr_group = zarr.open_group(store, **open_kwargs)
426 return cls(
427 zarr_group,
428 mode,
(...)
432 safe_chunks,
433 )
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\zarr\hierarchy.py:1528, in open_group(store, mode, cache_attrs, synchronizer, path, chunk_store, storage_options, zarr_version, meta_array)
1525 raise GroupNotFoundError(path)
1527 elif mode == "w":
-> 1528 init_group(store, overwrite=True, path=path, chunk_store=chunk_store)
1530 elif mode == "a":
1531 if not contains_group(store, path=path):
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\zarr\storage.py:675, in init_group(store, overwrite, path, chunk_store)
672 store["zarr.json"] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore
674 # initialise metadata
--> 675 _init_group_metadata(store=store, overwrite=overwrite, path=path, chunk_store=chunk_store)
677 if store_version == 3:
678 # TODO: Should initializing a v3 group also create a corresponding
679 # empty folder under data/root/? I think probably not until there
680 # is actual data written there.
681 pass
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\zarr\storage.py:708, in _init_group_metadata(store, overwrite, path, chunk_store)
705 meta_prefix = meta_root + _path_to_prefix(path)
707 # attempt to delete any pre-existing array in store
--> 708 if array_meta_key in store:
709 store.erase(array_meta_key) # type: ignore
710 if group_meta_key in store:
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\repo.py:1380, in ArraylakeStore.__contains__(self, key)
1378 return self._repo._chunk_exists(key)
1379 elif is_meta_key(key):
-> 1380 return self._repo._doc_exists(key)
1381 else: # pragma: no cover
1382 # this should never actually happen because a valid key will always resolve
1383 # to either a meta key or a chunk key
1384 return False
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\repo.py:1041, in Repo._doc_exists(self, path)
1040 def _doc_exists(self, path: Path) -> bool:
-> 1041 return self._synchronize(self._arepo._doc_exists, path)
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\repo.py:958, in Repo._synchronize(self, method, *args, **kwargs)
954 @functools.wraps(method)
955 def wrap(*args, **kwargs):
956 return sync(method, *args, **kwargs)
--> 958 return wrap(*args, **kwargs)
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\repo.py:956, in Repo._synchronize.<locals>.wrap(*args, **kwargs)
954 @functools.wraps(method)
955 def wrap(*args, **kwargs):
--> 956 return sync(method, *args, **kwargs)
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\asyn.py:118, in sync(func, timeout, *args, **kwargs)
115 return_result = result[0]
116 if isinstance(return_result, BaseException):
117 # note: fsspec has special handling for asyncio.TimeoutError; here we just raise it
--> 118 raise return_result
119 else:
120 return return_result
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\asyn.py:71, in _runner(event, coro, result, timeout)
69 coro = asyncio.wait_for(coro, timeout=timeout)
70 try:
---> 71 result[0] = await coro
72 except Exception as ex:
73 result[0] = ex
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\repo.py:464, in AsyncRepo._doc_exists(self, path)
452 """Check if a doc exists in the metastore
453
454 Parameters
(...)
457 Document path
458 """
459 try:
460 # Here we are trading a small amount of extra data transfer
461 # (just getting the whole doc) in order to simplify our code.
462 # Since individual docs are all tiny, in practice, this should not have
463 # any performance consequence, as other sources of latency are much, much higher.
--> 464 await self._get_doc(path)
465 return True
466 except DocumentNotFoundError:
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\repo.py:445, in AsyncRepo._get_doc(self, path)
432 async def _get_doc(self, path: Path) -> Mapping[Path, dict]:
433 """Get a single document from the metastore
434
435 Parameters
(...)
443 Document contents
444 """
--> 445 result = await self._get_docs([path])
446 try:
447 return result[path]
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\repo.py:483, in AsyncRepo._get_docs(self, paths)
470 """Get multiple documents from the metastore
471
472 Parameters
(...)
480 Mapping where keys are document paths and values are documents in the form of dictionaries.
481 """
482 # Here we do what fsspec does and just OMIT the missing paths from the dictionary
--> 483 db_results = {
484 doc.path: doc.content
485 async for doc in self.db.get_docs(
486 paths, collection=metadata_collection, session_id=self.session["id"], commit_id=self.session["base_commit"]
487 )
488 }
489 return db_results
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\repo.py:483, in <dictcomp>(.0)
470 """Get multiple documents from the metastore
471
472 Parameters
(...)
480 Mapping where keys are document paths and values are documents in the form of dictionaries.
481 """
482 # Here we do what fsspec does and just OMIT the missing paths from the dictionary
--> 483 db_results = {
484 doc.path: doc.content
485 async for doc in self.db.get_docs(
486 paths, collection=metadata_collection, session_id=self.session["id"], commit_id=self.session["base_commit"]
487 )
488 }
489 return db_results
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\metastore\http_metastore.py:264, in HttpMetastoreDatabase.get_docs(self, paths, collection, session_id, commit_id)
258 async def get_docs(
259 self, paths: Sequence[Path], *, collection: CollectionName, session_id: SessionID, commit_id: Optional[CommitID] = None
260 ) -> AsyncGenerator[DocResponse, None]:
261 # remove dupes from request; is there a cheaper way of doing this? seems like a lot of overhead for every call
262 paths = list(set(paths))
--> 264 results = await asyncio.gather(
265 *(
266 self._get_docs(paths_batch, collection, session_id=session_id, commit_id=commit_id)
267 for paths_batch in chunks(paths, BATCH_SIZE)
268 )
269 )
271 for result in results:
272 for doc in result:
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\metastore\http_metastore.py:255, in HttpMetastoreDatabase._get_docs(self, paths, collection, session_id, commit_id)
253 params = {"session_id": session_id, "commit_id": str(commit_id) if commit_id else None}
254 response = await self._request("POST", f"{self._repo_path}/contents/{collection}/_bulk_get", json=paths, params=params)
--> 255 handle_response(response)
256 return [DocResponse(**item) for item in response.json()]
File ~\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake_client\api_utils.py:277, in handle_response(response)
273 # we can consider a 422 an explicit message from the serves that something was invalid but handled about
274 # the user input, and surface this directly to the caller. for other, less clear cases,
275 # return a more complete message including the API url.
276 if exc.response.status_code == 422:
--> 277 raise ValueError(response.json()["detail"])
278 else:
279 raise ValueError(
280 f"Error response {exc.response.status_code} while requesting {exc.request.url!r}. {response}: {response.read()}"
281 )
ValueError: [{'loc': ['query', 'base_commit'], 'msg': 'field required', 'type': 'value_error.missing'}]
note here that test already existed - I made it via the web a week ago
same error if I make it 'test2' however
@RichardScottOZ - can you post the output from arraylake --diagnostics
? This looks like a version issue.
To your prior point about setting s3 connection parameters: There are a few things you can do:
Use the arraylake config:
from arraylake import Client, config
config.set({"s3": {"aws_secret_access_key": ..., "aws_access_key_id": ..., "region_id": ...}})
client = Client()
You can also configure aws to use a different profile using environment variables or using boto3. This might look like:
os.environ['AWS_PROFILE'] = 'default'
# or
boto3.setup_default_session(profile_name='default')
client = Client()
...
See this SO post for more details.
Try 'arraylake -h' for help.
+- Error ---------------------------------------------------------------------+
| No such option: --diagnostics |
+-----------------------------------------------------------------------------+
I upgraded python so now
arraylake-client 0.6.0 pyhd8ed1ab_0 conda-forge and python 3.9.18
same error
Trying an upgrade for arraylake-client
conda-forge/win-64 Using cache
conda-forge/noarch Using cache
pkgs/r/win-64 No change
pkgs/main/noarch No change
pkgs/r/noarch No change
pkgs/main/win-64 No change
pkgs/msys2/win-64 No change
pkgs/msys2/noarch No change
Pinned packages:
- python 3.9.*
Could not solve for environment specs
Encountered problems while solving:
- nothing provides uvloop >=0.17,<1 needed by arraylake-client-0.7.0-pyhd8ed1ab_0
The environment can't be solved, aborting the operation
and no windows version https://anaconda.org/conda-forge/uvloop
so failure
@RichardScottOZ - it looks like conda is not finding a recent version of arraylake for your environment. Would you mind trying pip:
pip install arraylake
If that is not an option, try:
conda install -v arraylake
that won't make any difference if no uvloop for windows will it - or have you packaged something? happy to try whatever though
I likely won't get a chance to try on linux until tomorrow
info libmamba Problem count: 1
Could not solve for environment specs
Encountered problems while solving:
- nothing provides __unix needed by arraylake-0.7.2-pyhd8ed1ab_0
The environment can't be solved, aborting the operation
info libmamba Freeing solver.
info libmamba Freeing pool.
uvloop
is an optional dependency of arraylake. It shouldn't be showing up as a dependency on windows at all.
It seems like our conda-forge configuration isn't quite right. We can address this tomorrow. In the meantime, I suggest trying out pip
.
(pyvistaxarray) C:\Users\rnmsc>pip install uvloop
Collecting uvloop
Downloading uvloop-0.19.0.tar.gz (2.3 MB)
---------------------------------------- 2.3/2.3 MB 825.3 kB/s eta 0:00:00
Installing build dependencies ... done
Getting requirements to build wheel ... error
error: subprocess-exited-with-error
× Getting requirements to build wheel did not run successfully.
│ exit code: 1
╰─> [15 lines of output]
Traceback (most recent call last):
File "C:\Users\rnmsc\anaconda3\envs\pyvistaxarray\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py", line 351, in <module>
main()
File "C:\Users\rnmsc\anaconda3\envs\pyvistaxarray\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py", line 333, in main
json_out['return_val'] = hook(**hook_input['kwargs'])
File "C:\Users\rnmsc\anaconda3\envs\pyvistaxarray\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py", line 118, in get_requires_for_build_wheel
return hook(config_settings)
File "C:\Users\rnmsc\AppData\Local\Temp\pip-build-env-cr_2mbbo\overlay\Lib\site-packages\setuptools\build_meta.py", line 355, in get_requires_for_build_wheel
return self._get_build_requires(config_settings, requirements=['wheel'])
File "C:\Users\rnmsc\AppData\Local\Temp\pip-build-env-cr_2mbbo\overlay\Lib\site-packages\setuptools\build_meta.py", line 325, in _get_build_requires
self.run_setup()
File "C:\Users\rnmsc\AppData\Local\Temp\pip-build-env-cr_2mbbo\overlay\Lib\site-packages\setuptools\build_meta.py", line 341, in run_setup
exec(code, locals())
File "<string>", line 8, in <module>
RuntimeError: uvloop does not support Windows at the moment
[end of output]
note: This error originates from a subprocess, and is likely not a problem with pip.
error: subprocess-exited-with-error
× Getting requirements to build wheel did not run successfully.
│ exit code: 1
╰─> See above for output.
when I tried yesterday
I will give the pip suggestion a go now re: arraylake
Ok so that installed, anyway.
UserDiagnostics(
C:\Users\rnmsc\anaconda3\envs\pyvistaxarray\lib\site-packages\arraylake\config.py:61: UserWarning: Migrated C:\Users\rnmsc\.config\arraylake_client\config.yaml to C:\Users\rnmsc\.config\arraylake\config.yaml.
handle_rename()
system={
'python': '3.9.18 | packaged by conda-forge | (main, Aug 30 2023,
03:40:31) [MSC v.1929 64 bit (AMD64)]',
'python-bits': '64',
'OS': 'Windows',
'OS-release': '10',
'machine': 'AMD64',
'processor': 'Intel64 Family 6 Model 165 Stepping 2, GenuineIntel',
'byteorder': 'little',
'LC_ALL': 'None',
'LANG': 'None',
'LOCALE': "('English_Australia', '1252')"
},
versions={
'arraylake': '0.7.2',
'aiobotocore': '2.7.0',
'uvloop': 'none',
'zarr': '2.16.0',
'numcodecs': '0.12.1',
'numpy': '1.26.0',
'donfig': '0.8.1.post0',
'pydantic': '1.10.13',
'httpx': '0.25.0',
'ruamel.yaml': '0.18.2',
'typer': '0.9.0',
'rich': 'installed',
'fsspec': '2023.10.0',
'kerchunk': 'none',
'h5py': 'none',
's3fs': 'none',
'cachetools': '5.3.2',
'structlog': '23.2.0',
'ipytree': 'none',
'xarray': '2023.10.1',
'dateutil': '2.8.2',
'click': '8.1.3',
'dask': '2023.5.0',
'distributed': '2023.5.0'
},
config={'chunkstore.hash_method': 'hashlib.sha256'},
service={
'service_uri': 'https://api.earthmover.io',
'service_version': '0.7.1.post23.dev0+f29772d.dirty'
}
)
Huzza! this all looks very healthy! You should be good to go now.
One possible wrinkle - arraylake and arraylake_client both in environment now after the messing around - can that cause a problem?
arraylake 0.7.2 pypi_0 pypi
arraylake-client 0.6.0 pyhd8ed1ab_0 conda-forge
This should be fine. Use arraylake
from now on. 0.7.2 is the release we made on Friday and is the package name you should use going forward. If you like, you can uninstall the arraylake-client
package but that is not strictly required.
And yes, your import should now be:
import arraylake
some of the previous tests actually created repos too it seemed - hadn't looked until now
and s3 I have this:
PRE arraylake/
2023-10-30 13:29:28 365528 arraylake04a3a5a41a3bc01d186bae0dc902b47cab25270b816dafa72f5ca084b3892b16
2023-10-30 13:29:28 397127 arraylake16d2dd0c841ac6c29998d8ef934c2ef2bcf3a9f86f6a4b7c9282bb78f4278280
2023-10-30 13:29:28 403625 arraylake181272f95f1b3df932ce1ac39819ea16ef63073d9b619cac15c870b789120341
2023-10-30 13:29:14 118 arraylake1ebde77055aee62e923675a9bb49373b2e5784f332e485abf08f64b2303a066b
2023-10-30 13:29:28 396393 arraylake549fb77d73eb51b88b36c7fb8a6c8e37ce1b59df9f32033317c7d0b0b9e8bc5b
2023-10-30 13:29:33 369757 arraylake5809db856e5717203676b3ccbd640d19aa1ff97d341e94e00d3f880403dd7460
2023-10-30 13:29:09 116 arraylake6c8fa202409c18d8a55539b824d81cc43bd9f017f1a60ff3ca684136d8ee6f53
2023-10-30 13:29:34 364554 arraylake7ded0b020e0a36963bdab9a567c9a94de7d07788bbdc1d66f790b0354eb7052a
2023-10-30 13:29:28 370434 arraylake845b90e07d3f3399278407aae44949e60b44a1fbdfdb4d3d1822c30d6c8db54f
2023-10-30 13:29:34 348643 arraylake87a8176b69e99f83f0af929c58abc7912f415be92e71fff921a97ca14882e125
2023-10-30 13:29:34 394148 arraylake97589f482e8701b2fdd8e5203832396dbd409cb9f36e6dc7b593614887787cb5
2023-10-30 13:29:28 347419 arraylakebd1915579583361ef465ad8d8b733b16a03474f7bc580ab02f535853340558e8
2023-10-30 13:29:28 409236 arraylakec1077dca18dbefa136006a398e0c22951f0abbb7fbe5b50c6da6654d1a6571a3
2023-10-30 13:29:28 352805 arraylakec4fdea5d27215f5c3bc80902d5d4b0cda598627f1ef177d9b1a9ff2e67ffd4f4
2023-10-30 13:29:28 408193 arraylakecead9997448adbabfed395429f938a78736c0b6b3586ab22d2a9403373991b56
2023-10-30 13:29:34 403435 arraylaked8c4de9faa221a75347d37edfe99944a3de3cb3e1952c31ceef223c55291d8e6
2023-10-30 13:29:33 355910 arraylaked967c09dff054fce88400013d247389bc93c00613970bba760da16fb54948dd6
2023-10-30 13:29:28 398570 arraylakee0e863980a465d40c3131a22eb3a306d5e19883793c5ca8a2ca642dea94e7123
2023-10-30 13:29:18 2549 arraylakefa8c1c7cfa079f1e425b33b81f7aeb9994be8bbca85347c49212ee35845121ea
So how would you explain that to the layperson who is not us?
So how would you explain that to the layperson who is not us?
What would we be explaining? The object names in your S3 bucket?
I think we have fixed our windows conda-forge build problem. Would you mind testing it out please?
So how would you explain that to the layperson who is not us?
What would we be explaining? The object names in your S3 bucket?
Not mine specifically, but it would be fine as an example - as in when you do this you will expect to see X, this is why. More layperson than people in this conversation.
I think we have fixed our windows conda-forge build problem. Would you mind testing it out please?
Sure, hopefully get to it shortly.
In addition to this setting, AWS credentials with appropriate write access to your target S3 bucket should be available in your environment. - so is this saying they have to be environment variables?