Open raybellwaves opened 4 years ago
Have you tried fs.get(rpath, lpath, recursive=True)?
On Oct 5, 2020, at 11:16 AM, Ray Bell notifications@github.com wrote:
I have a partitioned parquet file in Blob Storage. I can copy it to local using a wildcard with get. Is there syntax to grab the folder without having to specify the wildcard?
from adlfs import AzureBlobFileSystem
storage_options = {"account_name": "ACCOUNT_NAME", "account_key": "ACCOUNT_KEY"}
fs = AzureBlobFileSystem(**storage_options)
fs.get("file.parquet/*", "file.parquet")
For reference, if I remove the wildcard I get
fs.get("file.parquet", "file.parquet")
FileNotFoundError Traceback (most recent call last)
in ----> 1 fs.get(remote_folder + "voyage_vcap_perk_obr.parquet", 'tmp.parquet') ~\AppData\Local\Continuum\anaconda3\envs\env\lib\site-packages\fsspec\asyn.py in get(self, rpath, lpath, recursive, **kwargs) 276 lpaths = other_paths(rpaths, lpath) 277 [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths] --> 278 return sync(self.loop, self._get, rpaths, lpaths) 279 280 ~\AppData\Local\Continuum\anaconda3\envs\env\lib\site-packages\fsspec\asyn.py in sync(loop, func, callback_timeout, *args, **kwargs) 66 if error[0]: 67 typ, exc, tb = error[0] ---> 68 raise exc.with_traceback(tb) 69 else: 70 return result[0] ~\AppData\Local\Continuum\anaconda3\envs\env\lib\site-packages\fsspec\asyn.py in f() 50 if callback_timeout is not None: 51 future = asyncio.wait_for(future, callback_timeout) ---> 52 result[0] = await future 53 except Exception: 54 error[0] = sys.exc_info() ~\AppData\Local\Continuum\anaconda3\envs\env\lib\site-packages\fsspec\asyn.py in _get(self, rpaths, lpaths, **kwargs) 261 dirs = [os.path.dirname(lp) for lp in lpaths] 262 [os.makedirs(d, exist_ok=True) for d in dirs] --> 263 return await asyncio.gather( 264 *[ 265 self._get_file(rpath, lpath, **kwargs) ~\AppData\Local\Continuum\anaconda3\envs\env\lib\site-packages\adlfs\spec.py in _get_file(self, rpath, lpath, recursive, delimiter, **kwargs) 1193 print(files) 1194 if rpath not in files: -> 1195 raise FileNotFoundError 1196 container_name, path = self.split_path(rpath, delimiter=delimiter) 1197 try: FileNotFoundError: — You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub, or unsubscribe.
Have you tried fs.get(rpath, lpath, recursive=True)?
Gives the same output
fs.get("file.parquet", "tmp.parquet", recursive=True)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-9-54a58194e8e7> in <module>
----> 1 fs.get(remote_folder + "voyage_vcap_perk_obr.parquet",
2 'tmp.parquet',
3 recursive=True)
~\AppData\Local\Continuum\anaconda3\envs\env\lib\site-packages\fsspec\asyn.py in get(self, rpath, lpath, recursive, **kwargs)
276 lpaths = other_paths(rpaths, lpath)
277 [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths]
--> 278 return sync(self.loop, self._get, rpaths, lpaths)
279
280
~\AppData\Local\Continuum\anaconda3\envs\env\lib\site-packages\fsspec\asyn.py in sync(loop, func, callback_timeout, *args, **kwargs)
66 if error[0]:
67 typ, exc, tb = error[0]
---> 68 raise exc.with_traceback(tb)
69 else:
70 return result[0]
~\AppData\Local\Continuum\anaconda3\envs\env\lib\site-packages\fsspec\asyn.py in f()
50 if callback_timeout is not None:
51 future = asyncio.wait_for(future, callback_timeout)
---> 52 result[0] = await future
53 except Exception:
54 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\envs\env\lib\site-packages\fsspec\asyn.py in _get(self, rpaths, lpaths, **kwargs)
261 dirs = [os.path.dirname(lp) for lp in lpaths]
262 [os.makedirs(d, exist_ok=True) for d in dirs]
--> 263 return await asyncio.gather(
264 *[
265 self._get_file(rpath, lpath, **kwargs)
~\AppData\Local\Continuum\anaconda3\envs\env\lib\site-packages\adlfs\spec.py in _get_file(self, rpath, lpath, recursive, delimiter, **kwargs)
1193 print(files)
1194 if rpath not in files:
-> 1195 raise FileNotFoundError
1196 container_name, path = self.split_path(rpath, delimiter=delimiter)
1197 try:
FileNotFoundError:
PR #121 fixes an error in the recursive flag. But currently, you have to get a client for a specific blob, which mandates the *, at least for files with a single container. The logic can be added of course. Does that present an issue?
An MCVE is
from adlfs import AzureBlobFileSystem
storage_options = {'account_name': 'goes'}
fs = AzureBlobFileSystem(**storage_options)
fs.get('noaa-goes16/ABI-L2-MCMIPF/2020/001/00', '00', recursive=True)
This returns
>>> fs.get('noaa-goes16/ABI-L2-MCMIPF/2020/001/00', '00', recursive=True)
['noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010000216_e20200010009536_c20200010010028.nc', 'noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010010216_e20200010019535_c20200010020037.nc', 'noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010020216_e20200010029536_c20200010030028.nc', 'noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010030216_e20200010039530_c20200010040030.nc', 'noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010040216_e20200010049530_c20200010050040.nc', 'noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010050216_e20200010059524_c20200010100040.nc']
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/Ray/miniconda3/envs/main/lib/python3.8/site-packages/fsspec/asyn.py", line 278, in get
return sync(self.loop, self._get, rpaths, lpaths)
File "/Users/Ray/miniconda3/envs/main/lib/python3.8/site-packages/fsspec/asyn.py", line 68, in sync
raise exc.with_traceback(tb)
File "/Users/Ray/miniconda3/envs/main/lib/python3.8/site-packages/fsspec/asyn.py", line 52, in f
result[0] = await future
File "/Users/Ray/miniconda3/envs/main/lib/python3.8/site-packages/fsspec/asyn.py", line 263, in _get
return await asyncio.gather(
File "/Users/Ray/miniconda3/envs/main/lib/python3.8/site-packages/adlfs/spec.py", line 1195, in _get_file
raise FileNotFoundError
FileNotFoundError
>>> ['noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010000216_e20200010009536_c20200010010028.nc']
['noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010020216_e20200010029536_c20200010030028.nc']
['noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010050216_e20200010059524_c20200010100040.nc']
['noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010040216_e20200010049530_c20200010050040.nc']
['noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010010216_e20200010019535_c20200010020037.nc']
['noaa-goes16/ABI-L2-MCMIPF/2020/001/00/OR_ABI-L2-MCMIPF-M6_G16_s20200010030216_e20200010039530_c20200010040030.nc']
It seems to create the files in the folder but they are empty.
Actually, it does seem to work in the background.
Have to find a smaller folder
Following up. Is this an issue?
Following up. Is this an issue?
I believe so. But more of an fsspec issue. The linked s3fs issue has more discussion on this.
I have a partitioned parquet file in Blob Storage. I can copy it to local using a wildcard with get. Is there syntax to grab the folder without having to specify the wildcard? For reference, if I remove the wildcard I get