When running the 'run_prep_artifacts.sh' script for 'es', there is an error while downloading the Wikipedia dataset. Hugging Face does not have a prebuilt dataset for Spanish, so line 53 of wikipedia_downloader.py fails and falls into the except block, where "beam_runner" no longer seems to be a valid load_dataset parameter.
If I comment that parameter out and put in a valid date (20230801 is not a valid date anymore):
except Exception as _:
    # if that fails, load from original huggingface dataset and process
    ds_iterator = load_dataset(
        "wikipedia", language=self._lang, date="20240320",
        cache_dir=self._cache_dir,  # beam_runner="DirectRunner",
        split="train"
    )
    logger.info(f"{str(self)} Load {self._lang}-wiki from 20240320")
I get an error like this:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/fsspec/implementations/http.py", line 419, in _info
    await _file_info(
  File "/usr/local/lib/python3.11/site-packages/fsspec/implementations/http.py", line 832, in _file_info
    r.raise_for_status()
  File "/usr/local/lib/python3.11/site-packages/aiohttp/client_reqrep.py", line 1060, in raise_for_status
    raise ClientResponseError(
aiohttp.client_exceptions.ClientResponseError: 404, message='Not Found', url=URL('https://dumps.wikimedia.org/eswiki/20220301/dumpstatus.json')

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/app/src/artifacts/downloaders/wikipedia_downloader.py", line 53, in run
    ds_iterator = load_dataset(
                  ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/datasets/load.py", line 2575, in load_dataset
    return builder_instance.as_streaming_dataset(split=split)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/datasets/builder.py", line 1382, in as_streaming_dataset
    splits_generators = {sg.name: sg for sg in self._split_generators(dl_manager)}
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/huggingface/modules/datasets_modules/datasets/wikipedia/d41137e149b2ea90eead07e7e3f805119a8c22dd1d5b61651af8e3e3ee736001/wikipedia.py", line 977, in _split_generators
    with open(downloaded_files["info"], encoding="utf-8") as f:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/datasets/streaming.py", line 75, in wrapper
    return function(*args, download_config=download_config, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/datasets/download/streaming_download_manager.py", line 512, in xopen
    file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/fsspec/core.py", line 135, in open
    return self.__enter__()
           ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/fsspec/core.py", line 103, in __enter__
    f = self.fs.open(self.path, mode=mode)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/fsspec/spec.py", line 1293, in open
    f = self._open(
        ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/fsspec/implementations/http.py", line 358, in _open
    size = size or self.info(path, **kwargs)["size"]
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/fsspec/asyn.py", line 118, in wrapper
    return sync(self.loop, func, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/fsspec/asyn.py", line 103, in sync
    raise return_result
  File "/usr/local/lib/python3.11/site-packages/fsspec/asyn.py", line 56, in _runner
    result[0] = await coro
                ^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/fsspec/implementations/http.py", line 432, in _info
    raise FileNotFoundError(url) from exc
FileNotFoundError: https://dumps.wikimedia.org/eswiki/20220301/dumpstatus.json

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/app/src/prep_artifacts.py", line 186, in <module>
    main(artifacts_dir=args.artifacts_dir,
  File "/usr/app/src/prep_artifacts.py", line 122, in main
    wikipedia.run(logger=logger)
  File "/usr/app/src/artifacts/downloaders/wikipedia_downloader.py", line 60, in run
    ds_iterator = load_dataset(
                  ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/datasets/load.py", line 2582, in load_dataset
    builder_instance.download_and_prepare(
  File "/usr/local/lib/python3.11/site-packages/datasets/builder.py", line 1005, in download_and_prepare
    self._download_and_prepare(
  File "/usr/local/lib/python3.11/site-packages/datasets/builder.py", line 1078, in _download_and_prepare
    split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/huggingface/modules/datasets_modules/datasets/wikipedia/d41137e149b2ea90eead07e7e3f805119a8c22dd1d5b61651af8e3e3ee736001/wikipedia.py", line 981, in _split_generators
    multistream_dump_info["status"] == "done"
AssertionError: Specified dump (https://dumps.wikimedia.org/eswiki/20240320/) multistream status is not 'done': waiting
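So the 20240320 dump exists, but its multistream job had not finished yet ('waiting'). As a workaround I could pick the newest finished dump programmatically instead of hard-coding a date. This is only a rough sketch, assuming dumpstatus.json keeps its current layout (jobs -> articlesmultistreamdump -> status) and that the dump index page lists YYYYMMDD directories; latest_complete_dump is a helper name I made up:

import re
import requests

def latest_complete_dump(lang: str = "es") -> str:
    # List the dated dump directories for this wiki, newest first.
    index = requests.get(f"https://dumps.wikimedia.org/{lang}wiki/", timeout=30).text
    dates = sorted(set(re.findall(r'href="(\d{8})/"', index)), reverse=True)
    for date in dates:
        status = requests.get(
            f"https://dumps.wikimedia.org/{lang}wiki/{date}/dumpstatus.json",
            timeout=30,
        )
        if status.status_code != 200:
            continue
        jobs = status.json().get("jobs", {})
        # Same condition the HF wikipedia script asserts on: the
        # multistream dump job must report status 'done'.
        if jobs.get("articlesmultistreamdump", {}).get("status") == "done":
            return date
    raise RuntimeError(f"no finished {lang}wiki dump found")

The downloader could then call load_dataset with date=latest_complete_dump(self._lang) instead of a pinned date.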
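For what it's worth, since newer datasets releases dropped the Beam-based path (which is presumably why "beam_runner" is rejected), maybe the cleaner fix is to stop using the script-based "wikipedia" builder entirely and load the preprocessed dump from the Hub, which does include Spanish. A sketch of what the fallback at line 53 could become, assuming the wikimedia/wikipedia dataset still publishes a 20231101.es config (config names there are <dump date>.<language code>):

from datasets import load_dataset

# Preprocessed dump hosted on the Hub; needs neither a Beam runner
# nor a live (and possibly still-running) dump date.
ds_iterator = load_dataset(
    "wikimedia/wikipedia", "20231101.es",
    cache_dir=self._cache_dir,  # same cache dir the downloader already uses
    split="train",
)

That would also make the 'es' run independent of whatever dump dumps.wikimedia.org happens to be generating on a given day.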