Open kmika11 opened 1 month ago
@kmika11, thanks for opening the issue! Could you share the error message and traceback so we can pinpoint where the error is occurring?
Here's the text from the traceback and error message. Let me know if screenshots would be better. Thanks for looking into this!
---------------------------------------------------------------------------
ClientResponseError Traceback (most recent call last)
Cell In[19], line 4
2 collection_pids = []
3 for dataset_name, dataset in g_dataverse_dataset_info.items():
----> 4 pid = dataset.upload(dataverse_name = g_dataverse_collection, n_parallel=4)
5 collection_pids.append(pid)
File ~/anaconda3/envs/curation/lib/python3.12/site-packages/easyDataverse/dataset.py:216, in Dataset.upload(self, dataverse_name, n_parallel)
204 """Uploads a given dataset to a Dataverse installation specified in the environment variable.
205
206 Args:
(...)
211 str: The identifier of the uploaded dataset.
212 """
214 self._validate_required_fields()
--> 216 self.p_id = upload_to_dataverse(
217 json_data=self.dataverse_json(),
218 dataverse_name=dataverse_name,
219 files=self.files,
220 p_id=self.p_id,
221 DATAVERSE_URL=str(self.DATAVERSE_URL),
222 API_TOKEN=str(self.API_TOKEN),
223 n_parallel=n_parallel,
224 )
226 return self.p_id
File ~/anaconda3/envs/curation/lib/python3.12/site-packages/easyDataverse/uploader.py:59, in upload_to_dataverse(json_data, dataverse_name, files, p_id, n_parallel, DATAVERSE_URL, API_TOKEN)
56 # Get response data
57 p_id = response.json()["data"]["persistentId"]
---> 59 _uploadFiles(
60 files=files,
61 p_id=p_id,
62 api=api,
63 n_parallel=n_parallel,
64 ) # type: ignore
66 console = Console()
67 url = urljoin(DATAVERSE_URL, f"dataset.xhtml?persistentId={p_id}")
File ~/anaconda3/envs/curation/lib/python3.12/site-packages/easyDataverse/uploader.py:107, in _uploadFiles(files, p_id, api, n_parallel)
104 return
106 dvuploader = DVUploader(files=files)
--> 107 dvuploader.upload(
108 persistent_id=p_id,
109 dataverse_url=api.base_url,
110 api_token=api.api_token,
111 n_parallel_uploads=n_parallel,
112 )
File ~/anaconda3/envs/curation/lib/python3.12/site-packages/dvuploader/dvuploader.py:133, in DVUploader.upload(self, persistent_id, dataverse_url, api_token, n_parallel_uploads, force_native)
131 else:
132 with progress:
--> 133 asyncio.run(
134 direct_upload(
135 files=files,
136 dataverse_url=dataverse_url,
137 api_token=api_token,
138 persistent_id=persistent_id,
139 pbars=pbars,
140 progress=progress,
141 n_parallel_uploads=n_parallel_uploads,
142 )
143 )
145 if self.verbose:
146 rich.print("\n[bold italic white]✅ Upload complete\n")
File ~/anaconda3/envs/curation/lib/python3.12/site-packages/nest_asyncio.py:30, in _patch_asyncio.<locals>.run(main, debug)
28 task = asyncio.ensure_future(main)
29 try:
---> 30 return loop.run_until_complete(task)
31 finally:
32 if not task.done():
File ~/anaconda3/envs/curation/lib/python3.12/site-packages/nest_asyncio.py:98, in _patch_loop.<locals>.run_until_complete(self, future)
95 if not f.done():
96 raise RuntimeError(
97 'Event loop stopped before Future completed.')
---> 98 return f.result()
File ~/anaconda3/envs/curation/lib/python3.12/asyncio/futures.py:203, in Future.result(self)
201 self.__log_traceback = False
202 if self._exception is not None:
--> 203 raise self._exception.with_traceback(self._exception_tb)
204 return self._result
File ~/anaconda3/envs/curation/lib/python3.12/asyncio/tasks.py:314, in Task.__step_run_and_handle_result(***failed resolving arguments***)
310 try:
311 if exc is None:
312 # We use the `send` method directly, because coroutines
313 # don't have `__iter__` and `__next__` methods.
--> 314 result = coro.send(None)
315 else:
316 result = coro.throw(exc)
File ~/anaconda3/envs/curation/lib/python3.12/site-packages/dvuploader/directupload.py:91, in direct_upload(files, dataverse_url, api_token, persistent_id, progress, pbars, n_parallel_uploads)
86 connector = aiohttp.TCPConnector(limit=2)
87 async with aiohttp.ClientSession(
88 headers=headers,
89 connector=connector,
90 ) as session:
---> 91 await _add_files_to_ds(
92 session=session,
93 files=files,
94 dataverse_url=dataverse_url,
95 pid=persistent_id,
96 progress=progress,
97 pbar=pbar,
98 )
File ~/anaconda3/envs/curation/lib/python3.12/site-packages/dvuploader/directupload.py:520, in _add_files_to_ds(session, dataverse_url, pid, files, progress, pbar)
512 replace_json_data = _prepare_registration(files, use_replace=True)
514 await _multipart_json_data_request(
515 session=session,
516 json_data=novel_json_data,
517 url=novel_url,
518 )
--> 520 await _multipart_json_data_request(
521 session=session,
522 json_data=replace_json_data,
523 url=replace_url,
524 )
526 progress.update(pbar, advance=1)
File ~/anaconda3/envs/curation/lib/python3.12/site-packages/dvuploader/directupload.py:581, in _multipart_json_data_request(json_data, url, session)
578 json_part.set_content_disposition("form-data", name="jsonData")
580 async with session.post(url, data=writer) as response:
--> 581 response.raise_for_status()
File ~/anaconda3/envs/curation/lib/python3.12/site-packages/aiohttp/client_reqrep.py:1070, in ClientResponse.raise_for_status(self)
1068 assert self.reason is not None
1069 self.release()
-> 1070 raise ClientResponseError(
1071 self.request_info,
1072 self.history,
1073 status=self.status,
1074 message=self.reason,
1075 headers=self.headers,
1076 )
ClientResponseError: 500, message='Internal Server Error', url=URL('https://demo.dataverse.org/api/datasets/:persistentId/replaceFiles?persistentId=doi:10.70122/FK2/K1KQBH')
I have experienced this exact error as well when attempting to upload large zip files (~100 GB) with the CLI.
When uploading files, I receive a 500 error during the "registering files" step of the upload process. The files are "checked" successfully and uploaded successfully, but then dvuploader hangs at "registering files" before failing with a 500 error. Despite the error, the files appear to upload and are added to the dataset without any problems.