gdcc / python-dvuploader

✈️ - Python package for parallel direct upload to Dataverse
MIT License
4 stars 0 forks source link

500 error during "registering files" step #18

Open kmika11 opened 1 month ago

kmika11 commented 1 month ago

When uploading files, I receive a 500 error during the "registering files" step of the upload process. Files are "checked" successfully, uploaded successfully, and then the dvuploader hangs as it is "registering files" before resulting in a 500 error. However, files appear to upload and get added to the dataset with no problems.

JR-1991 commented 1 month ago

@kmika11, thanks for opening the issue! Could you share the error message and traceback so we can pinpoint where the error is occurring?

kmika11 commented 1 month ago

Here's the text from the traceback and error message. Let me know if screenshots would be better. Thanks for looking into this!

---------------------------------------------------------------------------
ClientResponseError                       Traceback (most recent call last)
Cell In[19], line 4
      2 collection_pids = []
      3 for dataset_name, dataset in g_dataverse_dataset_info.items():
----> 4     pid = dataset.upload(dataverse_name = g_dataverse_collection, n_parallel=4)
      5     collection_pids.append(pid)

File ~/anaconda3/envs/curation/lib/python3.12/site-packages/easyDataverse/dataset.py:216, in Dataset.upload(self, dataverse_name, n_parallel)
    204 """Uploads a given dataset to a Dataverse installation specified in the environment variable.
    205 
    206 Args:
   (...)
    211     str: The identifier of the uploaded dataset.
    212 """
    214 self._validate_required_fields()
--> 216 self.p_id = upload_to_dataverse(
    217     json_data=self.dataverse_json(),
    218     dataverse_name=dataverse_name,
    219     files=self.files,
    220     p_id=self.p_id,
    221     DATAVERSE_URL=str(self.DATAVERSE_URL),
    222     API_TOKEN=str(self.API_TOKEN),
    223     n_parallel=n_parallel,
    224 )
    226 return self.p_id

File ~/anaconda3/envs/curation/lib/python3.12/site-packages/easyDataverse/uploader.py:59, in upload_to_dataverse(json_data, dataverse_name, files, p_id, n_parallel, DATAVERSE_URL, API_TOKEN)
     56 # Get response data
     57 p_id = response.json()["data"]["persistentId"]
---> 59 _uploadFiles(
     60     files=files,
     61     p_id=p_id,
     62     api=api,
     63     n_parallel=n_parallel,
     64 )  # type: ignore
     66 console = Console()
     67 url = urljoin(DATAVERSE_URL, f"dataset.xhtml?persistentId={p_id}")

File ~/anaconda3/envs/curation/lib/python3.12/site-packages/easyDataverse/uploader.py:107, in _uploadFiles(files, p_id, api, n_parallel)
    104     return
    106 dvuploader = DVUploader(files=files)
--> 107 dvuploader.upload(
    108     persistent_id=p_id,
    109     dataverse_url=api.base_url,
    110     api_token=api.api_token,
    111     n_parallel_uploads=n_parallel,
    112 )

File ~/anaconda3/envs/curation/lib/python3.12/site-packages/dvuploader/dvuploader.py:133, in DVUploader.upload(self, persistent_id, dataverse_url, api_token, n_parallel_uploads, force_native)
    131 else:
    132     with progress:
--> 133         asyncio.run(
    134             direct_upload(
    135                 files=files,
    136                 dataverse_url=dataverse_url,
    137                 api_token=api_token,
    138                 persistent_id=persistent_id,
    139                 pbars=pbars,
    140                 progress=progress,
    141                 n_parallel_uploads=n_parallel_uploads,
    142             )
    143         )
    145 if self.verbose:
    146     rich.print("\n[bold italic white]✅ Upload complete\n")

File ~/anaconda3/envs/curation/lib/python3.12/site-packages/nest_asyncio.py:30, in _patch_asyncio.<locals>.run(main, debug)
     28 task = asyncio.ensure_future(main)
     29 try:
---> 30     return loop.run_until_complete(task)
     31 finally:
     32     if not task.done():

File ~/anaconda3/envs/curation/lib/python3.12/site-packages/nest_asyncio.py:98, in _patch_loop.<locals>.run_until_complete(self, future)
     95 if not f.done():
     96     raise RuntimeError(
     97         'Event loop stopped before Future completed.')
---> 98 return f.result()

File ~/anaconda3/envs/curation/lib/python3.12/asyncio/futures.py:203, in Future.result(self)
    201 self.__log_traceback = False
    202 if self._exception is not None:
--> 203     raise self._exception.with_traceback(self._exception_tb)
    204 return self._result

File ~/anaconda3/envs/curation/lib/python3.12/asyncio/tasks.py:314, in Task.__step_run_and_handle_result(***failed resolving arguments***)
    310 try:
    311     if exc is None:
    312         # We use the `send` method directly, because coroutines
    313         # don't have `__iter__` and `__next__` methods.
--> 314         result = coro.send(None)
    315     else:
    316         result = coro.throw(exc)

File ~/anaconda3/envs/curation/lib/python3.12/site-packages/dvuploader/directupload.py:91, in direct_upload(files, dataverse_url, api_token, persistent_id, progress, pbars, n_parallel_uploads)
     86 connector = aiohttp.TCPConnector(limit=2)
     87 async with aiohttp.ClientSession(
     88     headers=headers,
     89     connector=connector,
     90 ) as session:
---> 91     await _add_files_to_ds(
     92         session=session,
     93         files=files,
     94         dataverse_url=dataverse_url,
     95         pid=persistent_id,
     96         progress=progress,
     97         pbar=pbar,
     98     )

File ~/anaconda3/envs/curation/lib/python3.12/site-packages/dvuploader/directupload.py:520, in _add_files_to_ds(session, dataverse_url, pid, files, progress, pbar)
    512 replace_json_data = _prepare_registration(files, use_replace=True)
    514 await _multipart_json_data_request(
    515     session=session,
    516     json_data=novel_json_data,
    517     url=novel_url,
    518 )
--> 520 await _multipart_json_data_request(
    521     session=session,
    522     json_data=replace_json_data,
    523     url=replace_url,
    524 )
    526 progress.update(pbar, advance=1)

File ~/anaconda3/envs/curation/lib/python3.12/site-packages/dvuploader/directupload.py:581, in _multipart_json_data_request(json_data, url, session)
    578 json_part.set_content_disposition("form-data", name="jsonData")
    580 async with session.post(url, data=writer) as response:
--> 581     response.raise_for_status()

File ~/anaconda3/envs/curation/lib/python3.12/site-packages/aiohttp/client_reqrep.py:1070, in ClientResponse.raise_for_status(self)
   1068 assert self.reason is not None
   1069 self.release()
-> 1070 raise ClientResponseError(
   1071     self.request_info,
   1072     self.history,
   1073     status=self.status,
   1074     message=self.reason,
   1075     headers=self.headers,
   1076 )

ClientResponseError: 500, message='Internal Server Error', url=URL('https://demo.dataverse.org/api/datasets/:persistentId/replaceFiles?persistentId=doi:10.70122/FK2/K1KQBH')
lmckone commented 2 weeks ago

I have experienced this exact error as well when attempting to upload large zip files (~100 GB) with the CLI.