voxel51 / fiftyone

The open-source tool for building high-quality datasets and computer vision models
Apache License 2.0
7.89k stars 518 forks source link

[BUG] Errors while uploading large amounts of files to a label-studio #4424

Open charitarthchugh opened 1 month ago

charitarthchugh commented 1 month ago

Describe the problem

Whenever I try to upload a large number of images to a local Label Studio instance (in my case I need to upload 10k images), I run into two errors: a timeout error, and a validation error when a project with the same name already exists in Label Studio (because I am making another attempt after the first one failed).

Code to reproduce issue

import fiftyone as fo
from fiftyone.utils import huggingface as fouh
dataset = fouh.load_dataset("Charitarth/dac-sdc-2024-coco-fiftyone")
view = dataset.load_saved_view("bbox_only")
anno_key = "labelstudio_basic_recipe"

label_schema = {
    "new_ground_truth": {
        "type": "detections",
        "classes": dataset.distinct("detections.detections.label"),

annot_run = view.annotate(


System information

Other info/logs

Timeout Error

TimeoutError                              Traceback (most recent call last)
File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:537, in HTTPConnectionPool._make_request(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)
    536 try:
--> 537     response = conn.getresponse()
    538 except (BaseSSLError, OSError) as e:

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/urllib3/connection.py:466, in HTTPConnection.getresponse(self)
    465 # Get the response from http.client.HTTPConnection
--> 466 httplib_response = super().getresponse()
    468 try:

File ~/.pyenv/versions/3.11.9/lib/python3.11/http/client.py:1395, in HTTPConnection.getresponse(self)
   1394 try:
-> 1395     response.begin()
   1396 except ConnectionError:

File ~/.pyenv/versions/3.11.9/lib/python3.11/http/client.py:325, in HTTPResponse.begin(self)
    324 while True:
--> 325     version, status, reason = self._read_status()
    326     if status != CONTINUE:

File ~/.pyenv/versions/3.11.9/lib/python3.11/http/client.py:286, in HTTPResponse._read_status(self)
    285 def _read_status(self):
--> 286     line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    287     if len(line) > _MAXLINE:

File ~/.pyenv/versions/3.11.9/lib/python3.11/socket.py:706, in SocketIO.readinto(self, b)
    705 try:
--> 706     return self._sock.recv_into(b)
    707 except timeout:

TimeoutError: timed out

The above exception was the direct cause of the following exception:

ReadTimeoutError                          Traceback (most recent call last)
File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/requests/adapters.py:564, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    563 try:
--> 564     resp = conn.urlopen(
    565         method=request.method,
    566         url=url,
    567         body=request.body,
    568         headers=request.headers,
    569         redirect=False,
    570         assert_same_host=False,
    571         preload_content=False,
    572         decode_content=False,
    573         retries=self.max_retries,
    574         timeout=timeout,
    575         chunked=chunked,
    576     )
    578 except (ProtocolError, OSError) as err:

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:847, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)
    845     new_e = ProtocolError("Connection aborted.", new_e)
--> 847 retries = retries.increment(
    848     method, url, error=new_e, _pool=self, _stacktrace=sys.exc_info()[2]
    849 )
    850 retries.sleep()

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/urllib3/util/retry.py:470, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
    469 if read is False or method is None or not self._is_method_retryable(method):
--> 470     raise reraise(type(error), error, _stacktrace)
    471 elif read is not None:

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/urllib3/util/util.py:39, in reraise(tp, value, tb)
     38         raise value.with_traceback(tb)
---> 39     raise value
     40 finally:

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:793, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)
    792 # Make the request on the HTTPConnection object
--> 793 response = self._make_request(
    794     conn,
    795     method,
    796     url,
    797     timeout=timeout_obj,
    798     body=body,
    799     headers=headers,
    800     chunked=chunked,
    801     retries=retries,
    802     response_conn=response_conn,
    803     preload_content=preload_content,
    804     decode_content=decode_content,
    805     **response_kw,
    806 )
    808 # Everything went great!

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:539, in HTTPConnectionPool._make_request(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)
    538 except (BaseSSLError, OSError) as e:
--> 539     self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
    540     raise

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:370, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value)
    369 if isinstance(err, SocketTimeout):
--> 370     raise ReadTimeoutError(
    371         self, url, f"Read timed out. (read timeout={timeout_value})"
    372     ) from err
    374 # See the above comment about EAGAIN in Python 3.

ReadTimeoutError: HTTPConnectionPool(host='localhost', port=8081): Read timed out. (read timeout=180)

During handling of the above exception, another exception occurred:

ReadTimeout                               Traceback (most recent call last)
Cell In[8], line 11
      2 anno_key = "labelstudio_basic_recipe1"
      4 label_schema = {
      5     "new_ground_truth": {
      6         "type": "detections",
      7         "classes": dataset.distinct("detections.detections.label"),
      8     },
      9 }
---> 11 annot_run = dataset.annotate(
     12     anno_key,
     13     backend="labelstudio",
     14     label_schema=label_schema,
     15     launch_editor=True,
     16     url="http://localhost:8081/",
     17     api_key="0e68ae17ba691fea9020b2813a89925116d65f9d"
     19 )

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/fiftyone/core/collections.py:8759, in SampleCollection.annotate(self, anno_key, label_schema, label_field, label_type, classes, attributes, mask_targets, allow_additions, allow_deletions, allow_label_edits, allow_index_edits, allow_spatial_edits, media_field, backend, launch_editor, **kwargs)
   8629 def annotate(
   8630     self,
   8631     anno_key,
   8646     **kwargs,
   8647 ):
   8648     """Exports the samples and optional label field(s) in this collection
   8649     to the given annotation backend.
   8757         an :class:`fiftyone.utils.annotations.AnnnotationResults`
   8758     """
-> 8759     return foua.annotate(
   8760         self,
   8761         anno_key,
   8762         label_schema=label_schema,
   8763         label_field=label_field,
   8764         label_type=label_type,
   8765         classes=classes,
   8766         attributes=attributes,
   8767         mask_targets=mask_targets,
   8768         allow_additions=allow_additions,
   8769         allow_deletions=allow_deletions,
   8770         allow_label_edits=allow_label_edits,
   8771         allow_index_edits=allow_index_edits,
   8772         allow_spatial_edits=allow_spatial_edits,
   8773         media_field=media_field,
   8774         backend=backend,
   8775         launch_editor=launch_editor,
   8776         **kwargs,
   8777     )

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/fiftyone/utils/annotations.py:250, in annotate(samples, anno_key, label_schema, label_field, label_type, classes, attributes, mask_targets, allow_additions, allow_deletions, allow_label_edits, allow_index_edits, allow_spatial_edits, media_field, backend, launch_editor, **kwargs)
    245 # Don't allow overwriting an existing run with same `anno_key`, since we
    246 # need the existing run in order to perform workflows like automatically
    247 # cleaning up the backend's tasks
    248 anno_backend.register_run(samples, anno_key, overwrite=False)
--> 250 results = anno_backend.upload_annotations(
    251     samples, anno_key, launch_editor=launch_editor
    252 )
    254 anno_backend.save_run_results(samples, anno_key, results)
    256 return results

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/fiftyone/utils/labelstudio.py:145, in LabelStudioBackend.upload_annotations(self, samples, anno_key, launch_editor)
    142 api = self.connect_to_api()
    144 logger.info("Uploading media to Label Studio...")
--> 145 results = api.upload_samples(samples, anno_key, self)
    146 logger.info("Upload complete")
    148 if launch_editor:

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/fiftyone/utils/labelstudio.py:458, in LabelStudioAnnotationAPI.upload_samples(self, samples, anno_key, backend)
    452 # @todo can we add support for uploading tasks in batches?
    453 tasks, predictions, id_map = self._prepare_tasks(
    454     samples,
    455     config.label_schema,
    456     config.media_field,
    457 )
--> 458 uploaded_tasks = self._upload_tasks(project, tasks, predictions)
    460 return LabelStudioAnnotationResults(
    461     samples,
    462     config,
    467     backend=backend,
    468 )

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/fiftyone/utils/labelstudio.py:335, in LabelStudioAnnotationAPI._upload_tasks(self, project, tasks, predictions)
    329 files = [
    330     (one["source_id"], open(one[one["media_type"]], "rb"))
    331     for one in tasks
    332 ]
    334 # upload files first and get their upload ids
--> 335 upload_resp = self._client.make_request(
    336     "POST",
    337     f"/api/projects/{project.id}/import",
    338     params={"commit_to_project": True},
    339     files=files,
    340 )
    342 # create tasks out of the uploaded files
    343 payload = json.dumps(
    344     {
    345         "file_upload_ids": upload_resp.json()["file_upload_ids"],
    346         "files_as_tasks_list": False,
    347     }
    348 )

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/label_studio_sdk/client.py:436, in Client.make_request(self, method, url, *args, **kwargs)
    433     raise_exceptions = kwargs.pop("raise_exceptions")
    435 logger.debug(f"{method}: {url} with args={args}, kwargs={kwargs}")
--> 436 response = self.session.request(
    437     method,
    438     self.get_url(url),
    439     headers=self.headers,
    440     cookies=self.cookies,
    441     *args,
    442     **kwargs,
    443 )
    445 if raise_exceptions:
    446     if response.status_code >= 400:

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    584 send_kwargs = {
    585     "timeout": timeout,
    586     "allow_redirects": allow_redirects,
    587 }
    588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
    591 return resp

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
    700 start = preferred_clock()
    702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
    705 # Total elapsed time of the request (approximately)
    706 elapsed = preferred_clock() - start

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/requests/adapters.py:610, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    608     raise SSLError(e, request=request)
    609 elif isinstance(e, ReadTimeoutError):
--> 610     raise ReadTimeout(e, request=request)
    611 elif isinstance(e, _InvalidHeader):
    612     raise InvalidHeader(e, request=request)

ReadTimeout: HTTPConnectionPool(host='localhost', port=8081): Read timed out. (read timeout=180)

Validation Error when dataset already has a corresponding project in LabelStudio

Request URL: http://localhost:8081/api/projects
Response status code: 400
Response content:
  "id": "52b07b5c-84b8-4f11-b5eb-81d187221c7a",
  "status_code": 400,
  "version": "1.12.1",
  "detail": "Validation error",
  "exc_info": null,
  "validation_errors": {
    "title": [
      "Ensure this field has no more than 50 characters."
HTTPError                                 Traceback (most recent call last)
Cell In[6], line 11
      2 anno_key = "labelstudio_basic_recipe1"
      4 label_schema = {
      5     "new_ground_truth": {
      6         "type": "detections",
      7         "classes": dataset.distinct("detections.detections.label"),
      8     },
      9 }
---> 11 annot_run = view.annotate(
     12     anno_key,
     13     backend="labelstudio",
     14     label_schema=label_schema,
     15     launch_editor=True,
     16     url="http://localhost:8081/",
     17     api_key="0e68ae17ba691fea9020b2813a89925116d65f9d"
     19 )

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/fiftyone/core/collections.py:8759, in SampleCollection.annotate(self, anno_key, label_schema, label_field, label_type, classes, attributes, mask_targets, allow_additions, allow_deletions, allow_label_edits, allow_index_edits, allow_spatial_edits, media_field, backend, launch_editor, **kwargs)
   8629 def annotate(
   8630     self,
   8631     anno_key,
   8646     **kwargs,
   8647 ):
   8648     """Exports the samples and optional label field(s) in this collection
   8649     to the given annotation backend.
   8757         an :class:`fiftyone.utils.annotations.AnnnotationResults`
   8758     """
-> 8759     return foua.annotate(
   8760         self,
   8761         anno_key,
   8762         label_schema=label_schema,
   8763         label_field=label_field,
   8764         label_type=label_type,
   8765         classes=classes,
   8766         attributes=attributes,
   8767         mask_targets=mask_targets,
   8768         allow_additions=allow_additions,
   8769         allow_deletions=allow_deletions,
   8770         allow_label_edits=allow_label_edits,
   8771         allow_index_edits=allow_index_edits,
   8772         allow_spatial_edits=allow_spatial_edits,
   8773         media_field=media_field,
   8774         backend=backend,
   8775         launch_editor=launch_editor,
   8776         **kwargs,
   8777     )

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/fiftyone/utils/annotations.py:250, in annotate(samples, anno_key, label_schema, label_field, label_type, classes, attributes, mask_targets, allow_additions, allow_deletions, allow_label_edits, allow_index_edits, allow_spatial_edits, media_field, backend, launch_editor, **kwargs)
    245 # Don't allow overwriting an existing run with same `anno_key`, since we
    246 # need the existing run in order to perform workflows like automatically
    247 # cleaning up the backend's tasks
    248 anno_backend.register_run(samples, anno_key, overwrite=False)
--> 250 results = anno_backend.upload_annotations(
    251     samples, anno_key, launch_editor=launch_editor
    252 )
    254 anno_backend.save_run_results(samples, anno_key, results)
    256 return results

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/fiftyone/utils/labelstudio.py:145, in LabelStudioBackend.upload_annotations(self, samples, anno_key, launch_editor)
    142 api = self.connect_to_api()
    144 logger.info("Uploading media to Label Studio...")
--> 145 results = api.upload_samples(samples, anno_key, self)
    146 logger.info("Upload complete")
    148 if launch_editor:

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/fiftyone/utils/labelstudio.py:448, in LabelStudioAnnotationAPI.upload_samples(self, samples, anno_key, backend)
    435 """Uploads the given samples to Label Studio according to the given
    436 backend's annotation and server configuration.
    444     a :class:`LabelStudioAnnotationResults`
    445 """
    446 config = backend.config
--> 448 project = self._init_project(config, samples)
    450 samples.compute_metadata()
    452 # @todo can we add support for uploading tasks in batches?

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/fiftyone/utils/labelstudio.py:232, in LabelStudioAnnotationAPI._init_project(self, config, samples)
    227 # generate label config
    228 label_config = generate_labeling_config(
    229     label_schema, samples.media_type
    230 )
--> 232 project = self._client.start_project(
    233     title=project_name, label_config=label_config
    234 )
    235 return project

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/label_studio_sdk/client.py:240, in Client.start_project(self, **kwargs)
    232 from .project import Project
    234 project = Project(
    235     url=self.url,
    236     api_key=self.api_key,
    237     session=self.session,
    238     versions=self.versions,
    239 )
--> 240 project.start_project(**kwargs)
    241 return project

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/label_studio_sdk/project.py:438, in Project.start_project(self, **kwargs)
    377 def start_project(self, **kwargs):
    378     """Create a new labeling project in Label Studio.
    380     Parameters
    437     """
--> 438     response = self.make_request("POST", "/api/projects", json=kwargs)
    439     if response.status_code == 201:
    440         self.params = response.json()

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/label_studio_sdk/client.py:448, in Client.make_request(self, method, url, *args, **kwargs)
    446     if response.status_code >= 400:
    447         self.log_response_error(response)
--> 448         response.raise_for_status()
    450 return response

File ~/.cache/pypoetry/virtualenvs/dac2024-gpu-CzXcmR37-py3.11/lib/python3.11/site-packages/requests/models.py:1024, in Response.raise_for_status(self)
   1019     http_error_msg = (
   1020         f"{self.status_code} Server Error: {reason} for url: {self.url}"
   1021     )
   1023 if http_error_msg:
-> 1024     raise HTTPError(http_error_msg, response=self)

HTTPError: 400 Client Error: Bad Request for url: http://localhost:8081/api/projects


Willingness to contribute

The FiftyOne Community encourages bug fix contributions. Would you or another member of your organization be willing to contribute a fix for this bug to the FiftyOne codebase?