Closed JorisCod closed 1 month ago
Created a PR to also retry on HTTP 500 responses.
The relevant part of the actual error is shown below. Looking at this, we could also translate it to something like a 502 "Bad Gateway" in the backend, and perhaps add a more descriptive message.
File "/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/job_registry.py", line 824, in create_job
ejr_job_info = self.elastic_job_registry.create_job(
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/jobregistry.py", line 409, in create_job
result = self._do_request("POST", "/jobs", json=job_data, expected_status=201)
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/jobregistry.py", line 312, in _do_request
access_token = self._access_token_helper.get_access_token()
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/util/auth.py", line 130, in get_access_token
access_token = self._get_access_token()
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/util/auth.py", line 124, in _get_access_token
tokens = self._authenticator.get_tokens()
File "/opt/openeo/lib/python3.8/site-packages/openeo/rest/auth/oidc.py", line 387, in get_tokens
result = self._do_token_post_request(post_data=self._get_token_endpoint_post_data())
File "/opt/openeo/lib/python3.8/site-packages/openeo/rest/auth/oidc.py", line 405, in _do_token_post_request
resp = self._requests.post(url=token_endpoint, data=post_data)
File "/opt/openeo/lib/python3.8/site-packages/requests/sessions.py", line 637, in post
return self.request("POST", url, data=data, json=json, **kwargs)
merged #625
This line could also use better error handling; there we can probably translate the error code into something that more explicitly states that it's a transient error: https://github.com/Open-EO/openeo-python-driver/blob/7fa3a6b9739bdd018c2d55a6a2beebf75f9065fa/openeo_driver/util/auth.py#L124
The job manager crashed on a connection error. For long-running jobs that run overnight, a retry has to be added.
Traceback (most recent call last):
File "/data/users/Private/joris.c/lcfm-production/notebooks/sentinel1-jm.py", line 92, in <module>
job_manager.run_jobs(
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/extra/job_management.py", line 365, in run_jobs
self._launch_job(start_job, df, i, backend_name)
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/extra/job_management.py", line 400, in _launch_job
job = start_job(
^^^^^^^^^^
File "/data/users/Private/joris.c/lcfm-production/src/sentinel1/pipeline.py", line 87, in start_job
job = connection.create_job(
^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 1764, in create_job
response = self.post("/jobs", json=pg_with_metadata, expected_status=201)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 249, in post
return self.request("post", path=path, json=json, allow_redirects=False, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 816, in request
return _request()
^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 809, in _request
return super(Connection, self).request(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 187, in request
self._raise_api_error(resp)
File "/home/joris.c/mambaforge/envs/lcfm-production/lib/python3.11/site-packages/openeo/rest/connection.py", line 207, in _raise_api_error
raise OpenEoApiError(
openeo.rest.OpenEoApiError: [500] Internal: Server error: ConnectionError(ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))) (ref: r-2409226681474a7297d5edff9d792623)