Open-EO / openeo-geopyspark-driver

OpenEO driver for GeoPySpark (Geotrellis)
Apache License 2.0
26 stars 4 forks source link

YARN integration test fails with: Remote end closed connection without response #720

Closed bossie closed 6 months ago

bossie commented 6 months ago

Integration test test_random_forest_train_and_load_from_jobid failed while polling for job status:

auth_connection = <Connection to 'http://192.168.207.55:39801/openeo/1.1.0' with OidcBearerAuth>
tmp_path = PosixPath('/var/lib/jenkins/workspace/openEO/openeo-integrationtests/pytest-batch-tmp/popen-gw2/test_random_forest_train_and_l0')

    @pytest.mark.batchjob
    @pytest.mark.timeout(BATCH_JOB_TIMEOUT)
    def test_random_forest_train_and_load_from_jobid(auth_connection: openeo.Connection, tmp_path):
        # 1. Train a random forest model.
        FEATURE_COLLECTION_1 = {
            "type": "FeatureCollection",
            "features": [
                {
                    "type": "Feature",
                    "properties": {"target": 3},
                    "geometry": {"type": "Polygon", "coordinates": [[[4.79, 51.26], [4.81, 51.26], [4.81, 51.30], [4.79, 51.30], [4.79, 51.26]]]}
                },
                {
                    "type": "Feature",
                    "properties": {"target": 5},
                    "geometry": {"type": "Polygon", "coordinates": [[[4.85, 51.26], [4.90, 51.26], [4.90, 51.30], [4.85, 51.30], [4.85, 51.26]]]}
                },

            ]
        }

        cube_xybt: DataCube = auth_connection.load_collection(
            "PROBAV_L3_S10_TOC_333M", bands=["NDVI"],
            spatial_extent={"west": 4.78, "east": 4.91, "south": 51.25, "north": 51.31},
            temporal_extent=["2017-11-01", "2017-11-01"]
        )
        cube_xyb: DataCube = cube_xybt.reduce_dimension(dimension="t", reducer="mean")
        predictors: DataCube = cube_xyb.aggregate_spatial(FEATURE_COLLECTION_1, reducer="mean", target_dimension="bands")
        model: MlModel = predictors.fit_class_random_forest(target=FEATURE_COLLECTION_1, num_trees=3, seed=42)
        model: MlModel = model.save_ml_model()
        job: BatchJob = model.create_job(title="test_random_forest_train_and_load_from_jobid-training_step")
        assert job.job_id
        job.start_job()

        # Wait until job is finished
>       status = _poll_job_status(job, until=lambda s: s in ['canceled', 'finished', 'error'])

openeo-geopyspark-integrationtests/tests/test_integration.py:845: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
openeo-geopyspark-integrationtests/tests/test_integration.py:555: in _poll_job_status
    status = job.describe_job()['status']
openeo-geopyspark-integrationtests/venv38/lib64/python3.8/site-packages/openeo/internal/warnings.py:70: in wrapper
    return orig(*args, **kwargs)
openeo-geopyspark-integrationtests/venv38/lib64/python3.8/site-packages/openeo/rest/job.py:68: in describe
    return self.connection.get(f"/jobs/{self.job_id}", expected_status=200).json()
openeo-geopyspark-integrationtests/venv38/lib64/python3.8/site-packages/openeo/rest/connection.py:220: in get
    return self.request("get", path=path, stream=stream, auth=auth, **kwargs)
openeo-geopyspark-integrationtests/venv38/lib64/python3.8/site-packages/openeo/rest/connection.py:769: in request
    return _request()
openeo-geopyspark-integrationtests/venv38/lib64/python3.8/site-packages/openeo/rest/connection.py:762: in _request
    return super(Connection, self).request(
openeo-geopyspark-integrationtests/venv38/lib64/python3.8/site-packages/openeo/rest/connection.py:168: in request
    self._raise_api_error(resp)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <Connection to 'http://192.168.207.55:39801/openeo/1.1.0' with OidcBearerAuth>
response = <Response [500]>

    def _raise_api_error(self, response: requests.Response):
        """Convert API error response to Python exception"""
        status_code = response.status_code
        try:
            info = response.json()
        except Exception:
            info = None

        # Valid JSON object with "code" and "message" fields indicates a proper openEO API error.
        if isinstance(info, dict):
            error_code = info.get("code")
            error_message = info.get("message")
            if error_code and isinstance(error_code, str) and error_message and isinstance(error_message, str):
>               raise OpenEoApiError(
                    http_status_code=status_code,
                    code=error_code,
                    message=error_message,
                    id=info.get("id"),
                    url=info.get("url"),
                )
E               openeo.rest.OpenEoApiError: [500] Internal: Server error: ConnectionError(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))) (ref: r-24031382635c45c3be55dc13aec784a0)

openeo-geopyspark-integrationtests/venv38/lib64/python3.8/site-packages/openeo/rest/connection.py:188: OpenEoApiError
bossie commented 6 months ago

Looking up req_id r-24031382635c45c3be55dc13aec784a0 shows that fetching the job from the EJR failed:

Traceback (most recent call last):
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 715, in urlopen
    httplib_response = self._make_request(
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/lib64/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/usr/lib64/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/usr/lib64/python3.8/http/client.py", line 276, in _read_status
    raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/venv/lib64/python3.8/site-packages/requests/adapters.py", line 486, in send
    resp = conn.urlopen(
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 799, in urlopen
    retries = retries.increment(
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/util/retry.py", line 550, in increment
    raise six.reraise(type(error), error, _stacktrace)
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/packages/six.py", line 769, in reraise
    raise value.with_traceback(tb)
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 715, in urlopen
    httplib_response = self._make_request(
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/lib64/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/usr/lib64/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/usr/lib64/python3.8/http/client.py", line 276, in _read_status
    raise RemoteDisconnected("Remote end closed connection without"
urllib3.exceptions.ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/venv/lib64/python3.8/site-packages/flask/app.py", line 1484, in full_dispatch_request
    rv = self.dispatch_request()
  File "/opt/venv/lib64/python3.8/site-packages/flask/app.py", line 1469, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "/opt/venv/lib64/python3.8/site-packages/openeo_driver/users/auth.py", line 88, in decorated
    return f(*args, **kwargs)
  File "/opt/venv/lib64/python3.8/site-packages/openeo_driver/views.py", line 889, in get_job_info
    job_info: BatchJobMetadata = backend_implementation.batch_jobs.get_job_info(job_id, user.user_id)
  File "/opt/venv/lib64/python3.8/site-packages/openeogeotrellis/backend.py", line 1829, in get_job_info
    job_metadata = registry.get_job_metadata(job_id, user_id)
  File "/opt/venv/lib64/python3.8/site-packages/openeogeotrellis/job_registry.py", line 810, in get_job_metadata
    ejr_job_info = self.elastic_job_registry.get_job(job_id=job_id, user_id=user_id)
  File "/opt/venv/lib64/python3.8/site-packages/openeo_driver/jobregistry.py", line 414, in get_job
    jobs = self._search(query=query, fields=fields or ["*"])
  File "/opt/venv/lib64/python3.8/site-packages/openeo_driver/jobregistry.py", line 534, in _search
    return self._do_request("POST", "/jobs/search", json=query)
  File "/opt/venv/lib64/python3.8/site-packages/openeo_driver/jobregistry.py", line 320, in _do_request
    response = self._session.request(
  File "/opt/venv/lib64/python3.8/site-packages/requests/sessions.py", line 589, in request
    resp = self.send(prep, **send_kwargs)
  File "/opt/venv/lib64/python3.8/site-packages/requests/sessions.py", line 703, in send
    r = adapter.send(request, **kwargs)
  File "/opt/venv/lib64/python3.8/site-packages/requests/adapters.py", line 501, in send
    raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

The URL to the EJR was effectively the one recently introduced to avoid F5 rate-limiting:

ejr_url

bossie commented 6 months ago

Can't seem to reproduce the issue; maybe a temporary hiccup?

Test steps:

1) Open a shell in the integrationtests Docker container on epod-openeo-dev with `docker exec -it ... bash`.
2) Set some environment variables:

OPENEO_BACKEND_CONFIG=/opt/backendconfig_mep.py
OPENEO_ENV=integrationtests
OPENEO_EJR_API=https://jobregistry.vgt.vito.be
OPENEO_EJR_OIDC_CLIENT_CREDENTIALS=...

3) Start a Python REPL (in /opt): `venv/bin/python`
4) Run this Python code:

from openeogeotrellis.backend import get_elastic_job_registry
ejr = get_elastic_job_registry()
ejr.health_check()
for _ in range(100):
  ejr.get_job('j-2403131df01e4cbcb656ef2bdcefdd8b', user_id='1ff4f5cf-95cc-4bbb-ad8f-b5096d95006a')["status"]
bossie commented 6 months ago

In the meantime, another integration test failed because a download request was unable to log resource usage with the ETL API:

Traceback (most recent call last):
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 715, in urlopen
    httplib_response = self._make_request(
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/lib64/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/usr/lib64/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/usr/lib64/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/usr/lib64/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "/usr/lib64/python3.8/ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "/usr/lib64/python3.8/ssl.py", line 1099, in read
    return self._sslobj.read(len, buffer)
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/venv/lib64/python3.8/site-packages/requests/adapters.py", line 486, in send
    resp = conn.urlopen(
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 799, in urlopen
    retries = retries.increment(
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/util/retry.py", line 550, in increment
    raise six.reraise(type(error), error, _stacktrace)
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/packages/six.py", line 769, in reraise
    raise value.with_traceback(tb)
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 715, in urlopen
    httplib_response = self._make_request(
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/opt/venv/lib64/python3.8/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/lib64/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/usr/lib64/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/usr/lib64/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/usr/lib64/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "/usr/lib64/python3.8/ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "/usr/lib64/python3.8/ssl.py", line 1099, in read
    return self._sslobj.read(len, buffer)
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/venv/lib64/python3.8/site-packages/flask/app.py", line 1484, in full_dispatch_request
    rv = self.dispatch_request()
  File "/opt/venv/lib64/python3.8/site-packages/flask/app.py", line 1469, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "/opt/venv/lib64/python3.8/site-packages/openeo_driver/users/auth.py", line 88, in decorated
    return f(*args, **kwargs)
  File "/opt/venv/lib64/python3.8/site-packages/openeo_driver/views.py", line 671, in result
    costs = backend_implementation.request_costs(success=True, **request_costs_kwargs)
  File "/opt/venv/lib64/python3.8/site-packages/openeogeotrellis/backend.py", line 1525, in request_costs
    costs = etl_api.log_resource_usage(
  File "/opt/venv/lib64/python3.8/site-packages/openeogeotrellis/integrations/etl_api.py", line 138, in log_resource_usage
    with self._session.post(f"{self._endpoint}/resources", headers={'Authorization': f"Bearer {access_token}"},
  File "/opt/venv/lib64/python3.8/site-packages/requests/sessions.py", line 637, in post
    return self.request("POST", url, data=data, json=json, **kwargs)
  File "/opt/venv/lib64/python3.8/site-packages/requests/sessions.py", line 589, in request
    resp = self.send(prep, **send_kwargs)
  File "/opt/venv/lib64/python3.8/site-packages/requests/sessions.py", line 703, in send
    r = adapter.send(request, **kwargs)
  File "/opt/venv/lib64/python3.8/site-packages/requests/adapters.py", line 501, in send
    raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))

Both of these cases are configured to be retried (openeo_driver.util.http.requests_with_retry) but I'm not sure if this includes connection errors: TBC

bossie commented 6 months ago

Got it: connection errors are retried but POST requests are not, most likely because this type of request is typically not idempotent; in these two cases (a search request and an idempotent usage report), however, they are.

bossie commented 6 months ago

Requests towards the ETL API are all idempotent so POST has been added to the list of retryable verbs.

EJR API search requests are handled in a more ad-hoc way because simply retrying all POST requests will result in duplicate jobs and the EJR API does nothing to prevent that. Instead, the error will be propagated to the user, who will then be aware that something might be off.