Closed jdries closed 1 year ago
The error has now changed to ClientError('An error occurred (404) when calling the CreateBucket operation: Not Found')
{"message": "ClientError('An error occurred (404) when calling the CreateBucket operation: Not Found')", "levelname": "ERROR", "name": "openeo_driver.views.error", "created": 1667481184.3216686, "filename": "views.py", "lineno": 258, "process": 104, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib/python3.8/site-packages/flask/app.py\", line 1516, in full_dispatch_request\n rv = self.dispatch_request()\n File \"/opt/openeo/lib/python3.8/site-packages/flask/app.py\", line 1502, in dispatch_request\n return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/users/auth.py\", line 88, in decorated\n return f(*args, **kwargs)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/views.py\", line 809, in queue_job\n backend_implementation.batch_jobs.start_job(job_id=job_id, user=user)\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/backend.py\", line 1091, in start_job\n self._start_job(job_id, user.user_id)\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/backend.py\", line 1201, in _start_job\n s3_instance.create_bucket(Bucket=bucket)\n File \"/opt/openeo/lib/python3.8/site-packages/botocore/client.py\", line 357, in _api_call\n return self._make_api_call(operation_name, kwargs)\n File \"/opt/openeo/lib/python3.8/site-packages/botocore/client.py\", line 676, in _make_api_call\n raise error_class(parsed_response, operation_name)\nbotocore.exceptions.ClientError: An error occurred (404) when calling the CreateBucket operation: Not Found", "req_id": "r-4738af39e2e54d39a62e36788cdbffd3", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
{"message": "10.233.102.0 - - [03/Nov/2022:13:13:04 +0000] \"POST /openeo/1.1.0/jobs/j-a39ee70ac3224948b3ba0972c6eae993/results HTTP/1.1\" 500 178 \"-\" \"openeo-python-client/0.11.0 cpython/3.9.7 linux\"", "levelname": "INFO", "name": "gunicorn.access", "created": 1667481184.3272822, "filename": "glogging.py", "lineno": 349, "process": 104, "req_id": "no-request", "user_id": null}
The error has now changed to
ClientError('An error occurred (404) when calling the CreateBucket operation: Not Found')
{"message": "ClientError('An error occurred (404) when calling the CreateBucket operation: Not Found')", "levelname": "ERROR", "name": "openeo_driver.views.error", "created": 1667481184.3216686, "filename": "views.py", "lineno": 258, "process": 104, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib/python3.8/site-packages/flask/app.py\", line 1516, in full_dispatch_request\n rv = self.dispatch_request()\n File \"/opt/openeo/lib/python3.8/site-packages/flask/app.py\", line 1502, in dispatch_request\n return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/users/auth.py\", line 88, in decorated\n return f(*args, **kwargs)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/views.py\", line 809, in queue_job\n backend_implementation.batch_jobs.start_job(job_id=job_id, user=user)\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/backend.py\", line 1091, in start_job\n self._start_job(job_id, user.user_id)\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/backend.py\", line 1201, in _start_job\n s3_instance.create_bucket(Bucket=bucket)\n File \"/opt/openeo/lib/python3.8/site-packages/botocore/client.py\", line 357, in _api_call\n return self._make_api_call(operation_name, kwargs)\n File \"/opt/openeo/lib/python3.8/site-packages/botocore/client.py\", line 676, in _make_api_call\n raise error_class(parsed_response, operation_name)\nbotocore.exceptions.ClientError: An error occurred (404) when calling the CreateBucket operation: Not Found", "req_id": "r-4738af39e2e54d39a62e36788cdbffd3", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"} {"message": "10.233.102.0 - - [03/Nov/2022:13:13:04 +0000] \"POST /openeo/1.1.0/jobs/j-a39ee70ac3224948b3ba0972c6eae993/results HTTP/1.1\" 500 178 \"-\" \"openeo-python-client/0.11.0 cpython/3.9.7 linux\"", "levelname": "INFO", "name": "gunicorn.access", 
"created": 1667481184.3272822, "filename": "glogging.py", "lineno": 349, "process": 104, "req_id": "no-request", "user_id": null}
Noob alert, we configured the incorrect endpoint.
We have found another issue, that we believe might be the cause for lack of permissions:
{"message": "Exception when calling CustomObjectsApi->list_custom_object: (403)\nReason: Forbidden\nHTTP response headers: HTTPHeaderDict({'Audit-Id': 'aea998c2-5588-409c-83ae-3ebedcef86eb', 'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Kubernetes-Pf-Flowschema-Uid': 'bd549fbc-ebe5-4c1b-a000-37e49b58c332', 'X-Kubernetes-Pf-Prioritylevel-Uid': 'ff4f3b61-23c8-4a29-a892-3b6f791730b5', 'Date': 'Thu, 03 Nov 2022 14:51:48 GMT', 'Content-Length': '472'})\nHTTP response body: {\n \"kind\": \"Status\",\n \"apiVersion\": \"v1\",\n \"metadata\": {},\n \"status\": \"Failure\",\n \"message\": \"sparkapplications.sparkoperator.k8s.io is forbidden: User \\\"system:serviceaccount:spark-jobs:spark-operator-1667481074-spark\\\" cannot create resource \\\"sparkapplications\\\" in API group \\\"sparkoperator.k8s.io\\\" in the namespace \\\"spark-jobs\\\"\",\n \"reason\": \"Forbidden\",\n \"details\": {\n \"group\": \"sparkoperator.k8s.io\",\n \"kind\": \"sparkapplications\"\n },\n \"code\": 403\n}\n\n", "levelname": "ERROR", "name": "openeogeotrellis.backend", "created": 1667487101.6491737, "filename": "backend.py", "lineno": 1282, "process": 104, "job_id": "j-97246c9b7ff34848bc61d45f9af7312f", "req_id": "r-7a7545f78e5b47ce891ea4c0fd6aff07", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
{"message": "batch job j-97246c9b7ff34848bc61d45f9af7312f -> error", "levelname": "DEBUG", "name": "openeogeotrellis.job_registry", "created": 1667487101.7084165, "filename": "job_registry.py", "lineno": 124, "process": 104, "req_id": "r-7a7545f78e5b47ce891ea4c0fd6aff07", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
The service account spark-jobs:spark-operator-1667481074-spark
doesn't seem to have permissions for the creation of resources.
We have altered the role and added sparkapplications.sparkoperator.k8s.io
to the role:
...
- apiGroups:
- "sparkoperator.k8s.io"
resources:
- sparkapplications
verbs:
- '*'
...
but the jobs keep on crashing
{"message": "apply_process add_dimension with {'data': {'from_node': 'loadcollection1', 'node': {'process_id': 'load_collection', 'arguments': {'bands': ['B03'], 'id': 'SENTINEL2_L1C_INCD', 'spatial_extent': {'west': -7.682155041704681, 'east': -5.083888440142181, 'south': 36.18203953636458, 'north': 38.620982842287496, 'crs': 'EPSG:4326'}, 'temporal_extent': ['2020-12-27T00:00:00Z', '2020-12-28T00:00:00Z']}}}, 'label': 'SENTINEL2_L1C_INCD', 'name': 'source_name', 'type': 'other'}", "levelname": "DEBUG", "name": "openeo_driver.ProcessGraphDeserializer", "created": 1667489363.8161678, "filename": "ProcessGraphDeserializer.py", "lineno": 1416, "process": 104, "req_id": "r-1fc6fa001d904823be451dfb3197ff0c", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
{"message": "apply_process load_collection with {'bands': ['B03'], 'id': 'SENTINEL2_L1C_INCD', 'spatial_extent': {'west': -7.682155041704681, 'east': -5.083888440142181, 'south': 36.18203953636458, 'north': 38.620982842287496, 'crs': 'EPSG:4326'}, 'temporal_extent': ['2020-12-27T00:00:00Z', '2020-12-28T00:00:00Z']}", "levelname": "DEBUG", "name": "openeo_driver.ProcessGraphDeserializer", "created": 1667489363.8162777, "filename": "ProcessGraphDeserializer.py", "lineno": 1416, "process": 104, "req_id": "r-1fc6fa001d904823be451dfb3197ff0c", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
{"message": "Using process 'load_collection' from namespace 'backend'.", "levelname": "INFO", "name": "openeo_driver.ProcessGraphDeserializer", "created": 1667489363.8164003, "filename": "ProcessGraphDeserializer.py", "lineno": 1521, "process": 104, "req_id": "r-1fc6fa001d904823be451dfb3197ff0c", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
{"message": "Using process 'add_dimension' from namespace 'backend'.", "levelname": "INFO", "name": "openeo_driver.ProcessGraphDeserializer", "created": 1667489363.8168318, "filename": "ProcessGraphDeserializer.py", "lineno": 1521, "process": 104, "req_id": "r-1fc6fa001d904823be451dfb3197ff0c", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
{"message": "Using process 'rename_labels' from namespace 'backend'.", "levelname": "INFO", "name": "openeo_driver.ProcessGraphDeserializer", "created": 1667489363.8170178, "filename": "ProcessGraphDeserializer.py", "lineno": 1521, "process": 104, "req_id": "r-1fc6fa001d904823be451dfb3197ff0c", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
{"message": "Using process 'save_result' from namespace 'backend'.", "levelname": "INFO", "name": "openeo_driver.ProcessGraphDeserializer", "created": 1667489363.8171487, "filename": "ProcessGraphDeserializer.py", "lineno": 1521, "process": 104, "req_id": "r-1fc6fa001d904823be451dfb3197ff0c", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
{"message": "Dry run extracted these source constraints: [(('load_collection', ('SENTINEL2_L1C_INCD', ())), {'temporal_extent': ('2020-12-27T00:00:00Z', '2020-12-28T00:00:00Z'), 'spatial_extent': {'west': -7.682155041704681, 'south': 36.18203953636458, 'east': -5.083888440142181, 'north': 38.620982842287496, 'crs': 'EPSG:4326'}, 'bands': ['B03']})]", "levelname": "INFO", "name": "openeogeotrellis.backend", "created": 1667489363.8173552, "filename": "backend.py", "lineno": 1403, "process": 104, "job_id": "j-d689a452d3554473bcd0991ae7f51bac", "req_id": "r-1fc6fa001d904823be451dfb3197ff0c", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
{"message": "KeyError('sparkApplicationId')", "levelname": "ERROR", "name": "openeo_driver.views.error", "created": 1667489375.7430615, "filename": "views.py", "lineno": 258, "process": 104, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib/python3.8/site-packages/flask/app.py\", line 1516, in full_dispatch_request\n rv = self.dispatch_request()\n File \"/opt/openeo/lib/python3.8/site-packages/flask/app.py\", line 1502, in dispatch_request\n return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/users/auth.py\", line 88, in decorated\n return f(*args, **kwargs)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/views.py\", line 809, in queue_job\n backend_implementation.batch_jobs.start_job(job_id=job_id, user=user)\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/backend.py\", line 1091, in start_job\n self._start_job(job_id, user.user_id)\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/backend.py\", line 1278, in _start_job\n application_id = status_response['status']['sparkApplicationId']\nKeyError: 'sparkApplicationId'", "req_id": "r-1fc6fa001d904823be451dfb3197ff0c", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
{"message": "10.233.102.0 - - [03/Nov/2022:15:29:35 +0000] \"POST /openeo/1.1.0/jobs/j-d689a452d3554473bcd0991ae7f51bac/results HTTP/1.1\" 500 119 \"-\" \"openeo-python-client/0.11.0 cpython/3.9.7 linux\"", "levelname": "INFO", "name": "gunicorn.access", "created": 1667489375.746691, "filename": "glogging.py", "lineno": 349, "process": 104, "req_id": "no-request", "user_id": null}
Hi guys, it looks like everything is working except for this error:
cats.data.NonEmptyList; local class incompatible: stream classdesc serialVersionUID = -1984122592033016823, local class serialVersionUID = 4767757887492269335
Any ideas @tcassaert or @jdries?
Hi @zbenta does this happen when starting the job, or does it already get to the point where it's running? This can happen when there's a mismatch of versions somewhere, but if we use the same docker image everywhere, this should not happen.
Hi @zbenta does this happen when starting the job, or does it already get to the point where it's running? This can happen when there's a mismatch of versions somewhere, but if we use the same docker image everywhere, this should not happen.
Problem solved, there is now a new error:
++ id -u
+ myuid=18585
++ id -g
+ mygid=18585
+ set +e
++ getent passwd 18585
+ uidentry=spark:x:18585:18585::/opt/spark/work-dir:/bin/bash
+ set -e
+ '[' -z spark:x:18585:18585::/opt/spark/work-dir:/bin/bash ']'
+ SPARK_CLASSPATH=':/usr/local/spark/jars/*'
+ env
+ grep SPARK_JAVA_OPT_
+ sort -t_ -k4 -n
+ sed 's/[^=]*=\(.*\)/\1/g'
+ readarray -t SPARK_EXECUTOR_JAVA_OPTS
+ '[' -n /opt/openeo-logging-static.jar ']'
+ SPARK_CLASSPATH=':/usr/local/spark/jars/*:/opt/openeo-logging-static.jar'
+ '[' '' == 3 ']'
+ '[' -n /usr/hdp/current/hadoop-client ']'
+ '[' -z '' ']'
++ /usr/hdp/current/hadoop-client/bin/hadoop classpath
/opt/entrypoint.sh: line 57: /usr/hdp/current/hadoop-client/bin/hadoop: No such file or directory
+ export SPARK_DIST_CLASSPATH=
+ SPARK_DIST_CLASSPATH=
+ '[' -z x ']'
+ SPARK_CLASSPATH='/etc/hadoop/conf::/usr/local/spark/jars/*:/opt/openeo-logging-static.jar'
+ echo 'My start command is driver'
My start command is driver
+ case "$1" in
+ shift 1
+ CMD=("$SPARK_HOME/bin/spark-submit" --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" --deploy-mode client "$@")
+ exec /usr/bin/tini -s -- /usr/local/spark/bin/spark-submit --conf spark.driver.bindAddress=10.233.76.55 --deploy-mode client --properties-file /opt/spark/conf/spark.properties --class org.apache.spark.deploy.PythonRunner local:///opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py /batch_jobs/j-4c72f3fc72b545ca927d7d769c2c6d09/job_specification.json /batch_jobs/j-4c72f3fc72b545ca927d7d769c2c6d09 out log job_metadata.json 1.1.0 '[]' cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu 0.0
log4j: Using URL [file:/opt/log4j.properties] for automatic log4j configuration.
log4j: Reading configuration from URL file:/opt/log4j.properties
log4j: Parsing for [root] with value=[WARN, openeo].
log4j: Level token is [WARN].
log4j: Category root set to WARN
log4j: Parsing appender named "openeo".
log4j:ERROR Could not instantiate class [org.openeo.logging.JsonLayout].
java.lang.ClassNotFoundException: org.openeo.logging.JsonLayout
at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:581)
at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:178)
at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:522)
at java.base/java.lang.Class.forName0(Native Method)
at java.base/java.lang.Class.forName(Class.java:315)
at org.apache.log4j.helpers.Loader.loadClass(Loader.java:198)
at org.apache.log4j.helpers.OptionConverter.instantiateByClassName(OptionConverter.java:327)
at org.apache.log4j.helpers.OptionConverter.instantiateByKey(OptionConverter.java:124)
at org.apache.log4j.PropertyConfigurator.parseAppender(PropertyConfigurator.java:797)
at org.apache.log4j.PropertyConfigurator.parseCategory(PropertyConfigurator.java:768)
at org.apache.log4j.PropertyConfigurator.configureRootCategory(PropertyConfigurator.java:648)
at org.apache.log4j.PropertyConfigurator.doConfigure(PropertyConfigurator.java:514)
at org.apache.log4j.PropertyConfigurator.doConfigure(PropertyConfigurator.java:580)
at org.apache.log4j.helpers.OptionConverter.selectAndConfigure(OptionConverter.java:526)
at org.apache.log4j.LogManager.<clinit>(LogManager.java:127)
at org.slf4j.impl.Log4jLoggerFactory.<init>(Log4jLoggerFactory.java:66)
at org.slf4j.impl.StaticLoggerBinder.<init>(StaticLoggerBinder.java:72)
at org.slf4j.impl.StaticLoggerBinder.<clinit>(StaticLoggerBinder.java:45)
at org.apache.spark.internal.Logging$.org$apache$spark$internal$Logging$$isLog4j12(Logging.scala:222)
at org.apache.spark.internal.Logging.initializeLogging(Logging.scala:127)
at org.apache.spark.internal.Logging.initializeLogIfNecessary(Logging.scala:111)
+ echo 'My start command is driver'
My start command is driver
+ case "$1" in
+ shift 1
+ CMD=("$SPARK_HOME/bin/spark-submit" --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" --deploy-mode client "$@")
+ exec /usr/bin/tini -s -- /usr/local/spark/bin/spark-submit --conf spark.driver.bindAddress=10.233.76.55 --deploy-mode client --properties-file /opt/spark/conf/spark.properties --class org.apache.spark.deploy.PythonRunner local:///opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py /batch_jobs/j-4c72f3fc72b545ca927d7d769c2c6d09/job_specification.json /batch_jobs/j-4c72f3fc72b545ca927d7d769c2c6d09 out log job_metadata.json 1.1.0 '[]' cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu 0.0
log4j: Using URL [file:/opt/log4j.properties] for automatic log4j configuration.
log4j: Reading configuration from URL file:/opt/log4j.properties
log4j: Parsing for [root] with value=[WARN, openeo].
log4j: Level token is [WARN].
log4j: Category root set to WARN
log4j: Parsing appender named "openeo".
log4j:ERROR Could not instantiate class [org.openeo.logging.JsonLayout].
java.lang.ClassNotFoundException: org.openeo.logging.JsonLayout
at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:581)
at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:178)
at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:522)
at java.base/java.lang.Class.forName0(Native Method)
at java.base/java.lang.Class.forName(Class.java:315)
at org.apache.log4j.helpers.Loader.loadClass(Loader.java:198)
at org.apache.log4j.helpers.OptionConverter.instantiateByClassName(OptionConverter.java:327)
at org.apache.log4j.helpers.OptionConverter.instantiateByKey(OptionConverter.java:124)
at org.apache.log4j.PropertyConfigurator.parseAppender(PropertyConfigurator.java:797)
at org.apache.log4j.PropertyConfigurator.parseCategory(PropertyConfigurator.java:768)
at org.apache.log4j.PropertyConfigurator.configureRootCategory(PropertyConfigurator.java:648)
at org.apache.log4j.PropertyConfigurator.doConfigure(PropertyConfigurator.java:514)
at org.apache.log4j.PropertyConfigurator.doConfigure(PropertyConfigurator.java:580)
at org.apache.log4j.helpers.OptionConverter.selectAndConfigure(OptionConverter.java:526)
at org.apache.log4j.LogManager.<clinit>(LogManager.java:127)
at org.slf4j.impl.Log4jLoggerFactory.<init>(Log4jLoggerFactory.java:66)
at org.slf4j.impl.StaticLoggerBinder.<init>(StaticLoggerBinder.java:72)
at org.slf4j.impl.StaticLoggerBinder.<clinit>(StaticLoggerBinder.java:45)
at org.apache.spark.internal.Logging$.org$apache$spark$internal$Logging$$isLog4j12(Logging.scala:222)
at org.apache.spark.internal.Logging.initializeLogging(Logging.scala:127)
at org.apache.spark.internal.Logging.initializeLogIfNecessary(Logging.scala:111)
at org.apache.spark.internal.Logging.initializeLogIfNecessary$(Logging.scala:105)
at org.apache.spark.deploy.SparkSubmit.initializeLogIfNecessary(SparkSubmit.scala:75)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:83)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1043)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1052)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
log4j: Setting property [target] to [System.out].
log4j: Parsed "openeo" options.
log4j: Parsing for [org.apache.spark] with value=[WARN].
log4j: Level token is [WARN].
log4j: Category org.apache.spark set to WARN
log4j: Handling log4j.additivity.org.apache.spark=[null]
log4j: Parsing for [org.openeo.geotrellis.icor] with value=[DEBUG, openeo].
log4j: Level token is [DEBUG].
log4j: Category org.openeo.geotrellis.icor set to DEBUG
log4j: Parsing appender named "openeo".
log4j: Appender "openeo" was already parsed.
log4j: Handling log4j.additivity.org.openeo.geotrellis.icor=[null]
log4j: Parsing for [org.openeo] with value=[INFO, openeo].
log4j: Level token is [INFO].
log4j: Category org.openeo set to INFO
log4j: Parsing appender named "openeo".
log4j: Appender "openeo" was already parsed.
log4j: Handling log4j.additivity.org.openeo=[null]
log4j: Parsing for [org.openeo.geotrellis.OpenEOProcessScriptBuilder] with value=[DEBUG, openeo].
log4j: Level token is [DEBUG].
log4j: Category org.openeo.geotrellis.OpenEOProcessScriptBuilder set to DEBUG
log4j: Parsing appender named "openeo".
log4j: Appender "openeo" was already parsed.
log4j: Handling log4j.additivity.org.openeo.geotrellis.OpenEOProcessScriptBuilder=[null]
log4j: Parsing for [ucar.nc2.jni.netcdf.Nc4Iosp] with value=[DEBUG, openeo].
log4j: Level token is [DEBUG].
log4j: Category ucar.nc2.jni.netcdf.Nc4Iosp set to DEBUG
log4j: Parsing appender named "openeo".
log4j: Appender "openeo" was already parsed.
log4j: Handling log4j.additivity.ucar.nc2.jni.netcdf.Nc4Iosp=[null]
log4j: Parsing for [be.vito.eodata.gwcgeotrellis.opensearch] with value=[DEBUG, openeo].
log4j: Level token is [DEBUG].
log4j: Category be.vito.eodata.gwcgeotrellis.opensearch set to DEBUG
log4j: Parsing appender named "openeo".
log4j: Appender "openeo" was already parsed.
log4j: Handling log4j.additivity.be.vito.eodata.gwcgeotrellis.opensearch=[null]
log4j: Parsing for [com.amazonaws.latency] with value=[DEBUG, openeo].
log4j: Level token is [DEBUG].
log4j: Category com.amazonaws.latency set to DEBUG
log4j: Parsing appender named "openeo".
log4j: Appender "openeo" was already parsed.
log4j: Handling log4j.additivity.com.amazonaws.latency=[null]
log4j: Parsing for [org.openeo.geotrellissentinelhub] with value=[DEBUG, openeo].
log4j: Level token is [DEBUG].
log4j: Category org.openeo.geotrellissentinelhub set to DEBUG
log4j: Parsing appender named "openeo".
log4j: Appender "openeo" was already parsed.
log4j: Handling log4j.additivity.org.openeo.geotrellissentinelhub=[null]
log4j: Parsing for [org.openeo.geotrellis.layers.OpenSearch] with value=[DEBUG, openeo].
log4j: Level token is [DEBUG].
log4j: Category org.openeo.geotrellis.layers.OpenSearch set to DEBUG
log4j: Parsing appender named "openeo".
log4j: Appender "openeo" was already parsed.
log4j: Handling log4j.additivity.org.openeo.geotrellis.layers.OpenSearch=[null]
log4j: Parsing for [com.amazonaws.request] with value=[DEBUG, openeo].
log4j: Level token is [DEBUG].
log4j: Category com.amazonaws.request set to DEBUG
log4j: Parsing appender named "openeo".
log4j: Appender "openeo" was already parsed.
log4j: Handling log4j.additivity.com.amazonaws.request=[null]
log4j: Finished configuring.
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/usr/local/spark-3.2.0/jars/spark-unsafe_2.12-3.2.0.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
log4j:ERROR No layout set for the appender named [openeo].
We created a configmap and mounted the openeo log4j.properties (as per this file) on the driver and executor:
log4j.appender.openeo=org.apache.log4j.ConsoleAppender
log4j.appender.openeo.target=System.out
log4j.appender.openeo.layout=org.openeo.logging.JsonLayout
log4j.rootLogger=WARN, openeo
log4j.logger.org.apache.spark=WARN
log4j.logger.org.openeo=INFO, openeo
log4j.logger.org.openeo.geotrellis.layers.OpenSearch=DEBUG, openeo
log4j.logger.org.openeo.geotrellis.OpenEOProcessScriptBuilder=DEBUG, openeo
log4j.logger.org.openeo.geotrellis.icor=DEBUG, openeo
log4j.logger.org.openeo.geotrellissentinelhub=DEBUG, openeo
log4j.logger.be.vito.eodata.gwcgeotrellis.opensearch=DEBUG, openeo
log4j.logger.ucar.nc2.jni.netcdf.Nc4Iosp=DEBUG, openeo
log4j.logger.com.amazonaws.latency=DEBUG, openeo
log4j.logger.com.amazonaws.request=DEBUG, openeo
This looks familiar, got introduced by us while trying to improve logging. Will look into it asap and let you know when it's fixed. Related issue: https://github.com/Open-EO/openeo-geopyspark-driver/issues/233
Hi @zbenta , we finished work on our docker images, so the above error should no longer occur. Can you try another run with the latest image?
I also foresee another thing to set up: the batch job will try to write results to object storage, which only works if it knows your endpoint and a bucket exists.
The (S3/object storage) endpoint can be configured with an environment variable in your values.yaml, through SWIFT_URL The bucket itself is hardcoded as OpenEO-data: https://github.com/Open-EO/openeo-geopyspark-driver/blob/master/openeogeotrellis/backend.py#L1254
So you'll want to configure this as well.
Hi @zbenta , we finished work on our docker images, so the above error should no longer occur. Can you try another run with the latest image?
I also foresee another thing to set up: the batch job will try to write results to object storage, which only works if it knows your endpoint and a bucket exists.
The (S3/object storage) endpoint can be configured with an environment variable in your values.yaml, through SWIFT_URL The bucket itself is hardcoded as OpenEO-data: https://github.com/Open-EO/openeo-geopyspark-driver/blob/master/openeogeotrellis/backend.py#L1254
So you'll want to configure this as well.
Thanks for the input @jdries, the error is now the following:
{"lineno":94,"name":"org.apache.spark.SparkContext","process":15,"filename":"Logging.scala","levelname":"ERROR","message":"Error initializing SparkContext.","exc_info":"java.io.FileNotFoundException: No such file or directory: s3a://spark-history/history_prod\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2344)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:2226)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:2160)\n\tat org.apache.spark.deploy.history.EventLogFileWriter.requireLogBaseDirAsDirectory(EventLogFileWriters.scala:77)\n\tat org.apache.spark.deploy.history.SingleEventLogFileWriter.start(EventLogFileWriters.scala:221)\n\tat org.apache.spark.scheduler.EventLoggingListener.start(EventLoggingListener.scala:83)\n\tat org.apache.spark.SparkContext.<init>(SparkContext.scala:610)\n\tat org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)\n\tat java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)\n\tat java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)\n\tat java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:238)\n\tat py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)\n\tat py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:829)","created":1668440567.642}
@zbenta do you have a spark-history server deployed?
Hi @zbenta , we finished work on our docker images, so the above error should no longer occur. Can you try another run with the latest image? I also foresee another thing to set up: the batch job will try to write results to object storage, which only works if it knows your endpoint and a bucket exists. The (S3/object storage) endpoint can be configured with an environment variable in your values.yaml, through SWIFT_URL The bucket itself is hardcoded as OpenEO-data: https://github.com/Open-EO/openeo-geopyspark-driver/blob/master/openeogeotrellis/backend.py#L1254 So you'll want to configure this as well.
Thanks for the input @jdries, the error is now the following:
{"lineno":94,"name":"org.apache.spark.SparkContext","process":15,"filename":"Logging.scala","levelname":"ERROR","message":"Error initializing SparkContext.","exc_info":"java.io.FileNotFoundException: No such file or directory: s3a://spark-history/history_prod\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2344)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:2226)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:2160)\n\tat org.apache.spark.deploy.history.EventLogFileWriter.requireLogBaseDirAsDirectory(EventLogFileWriters.scala:77)\n\tat org.apache.spark.deploy.history.SingleEventLogFileWriter.start(EventLogFileWriters.scala:221)\n\tat org.apache.spark.scheduler.EventLoggingListener.start(EventLoggingListener.scala:83)\n\tat org.apache.spark.SparkContext.<init>(SparkContext.scala:610)\n\tat org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)\n\tat java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)\n\tat java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)\n\tat java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:238)\n\tat py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)\n\tat py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:829)","created":1668440567.642}
We have also found that the pod has the 'AWS_S3_ENDPOINT' env var defined as:
- name: AWS_S3_ENDPOINT
value: data.cloudferro.com
Whereas our values.yaml file has the same variable with the following value:
AWS_S3_ENDPOINT: "https://stratus-stor.ncg.ingrid.pt:8080"
This URL is probably hardcoded somewhere, and the changes we make to our values.yaml have no effect while deploying the myspark app.
history
Nope, we didn't know we needed it. Should we deploy one, @tcassaert? Do you have a values.yaml file we can use as a template?
It's optional, it records the spark-ui for usage after jobs have ended or crashed. To disable it, we'll want to make this configurable: https://github.com/Open-EO/openeo-geopyspark-driver/blob/master/openeogeotrellis/deploy/sparkapplication.yaml.j2#L52
If you'd want to deploy one anyways, we are using following values.yaml:
---
envVars:
SPARK_NO_DAEMONIZE: "false"
SPARK_HISTORY_OPTS: '-Dspark.eventLog.dir=s3a://spark-history/history_prod -Dspark.history.fs.logDirectory=s3a://spark-history/history_prod -Dspark.hadoop.fs.s3a.endpoint=https://s3.waw3-1.cloudferro.com -Dspark.hadoop.fs.s3a.path.style.access=true -Dspark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.EnvironmentVariableCredentialsProvider'
AWS_ACCESS_KEY_ID: "******"
AWS_SECRET_ACCESS_KEY: "********"
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: traefik
hosts:
- host: spark-history.kube-dev.vgt.vito.be
paths:
- path: '/'
This requires an S3 bucket spark-history
with a history_prod
directory inside.
The chart we use is located at https://github.com/Open-EO/openeo-geotrellis-kubernetes/tree/master/kubernetes/charts/spark-history-server, prepackaged at https://artifactory.vgt.vito.be/helm-charts, named spark-history-server
version 0.0.1
.
@jdries, we tried disabling logs but the problem persists.
When we try to run the jupyter notebook that @Jaapel created, and we try to do an asynchronous download
from utils import get_files_from_dc
#from utils import *
files = get_files_from_dc(
dc,
out_directory=out_dir / "raw",
job_name="get_collection",
local_cache_file=out_dir / "job_cache.json",
recalculate=False
)
The pod that is created has the following environment variable defined:
AWS_S3_ENDPOINT data.cloudferro.com
When we specifically declared it as AWS_S3_ENDPOINT: "https://stratus-stor.ncg.ingrid.pt:8080"
in the values.yaml file.
The jobs are started but then the pods enter a failed state:
And the last message we get from the pod logs is the following:
{"lineno":94,"name":"org.apache.spark.SparkContext","process":15,"filename":"Logging.scala","levelname":"ERROR","message":"Error initializing SparkContext.","exc_info":"java.io.FileNotFoundException: No such file or directory: s3a://spark-history/history_prod\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2344)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:2226)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:2160)\n\tat org.apache.spark.deploy.history.EventLogFileWriter.requireLogBaseDirAsDirectory(EventLogFileWriters.scala:77)\n\tat org.apache.spark.deploy.history.SingleEventLogFileWriter.start(EventLogFileWriters.scala:221)\n\tat org.apache.spark.scheduler.EventLoggingListener.start(EventLoggingListener.scala:83)\n\tat org.apache.spark.SparkContext.<init>(SparkContext.scala:610)\n\tat org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)\n\tat java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)\n\tat java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)\n\tat java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:238)\n\tat py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)\n\tat py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:829)","created":1668516800.121}
We also find the following message on the myspark-driver pod:
ERROR TaskSchedulerImpl: Lost an executor 466 (already removed): Unable to create executor due to /opt/geotrellis-extensions-2.3.0_2.12-SNAPSHOT.jar
When we try to do a synchronous download:
dc.download(out_dir / "raw.nc", format="netcdf")
We get the following error:
OpenEoApiError: [500] Internal: Server error: TypeError("'JavaPackage' object is not callable") (ref: r-875e42c467bb4c75b677175d6d9eb581)
Any comments?
After having messed around a little bit we have now spark-history-server running.
If we try to run a Synchronous download we get this message:
OpenEoApiError: [500] Internal: Server error: Unable to determine NoData value. GDAL Exception Code: 4 (ref: r-693459ea114d4c72821921e5917c39a0)
And the following file on the history server:
Great, it seems that now the jobs are actually starting to run. I made the commit above to get rid of this default "data.cloudferro.com" s3 endpoint, which may also help with the GDAL error.
The other problem is that we're currently still mounting a few directories that we expect on the hosts: /eodata, /eodata2, /opt/spark_tmp, /shared_pod_volume
The thing is that we ourselves are also looking to change this, so I hope to fix that on our side.
@tcassaert for the spark config (like the history server), should we perhaps point to a configmap for batch jobs rather than having that hard coded: https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/blob/master/docs/user-guide.md#specifying-spark-configuration
I believe that way cluster operators can tune most of these values themselves, without requiring new images from us or new code?
Yes, that would make sense. I'll have a look at how it can be implemented.
I've been investigating the options to define the sparkConf via a ConfigMap
, but it looks like it doesn't work as described in the docs. I found several issues around it, with the most important ones being:
There might be a PR (https://github.com/Carrefour-Group/spark-on-k8s-operator/pull/2), which I'll check out.
Hi @zbenta , when trying to test synchronous requests on the INCD layer, I indeed get the same error, and it seems to be due to authentication. Is it possible that the https urls returned by the catalog still have some authentication in front, and which type?
I've been investigating the options to define the sparkConf via a
ConfigMap
, but it looks like it doesn't work as described in the docs. I found several issues around it, with the most important ones being:* [Deal with internally created ConfigMap mounted onto SPARK_CONF_DIR GoogleCloudPlatform/spark-on-k8s-operator#216](https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/issues/216) * [Spark not reading default conf provided by sparkConfigMap GoogleCloudPlatform/spark-on-k8s-operator#1183](https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/issues/1183)
There might be a PR (Carrefour-Group/spark-on-k8s-operator#2), which I'll check out.
To follow-up on this: I've requested some more info at the PR to allow the usage of the sparkConfigMap and got following answer: https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/pull/1568#issuecomment-1328212990
So it looks like we can't use a configMap as of now to configure the sparkConf.
Hi @zbenta , when trying to test synchronous requests on the INCD layer, I indeed get the same error, and it seems to be due to authentication. Is it possible that the https urls returned by the catalog still have some authentication in front, and which type?
Hi @jdries , the catalog uses normal user/password authentication for the registration of data; for consulting, it needs no authentication, because I can simply access the files within our server.
Take the following URL — you can see the collection item without any authentication:
https://resto.c-scale.zcu.cz/collections/S2/items/bc241a50-b35d-595d-976d-5a82a6af030b
If you choose one of the images, you can download them without any need for authentication:
Hi @zbenta , you're right, it works now, so synchronous requests are fine! As for batch jobs, I submitted one, but it seems to remain in running state. Can you check what's the state on k8s? My job id is: j-013b1ef5fdf549299efc77d57887612d
Hi @zbenta , you're right, it works now, so synchronous requests are fine! As for batch jobs, I submitted one, but it seems to remain in running state. Can you check what's the state on k8s? My job id is: j-013b1ef5fdf549299efc77d57887612d
Hi @jdries , it crashed:
openeo_driver.errors.CollectionNotFoundException: Collection 'SENTINEL2_L1C_INCD' does not exist.
{"message": "Error processing batch job: CollectionNotFoundException(status_code=404, code='CollectionNotFound', message=\"Collection 'SENTINEL2_L1C_INCD' does not exist.\", id='no-request')", "levelname": "ERROR", "name": "openeo-user-log", "created": 1670004142.807605, "filename": "batch_job.py", "lineno": 359, "process": 64, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py\", line 236, in get_collection_metadata\n return self._catalog[collection_id]\nKeyError: 'SENTINEL2_L1C_INCD'\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 355, in main\n run_driver()\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 326, in run_driver\n run_job(\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py\", line 47, in memory_logging_wrapper\n return function(*args, **kwargs)\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 401, in run_job\n result = ProcessGraphDeserializer.evaluate(process_graph, env=env, do_dry_run=tracer)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 331, in evaluate\n convert_node(result_node, env=env.push({ENV_DRY_RUN_TRACER: dry_run_tracer}))\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in apply_process\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in 
<dictcomp>\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 361, in convert_node\n return convert_node(processGraph['node'], env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1565, in apply_process\n return process_function(args=args, env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 525, in load_collection\n metadata = env.backend_implementation.catalog.get_collection_metadata(collection_id)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py\", line 238, in get_collection_metadata\n raise CollectionNotFoundException(collection_id)\nopeneo_driver.errors.CollectionNotFoundException: Collection 'SENTINEL2_L1C_INCD' does not exist.", "job_id": "j-013b1ef5fdf549299efc77d57887612d", "user_id": "dfa678cb9ab17f65d4f025e30fac5e0d90116176e44fd17d703419322747cbbd@egi.eu"}
{"message": "batch_job.py main: fail 2022-12-02 18:02:22.808149, elapsed 0:00:39.339824", "levelname": "INFO", "name": "openeogeotrellis.deploy.batch_job", "created": 1670004142.8082314, "filename": "util.py", "lineno": 348, "process": 64, "job_id": "j-013b1ef5fdf549299efc77d57887612d", "user_id": "dfa678cb9ab17f65d4f025e30fac5e0d90116176e44fd17d703419322747cbbd@egi.eu"}
{"message": "Unhandled CollectionNotFoundException exception: CollectionNotFoundException(status_code=404, code='CollectionNotFound', message=\"Collection 'SENTINEL2_L1C_INCD' does not exist.\", id='no-request')", "levelname": "ERROR", "name": "openeo_driver.util.logging", "created": 1670004142.8083956, "filename": "logging.py", "lineno": 177, "process": 64, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py\", line 236, in get_collection_metadata\n return self._catalog[collection_id]\nKeyError: 'SENTINEL2_L1C_INCD'\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 546, in <module>\n main(sys.argv)\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 362, in main\n raise e\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 355, in main\n run_driver()\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 326, in run_driver\n run_job(\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py\", line 47, in memory_logging_wrapper\n return function(*args, **kwargs)\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 401, in run_job\n result = ProcessGraphDeserializer.evaluate(process_graph, env=env, do_dry_run=tracer)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 331, in evaluate\n convert_node(result_node, env=env.push({ENV_DRY_RUN_TRACER: dry_run_tracer}))\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File 
\"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in apply_process\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in <dictcomp>\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 361, in convert_node\n return convert_node(processGraph['node'], env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1565, in apply_process\n return process_function(args=args, env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 525, in load_collection\n metadata = env.backend_implementation.catalog.get_collection_metadata(collection_id)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py\", line 238, in get_collection_metadata\n raise CollectionNotFoundException(collection_id)\nopeneo_driver.errors.CollectionNotFoundException: Collection 'SENTINEL2_L1C_INCD' does not exist.", "job_id": "j-013b1ef5fdf549299efc77d57887612d", "user_id": "dfa678cb9ab17f65d4f025e30fac5e0d90116176e44fd17d703419322747cbbd@egi.eu"}
{"lineno":210,"name":"org.apache.hadoop.metrics2.impl.MetricsSystemImpl","process":15,"filename":"MetricsSystemImpl.java","levelname":"INFO","message":"Stopping s3a-file-system metrics system...","created":1670004143.117}
{"lineno":216,"name":"org.apache.hadoop.metrics2.impl.MetricsSystemImpl","process":15,"filename":"MetricsSystemImpl.java","levelname":"INFO","message":"s3a-file-system metrics system stopped.","created":1670004143.119}
{"lineno":611,"name":"org.apache.hadoop.metrics2.impl.MetricsSystemImpl","process":15,"filename":"MetricsSystemImpl.java","levelname":"INFO","message":"s3a-file-system metrics system shutdown complete.","created":1670004143.119}
Hi @zbenta , you're right, it works now, so synchronous requests are fine! As for batch jobs, I submitted one, but it seems to remain in running state. Can you check what's the state on k8s? My job id is: j-013b1ef5fdf549299efc77d57887612d
Hi @jdries , it crashed:
openeo_driver.errors.CollectionNotFoundException: Collection 'SENTINEL2_L1C_INCD' does not exist. {"message": "Error processing batch job: CollectionNotFoundException(status_code=404, code='CollectionNotFound', message=\"Collection 'SENTINEL2_L1C_INCD' does not exist.\", id='no-request')", "levelname": "ERROR", "name": "openeo-user-log", "created": 1670004142.807605, "filename": "batch_job.py", "lineno": 359, "process": 64, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py\", line 236, in get_collection_metadata\n return self._catalog[collection_id]\nKeyError: 'SENTINEL2_L1C_INCD'\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 355, in main\n run_driver()\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 326, in run_driver\n run_job(\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py\", line 47, in memory_logging_wrapper\n return function(*args, **kwargs)\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 401, in run_job\n result = ProcessGraphDeserializer.evaluate(process_graph, env=env, do_dry_run=tracer)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 331, in evaluate\n convert_node(result_node, env=env.push({ENV_DRY_RUN_TRACER: dry_run_tracer}))\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in apply_process\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File 
\"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in <dictcomp>\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 361, in convert_node\n return convert_node(processGraph['node'], env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1565, in apply_process\n return process_function(args=args, env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 525, in load_collection\n metadata = env.backend_implementation.catalog.get_collection_metadata(collection_id)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py\", line 238, in get_collection_metadata\n raise CollectionNotFoundException(collection_id)\nopeneo_driver.errors.CollectionNotFoundException: Collection 'SENTINEL2_L1C_INCD' does not exist.", "job_id": "j-013b1ef5fdf549299efc77d57887612d", "user_id": "dfa678cb9ab17f65d4f025e30fac5e0d90116176e44fd17d703419322747cbbd@egi.eu"} {"message": "batch_job.py main: fail 2022-12-02 18:02:22.808149, elapsed 0:00:39.339824", "levelname": "INFO", "name": "openeogeotrellis.deploy.batch_job", "created": 1670004142.8082314, "filename": "util.py", "lineno": 348, "process": 64, "job_id": "j-013b1ef5fdf549299efc77d57887612d", "user_id": "dfa678cb9ab17f65d4f025e30fac5e0d90116176e44fd17d703419322747cbbd@egi.eu"} {"message": "Unhandled CollectionNotFoundException exception: CollectionNotFoundException(status_code=404, code='CollectionNotFound', message=\"Collection 'SENTINEL2_L1C_INCD' does not exist.\", id='no-request')", "levelname": "ERROR", "name": 
"openeo_driver.util.logging", "created": 1670004142.8083956, "filename": "logging.py", "lineno": 177, "process": 64, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py\", line 236, in get_collection_metadata\n return self._catalog[collection_id]\nKeyError: 'SENTINEL2_L1C_INCD'\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 546, in <module>\n main(sys.argv)\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 362, in main\n raise e\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 355, in main\n run_driver()\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 326, in run_driver\n run_job(\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py\", line 47, in memory_logging_wrapper\n return function(*args, **kwargs)\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 401, in run_job\n result = ProcessGraphDeserializer.evaluate(process_graph, env=env, do_dry_run=tracer)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 331, in evaluate\n convert_node(result_node, env=env.push({ENV_DRY_RUN_TRACER: dry_run_tracer}))\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in apply_process\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File 
\"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in <dictcomp>\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 361, in convert_node\n return convert_node(processGraph['node'], env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1565, in apply_process\n return process_function(args=args, env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 525, in load_collection\n metadata = env.backend_implementation.catalog.get_collection_metadata(collection_id)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py\", line 238, in get_collection_metadata\n raise CollectionNotFoundException(collection_id)\nopeneo_driver.errors.CollectionNotFoundException: Collection 'SENTINEL2_L1C_INCD' does not exist.", "job_id": "j-013b1ef5fdf549299efc77d57887612d", "user_id": "dfa678cb9ab17f65d4f025e30fac5e0d90116176e44fd17d703419322747cbbd@egi.eu"} {"lineno":210,"name":"org.apache.hadoop.metrics2.impl.MetricsSystemImpl","process":15,"filename":"MetricsSystemImpl.java","levelname":"INFO","message":"Stopping s3a-file-system metrics system...","created":1670004143.117} {"lineno":216,"name":"org.apache.hadoop.metrics2.impl.MetricsSystemImpl","process":15,"filename":"MetricsSystemImpl.java","levelname":"INFO","message":"s3a-file-system metrics system stopped.","created":1670004143.119} {"lineno":611,"name":"org.apache.hadoop.metrics2.impl.MetricsSystemImpl","process":15,"filename":"MetricsSystemImpl.java","levelname":"INFO","message":"s3a-file-system metrics system shutdown 
complete.","created":1670004143.119}
It looks like the collection is well defined, because when we list it we get the following:
Our layercatalog.json file is as follows:
[
{
"id": "SENTINEL2_L1C_INCD",
"name": "SENTINEL2_L1C_INCD",
"title": "[experimental] Level 1C Sentinel-2 images",
"description": "[experimental] For C-Scale project experiments only.",
"license": "proprietary",
"keywords": [
"experimental"
],
"providers": [
{
"url": "https://sentinel.esa.int/web/sentinel/user-guides/sentinel-2-msi",
"name": "European Union/ESA/Copernicus",
"roles": [
"producer",
"licensor"
]
}
],
"links": [
{
"rel": "self",
"type": "application/json",
"href": "http://resto.c-scale.zcu.cz/collections/S2"
},
{
"rel": "root",
"type": "application/json",
"href": "http://resto.c-scale.zcu.cz"
},
{
"rel": "items",
"type": "application/geo+json",
"href": "http://resto.c-scale.zcu.cz/collections/S2/items"
},
{
"href": "https://scihub.copernicus.eu/twiki/pub/SciHubWebPortal/TermsConditions/Sentinel_Data_Terms_and_Conditions.pdf",
"rel": "license",
"title": "Legal notice on the use of Copernicus Sentinel Data and Service Information"
}
],
"extent": {
"spatial": {
"bbox": [
[
-7.8897490815567,
36.007460328916,
-4.001463178213,
38.844364822878
]
],
"crs": "http://www.opengis.net/def/crs/OGC/1.3/CRS84"
},
"temporal": {
"interval": [
[
"2018-01-02T11:19:01.000000Z",
"2020-12-30T11:31:07.000000Z"
]
],
"trs": "http://www.opengis.net/def/uom/ISO-8601/0/Gregorian"
}
},
"cube:dimensions": {
"x": {"type": "spatial", "axis": "x", "reference_system": {"$schema":"https://proj.org/schemas/v0.2/projjson.schema.json","type":"GeodeticCRS","name":"AUTO 42001 (Universal Transverse Mercator)","datum":{"type":"GeodeticReferenceFrame","name":"World Geodetic System 1984","ellipsoid":{"name":"WGS 84","semi_major_axis":6378137,"inverse_flattening":298.257223563}},"coordinate_system":{"subtype":"ellipsoidal","axis":[{"name":"Geodetic latitude","abbreviation":"Lat","direction":"north","unit":"degree"},{"name":"Geodetic longitude","abbreviation":"Lon","direction":"east","unit":"degree"}]},"area":"World","bbox":{"south_latitude":-90,"west_longitude":-180,"north_latitude":90,"east_longitude":180},"id":{"authority":"OGC","version":"1.3","code":"Auto42001"}}, "step": 10},
"y": {"type": "spatial", "axis": "y", "reference_system": {"$schema":"https://proj.org/schemas/v0.2/projjson.schema.json","type":"GeodeticCRS","name":"AUTO 42001 (Universal Transverse Mercator)","datum":{"type":"GeodeticReferenceFrame","name":"World Geodetic System 1984","ellipsoid":{"name":"WGS 84","semi_major_axis":6378137,"inverse_flattening":298.257223563}},"coordinate_system":{"subtype":"ellipsoidal","axis":[{"name":"Geodetic latitude","abbreviation":"Lat","direction":"north","unit":"degree"},{"name":"Geodetic longitude","abbreviation":"Lon","direction":"east","unit":"degree"}]},"area":"World","bbox":{"south_latitude":-90,"west_longitude":-180,"north_latitude":90,"east_longitude":180},"id":{"authority":"OGC","version":"1.3","code":"Auto42001"}}, "step": 10},
"t": {
"type": "temporal"
},
"bands": {
"type": "bands",
"values": [
"B01",
"B02",
"B03",
"B04",
"B05",
"B06",
"B07",
"B08",
"B8A",
"B09",
"B10",
"B11",
"B12"
]
}
},
"summaries": {
"datetime": {
"minimum": "2018-01-02T11:19:01.000000Z",
"maximum": "2020-12-30T11:31:07.000000Z"
},
"collection": {
"const": "S2",
"count": 4134,
"type": "string"
},
"day": {
"type": "string",
"oneOf": [
{
"const": "31",
"count": 57
},
{
"const": "30",
"count": 112
},
{
"const": "29",
"count": 132
},
{
"const": "28",
"count": 147
},
{
"const": "27",
"count": 141
},
{
"const": "26",
"count": 165
},
{
"const": "25",
"count": 109
},
{
"const": "24",
"count": 136
},
{
"const": "23",
"count": 139
},
{
"const": "22",
"count": 136
},
{
"const": "21",
"count": 148
},
{
"const": "20",
"count": 99
},
{
"const": "19",
"count": 127
},
{
"const": "18",
"count": 124
},
{
"const": "17",
"count": 131
},
{
"const": "16",
"count": 137
},
{
"const": "15",
"count": 123
},
{
"const": "14",
"count": 130
},
{
"const": "13",
"count": 142
},
{
"const": "12",
"count": 135
},
{
"const": "11",
"count": 152
},
{
"const": "10",
"count": 119
},
{
"const": "09",
"count": 133
},
{
"const": "08",
"count": 145
},
{
"const": "07",
"count": 150
},
{
"const": "06",
"count": 146
},
{
"const": "05",
"count": 113
},
{
"const": "04",
"count": 146
},
{
"const": "03",
"count": 142
},
{
"const": "02",
"count": 157
},
{
"const": "01",
"count": 161
}
]
},
"instruments": {
"const": "msi",
"count": 4134,
"parentId": "platform:sentinel-2b",
"type": "string"
},
"month": {
"type": "string",
"oneOf": [
{
"const": "12",
"count": 314
},
{
"const": "11",
"count": 298
},
{
"const": "10",
"count": 366
},
{
"const": "09",
"count": 374
},
{
"const": "08",
"count": 386
},
{
"const": "07",
"count": 431
},
{
"const": "06",
"count": 368
},
{
"const": "05",
"count": 371
},
{
"const": "04",
"count": 295
},
{
"const": "03",
"count": 297
},
{
"const": "02",
"count": 327
},
{
"const": "01",
"count": 307
}
]
},
"platform": {
"type": "string",
"oneOf": [
{
"const": "sentinel-2b",
"count": 2006
},
{
"const": "sentinel-2a",
"count": 2128
}
]
},
"year": {
"type": "string",
"oneOf": [
{
"const": "2020",
"count": 1356
},
{
"const": "2019",
"count": 1419
},
{
"const": "2018",
"count": 1359
}
]
},
"eo:cloud_cover": {
"minimum": 0,
"maximum": 100
},
"eo:bands": [
{
"name": "B01",
"common_name": "coastal",
"center_wavelength": 4.439,
"gsd": 60
},
{
"name": "B02",
"common_name": "blue",
"center_wavelength": 4.966,
"gsd": 10
},
{
"name": "B03",
"common_name": "green",
"center_wavelength": 5.6,
"gsd": 10
},
{
"name": "B04",
"common_name": "red",
"center_wavelength": 6.645,
"gsd": 10
},
{
"name": "B05",
"center_wavelength": 7.039,
"gsd": 20
},
{
"name": "B06",
"center_wavelength": 7.402,
"gsd": 20
},
{
"name": "B07",
"center_wavelength": 7.825,
"gsd": 20
},
{
"name": "B08",
"common_name": "nir",
"center_wavelength": 8.351,
"gsd": 10
},
{
"name": "B8A",
"center_wavelength": 8.648,
"gsd": 20
},
{
"name": "B09",
"center_wavelength": 9.45,
"gsd": 60
},
{
"name": "B10",
"center_wavelength": 1.3735,
"gsd": 60
},
{
"name": "B11",
"common_name": "swir16",
"center_wavelength": 1.6137,
"gsd": 20
},
{
"name": "B12",
"common_name": "swir22",
"center_wavelength": 2.2024,
"gsd": 20
}
]
},
"_vito": {
"data_source": {
"type": "file-s2",
"opensearch_collection_id": "S2",
"opensearch_endpoint": "https://resto.c-scale.zcu.cz",
"provider:backend": "incd"
}
}
}
]
@tcassaert jobs on incd no longer get status updates, because they do not have the jobtracker cronjob. Can we put a helm chart or documentation for that in our repo?
I've added an example, see https://github.com/Open-EO/openeo-geotrellis-kubernetes#deploy-an-openeo-job-tracker-cron-job.
Hi @jdries @tcassaert , we created a cron job following your example and the pod created gives the following error:
WARNING:openeo_driver.jobregistry.elastic:In context 'init GpsBatchJobs': caught EjrError("Env var 'OPENEO_EJR_OIDC_CLIENT_SECRET' must be set")
Traceback (most recent call last):
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/util/logging.py", line 317, in just_log_exceptions
yield
File "/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/backend.py", line 938, in get_or_build_elastic_job_registry
job_registry = ElasticJobRegistry.from_environ()
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/jobregistry.py", line 107, in from_environ
raise EjrError("Env var 'OPENEO_EJR_OIDC_CLIENT_SECRET' must be set")
openeo_driver.jobregistry.EjrError: Env var 'OPENEO_EJR_OIDC_CLIENT_SECRET' must be set
INFO:openeo_driver.jobregistry.elastic:Creating ElasticJobRegistry with backend_id='undefined' and api_url='https://jobregistry.openeo.vito.be'
WARNING:openeo_driver.jobregistry.elastic:In context 'init JobTracker': caught EjrError("Env var 'OPENEO_EJR_OIDC_CLIENT_SECRET' must be set")
Traceback (most recent call last):
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/util/logging.py", line 317, in just_log_exceptions
yield
File "/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/backend.py", line 938, in get_or_build_elastic_job_registry
job_registry = ElasticJobRegistry.from_environ()
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/jobregistry.py", line 107, in from_environ
raise EjrError("Env var 'OPENEO_EJR_OIDC_CLIENT_SECRET' must be set")
openeo_driver.jobregistry.EjrError: Env var 'OPENEO_EJR_OIDC_CLIENT_SECRET' must be set
{"message": "In context 'init JobTracker': caught EjrError(\"Env var 'OPENEO_EJR_OIDC_CLIENT_SECRET' must be set\")", "levelname": "WARNING", "name": "openeo_driver.jobregistry.elastic", "created": 1673538615.3881013, "filename": "logging.py", "lineno": 319, "process": 1, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/util/logging.py\", line 317, in just_log_exceptions\n yield\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/backend.py\", line 938, in get_or_build_elastic_job_registry\n job_registry = ElasticJobRegistry.from_environ()\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/jobregistry.py\", line 107, in from_environ\n raise EjrError(\"Env var 'OPENEO_EJR_OIDC_CLIENT_SECRET' must be set\")\nopeneo_driver.jobregistry.EjrError: Env var 'OPENEO_EJR_OIDC_CLIENT_SECRET' must be set"}
@soxofaan is it possible to still work without the elasticsearch job tracker?
@cesarpferreira it seems that these messages are just warnings. I checked in our own jobtracker, and it also outputs this so please ignore for now. I now launched a new job against incd, but it remains in running state. Did you leave the jobtracker cron job running or did you disable it?
To be able to get faster insight in these problems myself, I would either need direct access to kubernetes logs, or we'd need to set up the full log aggregation solution for openEO.
Hi @jdries, the jobtracker cron job is still running and we already found it is giving info about jobs after the warnings. We also noticed that when a new job is launched the driver pod is stuck in pending state with the following error:
Unable to attach or mount volumes: unmounted volumes=[http-credentials-configmap], unattached volumes=[shared-pod-volume http-credentials-configmap job-70ab817083-cd699ad346138df0ae05-prom-conf-vol spark-local-dir-1 spark-conf-volume-driver kube-api-access-qqdfp eodata]: timed out waiting for the condition
@cesarpferreira that's the real issue! @tcassaert do we have a helm chart for that credentials configmap? Can be without or dummy credentials.
We have that resource in Terraform, but it can be easily created in Kubernetes:
apiVersion: v1
kind: ConfigMap
metadata:
name: http-credentials
namespace: spark-jobs
data:
http_credentials.json: |-
{
"services.terrascope.be": {
"password": "verysecretpassword",
"username": "some_username"
}
}
The credentials that we are "missing" are supposed to grant us access where? services.terrascope.be ???
Hi @jdries @tcassaert, thank you for the feedback. This issue with the job driver is recent: in previous versions of openeo-geotrellis-kube this error did not happen — the driver used to run and executors were created.
The credentials config can be used for remote data access to catalogs that require basic authentication. services.terrascope.be would be one example. The issue is indeed recent, because we're working very actively on the kubernetes deployment. We need to discuss on friday on how to better stick to a more 'stable' version of the image, but there will always be changes.
We also have this issue in the pipeline: https://github.com/Open-EO/openeo-geopyspark-driver/issues/284 Which would allow you to configure your batch jobs better to not have certain dependencies, like specific configmaps.
We configured the config map with credentials for resto catalog and we got this error in the driver logs:
Your openEO batch job failed: CollectionNotFoundException(status_code=404, code='CollectionNotFound', message="Collection 'SENTINEL2_L1C_INCD' does not exist.", id='no-request')
Traceback (most recent call last):
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py", line 237, in get_collection_metadata
return self._catalog[collection_id]
KeyError: 'SENTINEL2_L1C_INCD'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 354, in main
run_driver()
File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 325, in run_driver
run_job(
File "/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py", line 49, in memory_logging_wrapper
return function(*args, **kwargs)
File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 415, in run_job
result = ProcessGraphDeserializer.evaluate(process_graph, env=env, do_dry_run=tracer)
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 331, in evaluate
convert_node(result_node, env=env.push({ENV_DRY_RUN_TRACER: dry_run_tracer}))
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 349, in convert_node
process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 1470, in apply_process
args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 1470, in <dictcomp>
args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 361, in convert_node
return convert_node(processGraph['node'], env=env)
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 349, in convert_node
process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 1470, in apply_process
args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 1470, in <dictcomp>
args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 361, in convert_node
return convert_node(processGraph['node'], env=env)
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 349, in convert_node
process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 1470, in apply_process
args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 1470, in <dictcomp>
args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 361, in convert_node
return convert_node(processGraph['node'], env=env)
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 349, in convert_node
process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 1565, in apply_process
return process_function(args=args, env=env)
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py", line 525, in load_collection
metadata = env.backend_implementation.catalog.get_collection_metadata(collection_id)
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py", line 239, in get_collection_metadata
raise CollectionNotFoundException(collection_id)
openeo_driver.errors.CollectionNotFoundException: Collection 'SENTINEL2_L1C_INCD' does not exist.
{"message": "Your openEO batch job failed: CollectionNotFoundException(status_code=404, code='CollectionNotFound', message=\"Collection 'SENTINEL2_L1C_INCD' does not exist.\", id='no-request')", "levelname": "ERROR", "name": "openeo-user-log", "created": 1673948565.2846382, "filename": "batch_job.py", "lineno": 373, "process": 71, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py\", line 237, in get_collection_metadata\n return self._catalog[collection_id]\nKeyError: 'SENTINEL2_L1C_INCD'\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 354, in main\n run_driver()\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 325, in run_driver\n run_job(\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py\", line 49, in memory_logging_wrapper\n return function(*args, **kwargs)\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 415, in run_job\n result = ProcessGraphDeserializer.evaluate(process_graph, env=env, do_dry_run=tracer)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 331, in evaluate\n convert_node(result_node, env=env.push({ENV_DRY_RUN_TRACER: dry_run_tracer}))\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in apply_process\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in 
<dictcomp>\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 361, in convert_node\n return convert_node(processGraph['node'], env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in apply_process\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in <dictcomp>\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 361, in convert_node\n return convert_node(processGraph['node'], env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in apply_process\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1470, in <dictcomp>\n args = {name: convert_node(expr, env=env) for (name, expr) in sorted(args.items())}\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 361, in convert_node\n return convert_node(processGraph['node'], env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 349, in convert_node\n 
process_result = apply_process(process_id=process_id, args=processGraph.get('arguments', {}),\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 1565, in apply_process\n return process_function(args=args, env=env)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/ProcessGraphDeserializer.py\", line 525, in load_collection\n metadata = env.backend_implementation.catalog.get_collection_metadata(collection_id)\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/backend.py\", line 239, in get_collection_metadata\n raise CollectionNotFoundException(collection_id)\nopeneo_driver.errors.CollectionNotFoundException: Collection 'SENTINEL2_L1C_INCD' does not exist.", "job_id": "j-bb5783dea52e4c8888ac4ebf4b882c84", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
Also, in the jupyter notebook the job finishes with this output:
2023-01-17 09:43:17,418 - cached_job - INFO -
Your batch job 'j-bb5783dea52e4c8888ac4ebf4b882c84' failed.
Logs can be inspected in an openEO (web) editor or with `connection.job('j-bb5783dea52e4c8888ac4ebf4b882c84').logs()`.
2023-01-17 09:43:17,422 - cached_job - INFO - Printing logs:
2023-01-17 09:44:17,669 - cached_job - INFO - [{'id': '-1', 'code': 'Internal', 'level': 'error', 'message': 'Log collection failed: ConnectionError(\'N/A\', "(<urllib3.connection.HTTPSConnection object at 0x7f7f4d4a2d30>, \'Connection to es-infra.vgt.vito.be timed out. (connect timeout=60)\')", ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f7f4d4a2d30>, \'Connection to es-infra.vgt.vito.be timed out. (connect timeout=60)\'))'}]
Looks like the batch job does not find the custom layercatalog.json. Can you remind me how you eventually configured that for the regular webapp in values.yaml? (Default is /opt/layercatalog.json)
We created a configmap that maps a file we created to /opt/layercatalog.json
ok, so to do the same for batch jobs, we again need this issue to be solved: https://github.com/Open-EO/openeo-geopyspark-driver/issues/284 @JeroenVerstraelen Can you give an idea on when we can have a first version?
@soxofaan is it possible to still work without the elasticsearch job tracker?
@jdries @cesarpferreira
(Sorry for delayed answer, apparently the github notifications on this project end up in different mailbox I normally use)
Indeed, these openeo_driver.jobregistry.elastic
logs are just warnings from a new (experimental) subsystem and can be ignored in this case. Normal execution should not be impacted.
Hi guys, get ready for a long one.
We have looked into https://openeo.org/documentation/1.0/python/#execution to get some extra info on how to interact with the endpoint and we altered the notebook so we can manipulate specific bands.
While connecting to the backend we can list the available collections:
As you can see from the image above, the collection name is correct, and from the image below you can see that it points to the correct URL:
We tried to monitor the job and see if it ran properly, but it halts with a " Connection to es-infra.vgt.vito.be timed out.":
As @cesarpferreira has mentioned in https://github.com/c-scale-community/use-case-aquamonitor/issues/26#issuecomment-1385130001 the collection is not found, which is weird as you can see from the messages above the endpoint maps to the correct collection.
Any thoughts on this?
Yes, as mentioned a few comments back, we need to be able to also configure your custom layercatalog.json for batch jobs. This issue: https://github.com/Open-EO/openeo-geopyspark-driver/issues/284 is now fixed, so we can do exactly that.
We will need to override this file in the driver: /opt/venv/lib64/python3.8/site-packages/openeogeotrellis/deploy/sparkapplication.yaml.j2 I believe we can also do this with a config map.
The contents of this file should extend the block for config maps, I think it would look like (not tested):
{% extends "sparkapplication_template.yaml.j2" %}
{% block volumes %}
{{ super() }}
- name: "layercatalog-configmap"
configMap:
name: "layercatalog"
{% endblock %}
{% block driver_volumeMounts %}
{{ super() }}
- name: "layercatalog-configmap"
mountPath: "/opt/layercatalog.json"
subPath: "layercatalog.json"
{% endblock %}
We can discuss further in the meeting today!
@zbenta Could I get a copy of the error you get (the one saying "looking for
Currently it seems unlikely that it's caused by the template variables, so I need to take a deeper look into what could be causing it.
Hi everyone, it looks like we fixed previous errors we were having and now the job driver mounts layercatalog.json correctly. We had to change the name of the configmap (in volumes) to the correct one in the configmap yaml file and also fix indentation:
apiVersion: v1
kind: ConfigMap
metadata:
name: sparkapplicationyaml
namespace: spark-jobs
data:
sparkapplication.yaml.j2: |-
{% extends "sparkapplication_template.yaml.j2" %}
{% block volumes %}
{{ super() }}
- name: "incdcatalog"
configMap:
name: "incdcatalog"
{% endblock %}
{% block driver_volumeMounts %}
{{ super() }}
- name: "incdcatalog"
mountPath: "/opt/layercatalog.json"
subPath: "layercatalog.json"
{% endblock %}
Now it seems the jobs start with no problems, but then they fail and we cannot retrieve the job logs. It is trying to connect to es-infra.vgt.vito.be. Is it possible to change this via configuration?
In the job driver pod logs we find:
Your openEO batch job failed: 'Java heap space'
Traceback (most recent call last):
File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 354, in main
run_driver()
File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 325, in run_driver
run_job(
File "/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py", line 49, in memory_logging_wrapper
return function(*args, **kwargs)
File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 450, in run_job
assets_metadata = result.write_assets(str(output_file))
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/save_result.py", line 112, in write_assets
return self.cube.write_assets(filename=directory, format=self.format, format_options=self.options)
File "/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/geopysparkdatacube.py", line 1715, in write_assets
asset_paths = get_jvm().org.openeo.geotrellis.netcdf.NetCDFRDDWriter.writeRasters(
File "/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1309, in __call__
return_value = get_return_value(
File "/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.openeo.geotrellis.netcdf.NetCDFRDDWriter.writeRasters.
: java.lang.OutOfMemoryError: Java heap space
at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
at java.base/java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:120)
at java.base/java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:95)
at java.base/java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:156)
at java.base/sun.net.www.http.PosterOutputStream.write(PosterOutputStream.java:78)
at software.amazon.awssdk.utils.IoUtils.copy(IoUtils.java:113)
at software.amazon.awssdk.utils.IoUtils.copy(IoUtils.java:99)
at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.lambda$null$0(UrlConnectionHttpClient.java:208)
at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable$$Lambda$3072/0x000000084128e840.get(Unknown Source)
at software.amazon.awssdk.utils.FunctionalUtils.lambda$safeSupplier$4(FunctionalUtils.java:108)
at software.amazon.awssdk.utils.FunctionalUtils$$Lambda$2895/0x00000008411f7c40.get(Unknown Source)
at software.amazon.awssdk.utils.FunctionalUtils.invokeSafely(FunctionalUtils.java:136)
at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.lambda$call$1(UrlConnectionHttpClient.java:208)
at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable$$Lambda$3071/0x000000084128e440.accept(Unknown Source)
at java.base/java.util.Optional.ifPresent(Optional.java:183)
at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.call(UrlConnectionHttpClient.java:207)
at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.call(UrlConnectionHttpClient.java:193)
at software.amazon.awssdk.core.internal.util.MetricUtils.measureDurationUnsafe(MetricUtils.java:64)
at software.amazon.awssdk.core.internal.http.pipeline.stages.MakeHttpRequestStage.executeHttpRequest(MakeHttpRequestStage.java:76)
at software.amazon.awssdk.core.internal.http.pipeline.stages.MakeHttpRequestStage.execute(MakeHttpRequestStage.java:55)
at software.amazon.awssdk.core.internal.http.pipeline.stages.MakeHttpRequestStage.execute(MakeHttpRequestStage.java:39)
at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:73)
at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:42)
at software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:77)
at software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:39)
at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:50)
at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:36)
at software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:64)
{"message": "Your openEO batch job failed: 'Java heap space'", "levelname": "ERROR", "name": "openeo-user-log", "created": 1674227927.7649803, "filename": "batch_job.py", "lineno": 372, "process": 73, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 354, in main\n run_driver()\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 325, in run_driver\n run_job(\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py\", line 49, in memory_logging_wrapper\n return function(*args, **kwargs)\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 450, in run_job\n assets_metadata = result.write_assets(str(output_file))\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/save_result.py\", line 112, in write_assets\n return self.cube.write_assets(filename=directory, format=self.format, format_options=self.options)\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/geopysparkdatacube.py\", line 1715, in write_assets\n asset_paths = get_jvm().org.openeo.geotrellis.netcdf.NetCDFRDDWriter.writeRasters(\n File \"/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py\", line 1309, in __call__\n return_value = get_return_value(\n File \"/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py\", line 326, in get_return_value\n raise Py4JJavaError(\npy4j.protocol.Py4JJavaError: An error occurred while calling z:org.openeo.geotrellis.netcdf.NetCDFRDDWriter.writeRasters.\n: java.lang.OutOfMemoryError: Java heap space\n\tat java.base/java.util.Arrays.copyOf(Arrays.java:3745)\n\tat java.base/java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:120)\n\tat java.base/java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:95)\n\tat java.base/java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:156)\n\tat 
java.base/sun.net.www.http.PosterOutputStream.write(PosterOutputStream.java:78)\n\tat software.amazon.awssdk.utils.IoUtils.copy(IoUtils.java:113)\n\tat software.amazon.awssdk.utils.IoUtils.copy(IoUtils.java:99)\n\tat software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.lambda$null$0(UrlConnectionHttpClient.java:208)\n\tat software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable$$Lambda$3072/0x000000084128e840.get(Unknown Source)\n\tat software.amazon.awssdk.utils.FunctionalUtils.lambda$safeSupplier$4(FunctionalUtils.java:108)\n\tat software.amazon.awssdk.utils.FunctionalUtils$$Lambda$2895/0x00000008411f7c40.get(Unknown Source)\n\tat software.amazon.awssdk.utils.FunctionalUtils.invokeSafely(FunctionalUtils.java:136)\n\tat software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.lambda$call$1(UrlConnectionHttpClient.java:208)\n\tat software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable$$Lambda$3071/0x000000084128e440.accept(Unknown Source)\n\tat java.base/java.util.Optional.ifPresent(Optional.java:183)\n\tat software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.call(UrlConnectionHttpClient.java:207)\n\tat software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.call(UrlConnectionHttpClient.java:193)\n\tat software.amazon.awssdk.core.internal.util.MetricUtils.measureDurationUnsafe(MetricUtils.java:64)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.MakeHttpRequestStage.executeHttpRequest(MakeHttpRequestStage.java:76)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.MakeHttpRequestStage.execute(MakeHttpRequestStage.java:55)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.MakeHttpRequestStage.execute(MakeHttpRequestStage.java:39)\n\tat 
software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:73)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:42)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:77)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:39)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:50)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:36)\n\tat software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:64)\n", "job_id": "j-02cfa23896c9437b8909dd608a402bdb", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
Also, in the Jupyter notebook there is no progress info for the job, as you can see below — maybe something is missing in our configs?
Great! Memory issues can in general be solved by config parameters in your test script. This is an example with most of the configurables, in this case you want to increase driver-memory or executor-memory. Whether or not the memory issue is 'normal' depends on your test script, we could also look at that.
For configuring logging, this requires a setup with filebeat pushing logs into elasticsearch. We briefly touched upon it during the meeting. We can arrange a call with @tcassaert to specifically discuss the details of how we do it.
job_options = {
"driver-memory": "4G",
"driver-memoryOverhead": "1G",
"driver-cores": "1",
"executor-memory": "512m",
"executor-memoryOverhead": "1G",
"executor-cores": "1",
"executor-request-cores": "800m",
"max-executors": "10",
"executor-threads-jvm": "7",
"logging-threshold": "info",
"mount_tmp": False,
"goofys": "false"
}
cube.execute_batch("incd-test-endpoint.tiff",job_options=job_options)
Hi @jdries, thanks for the feedback. We tried to run a job with these job_options and it failed with the following job driver pod logs:
Your openEO batch job failed during Spark execution: 'Job aborted due to stage failure: ShuffleMapStage 14 (map at package.scala:134) has failed the maximum allowable number of times: 4. Most recent failure reason:\norg.apache.spark.shuffle.MetadataF...'
Traceback (most recent call last):
File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 354, in main
run_driver()
File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 325, in run_driver
run_job(
File "/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py", line 49, in memory_logging_wrapper
return function(*args, **kwargs)
File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 450, in run_job
assets_metadata = result.write_assets(str(output_file))
File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/save_result.py", line 112, in write_assets
return self.cube.write_assets(filename=directory, format=self.format, format_options=self.options)
File "/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/geopysparkdatacube.py", line 1591, in write_assets
timestamped_paths = get_jvm().org.openeo.geotrellis.geotiff.package.saveRDDTemporal(
File "/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1309, in __call__
return_value = get_return_value(
File "/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.openeo.geotrellis.geotiff.package.saveRDDTemporal.
: org.apache.spark.SparkException: Job aborted due to stage failure: ShuffleMapStage 14 (map at package.scala:134) has failed the maximum allowable number of times: 4. Most recent failure reason:
org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 5 partition 6
at org.apache.spark.MapOutputTracker$.validateStatus(MapOutputTracker.scala:1580)
at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10(MapOutputTracker.scala:1527)
at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10$adapted(MapOutputTracker.scala:1526)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at org.apache.spark.MapOutputTracker$.convertMapStatuses(MapOutputTracker.scala:1526)
at org.apache.spark.MapOutputTrackerWorker.getPushBasedShuffleMapSizesByExecutorId(MapOutputTracker.scala:1192)
at org.apache.spark.MapOutputTracker.getMapSizesByExecutorId(MapOutputTracker.scala:543)
at org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:140)
at org.apache.spark.shuffle.ShuffleManager.getReader(ShuffleManager.scala:63)
at org.apache.spark.shuffle.ShuffleManager.getReader$(ShuffleManager.scala:57)
at org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:73)
at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:106)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at geotrellis.spark.ContextRDD.compute(ContextRDD.scala:36)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at geotrellis.spark.ContextRDD.compute(ContextRDD.scala:36)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at geotrellis.spark.ContextRDD.compute(ContextRDD.scala:36)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at geotrellis.spark.ContextRDD.compute(ContextRDD.scala:36)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1792)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2588)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
at org.openeo.geotrellis.geotiff.package$.saveRDDTemporal(package.scala:138)
at org.openeo.geotrellis.geotiff.package.saveRDDTemporal(package.scala)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:829)
{"message": "Your openEO batch job failed during Spark execution: 'Job aborted due to stage failure: ShuffleMapStage 14 (map at package.scala:134) has failed the maximum allowable number of times: 4. Most recent failure reason:\\norg.apache.spark.shuffle.MetadataF...'", "levelname": "ERROR", "name": "openeo-user-log", "created": 1674467417.7287252, "filename": "batch_job.py", "lineno": 372, "process": 71, "exc_info": "Traceback (most recent call last):\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 354, in main\n run_driver()\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 325, in run_driver\n run_job(\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py\", line 49, in memory_logging_wrapper\n return function(*args, **kwargs)\n File \"/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py\", line 450, in run_job\n assets_metadata = result.write_assets(str(output_file))\n File \"/opt/openeo/lib/python3.8/site-packages/openeo_driver/save_result.py\", line 112, in write_assets\n return self.cube.write_assets(filename=directory, format=self.format, format_options=self.options)\n File \"/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/geopysparkdatacube.py\", line 1591, in write_assets\n timestamped_paths = get_jvm().org.openeo.geotrellis.geotiff.package.saveRDDTemporal(\n File \"/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py\", line 1309, in __call__\n return_value = get_return_value(\n File \"/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py\", line 326, in get_return_value\n raise Py4JJavaError(\npy4j.protocol.Py4JJavaError: An error occurred while calling z:org.openeo.geotrellis.geotiff.package.saveRDDTemporal.\n: org.apache.spark.SparkException: Job aborted due to stage failure: ShuffleMapStage 14 (map at package.scala:134) has failed the maximum allowable number of times: 4. 
Most recent failure reason:\norg.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 5 partition 6\n\tat org.apache.spark.MapOutputTracker$.validateStatus(MapOutputTracker.scala:1580)\n\tat org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10(MapOutputTracker.scala:1527)\n\tat org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10$adapted(MapOutputTracker.scala:1526)\n\tat scala.collection.Iterator.foreach(Iterator.scala:941)\n\tat scala.collection.Iterator.foreach$(Iterator.scala:941)\n\tat scala.collection.AbstractIterator.foreach(Iterator.scala:1429)\n\tat org.apache.spark.MapOutputTracker$.convertMapStatuses(MapOutputTracker.scala:1526)\n\tat org.apache.spark.MapOutputTrackerWorker.getPushBasedShuffleMapSizesByExecutorId(MapOutputTracker.scala:1192)\n\tat org.apache.spark.MapOutputTracker.getMapSizesByExecutorId(MapOutputTracker.scala:543)\n\tat org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:140)\n\tat org.apache.spark.shuffle.ShuffleManager.getReader(ShuffleManager.scala:63)\n\tat org.apache.spark.shuffle.ShuffleManager.getReader$(ShuffleManager.scala:57)\n\tat org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:73)\n\tat org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:106)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat geotrellis.spark.ContextRDD.compute(ContextRDD.scala:36)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat 
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat geotrellis.spark.ContextRDD.compute(ContextRDD.scala:36)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat geotrellis.spark.ContextRDD.compute(ContextRDD.scala:36)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat geotrellis.spark.ContextRDD.compute(ContextRDD.scala:36)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:337)\n\tat org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)\n\tat org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:131)\n\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)\n\tat 
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)\n\tat java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)\n\tat java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)\n\tat java.base/java.lang.Thread.run(Thread.java:829)\n\n\tat org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)\n\tat scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)\n\tat scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1792)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2588)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)\n\tat org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat 
org.apache.spark.rdd.RDD.withScope(RDD.scala:414)\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:1029)\n\tat org.openeo.geotrellis.geotiff.package$.saveRDDTemporal(package.scala:138)\n\tat org.openeo.geotrellis.geotiff.package.saveRDDTemporal(package.scala)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:829)\n", "job_id": "j-5928447cdedd4819854b8d6adc255708", "user_id": "cd699ad346138df0ae05cd580df3c01c2c744b5714d71f6cac72a94b6a55399f@egi.eu"}
Do we also need to change the driver and executor memory in our values.yaml? We have defined 512m of memory for both driver and executor.
For our testing we follow the Jupyter notebook from this repository up to the point of establishing a connection with our endpoint, and then we use the following code:
cube = (
con.load_collection(
"SENTINEL2_L1C_INCD",
temporal_extent=["2020-11-27", "2020-12-28"],
bands=[ "B02", "B03", "B04"],
spatial_extent={"west": -7.6, "south": 36.0, "east": -5.0, "north": 38.6},
)
)
job_options = {
"driver-memory": "4G",
"driver-memoryOverhead": "1G",
"driver-cores": "1",
"executor-memory": "512m",
"executor-memoryOverhead": "1G",
"executor-cores": "1",
"executor-request-cores": "800m",
"max-executors": "10",
"executor-threads-jvm": "7",
"logging-threshold": "info",
"mount_tmp": False,
"goofys": "false"
}
result = cube.save_result("GTiff")
job = result.create_job(job_options=job_options)
job.start_and_wait()
Could this error happen because of this volume?
Hi, can you try increasing executor-memory, for instance to '2G'? The error indicates that an executor has crashed, but to find the actual reasons we would need more logs. If you have the 'spark-history' component running, you would also be able to see specific tasks errors in the web interface.
Another thing to try is using a smaller bounding box in your test, like: {"west": -7.6, "south": 36.0, "east": -7.5, "north": 36.1},
Doing this allows us to see if basic things work, before we try running larger areas.
Hi @jdries, We changed the driver and executor memory in our values.yaml to 4 GB, and in the job options we made the suggested changes.
For our test we did the following:
cube = (
con.load_collection(
"SENTINEL2_L1C_INCD",
temporal_extent=["2020-11-27", "2020-12-28"],
bands=[ "B02", "B03", "B04"],
spatial_extent={"west": -7.6, "south": 36.0, "east": -7.5, "north": 36.1},
)
)
job_options = {
"driver-memory": "4G",
"driver-memoryOverhead": "1G",
"driver-cores": "1",
"executor-memory": "2G",
"executor-memoryOverhead": "1G",
"executor-cores": "1",
"executor-request-cores": "800m",
"max-executors": "10",
"executor-threads-jvm": "7",
"logging-threshold": "info",
"mount_tmp": False,
"goofys": "false"
}
result = cube.save_result("GTiff")
job = result.create_job(job_options=job_options)
job.start_and_wait()
results = job.get_results()
results.download_files(Path("./output") / "incd_Result.tiff")
The job execution itself completed without errors, but the final step produced the following error:
---------------------------------------------------------------------------
OpenEoApiError Traceback (most recent call last)
/tmp/ipykernel_71/1826437420.py in <module>
----> 1 results.download_files(Path("./output") / "incd_Result.tiff")
/opt/conda/lib/python3.9/site-packages/openeo/rest/job.py in download_files(self, target, include_stac_metadata)
412 ensure_dir(target)
413
--> 414 downloaded = [a.download(target) for a in self.get_assets()]
415
416 if include_stac_metadata:
/opt/conda/lib/python3.9/site-packages/openeo/rest/job.py in get_assets(self)
347 """
348 # TODO: add arguments to filter on metadata, e.g. to only get assets of type "image/tiff"
--> 349 metadata = self.get_metadata()
350 if "assets" in metadata:
351 # API 1.0 style: dictionary mapping filenames to metadata dict (with at least a "href" field)
/opt/conda/lib/python3.9/site-packages/openeo/rest/job.py in get_metadata(self, force)
337 """Get batch job results metadata (parsed JSON)"""
338 if self._results is None or force:
--> 339 self._results = self._job.connection.get(self._results_url, expected_status=200).json()
340 return self._results
341
/opt/conda/lib/python3.9/site-packages/openeo/rest/connection.py in get(self, path, stream, auth, **kwargs)
161 :return: response: Response
162 """
--> 163 return self.request("get", path=path, stream=stream, auth=auth, **kwargs)
164
165 def post(self, path, json: dict = None, **kwargs) -> Response:
/opt/conda/lib/python3.9/site-packages/openeo/rest/connection.py in request(self, method, path, headers, auth, check_error, expected_status, **kwargs)
596 try:
597 # Initial request attempt
--> 598 return _request()
599 except OpenEoApiError as api_exc:
600 if api_exc.http_status_code == 403 and api_exc.code == "TokenInvalid":
/opt/conda/lib/python3.9/site-packages/openeo/rest/connection.py in _request()
589 # Do request, but with retry when access token has expired and refresh token is available.
590 def _request():
--> 591 return super(Connection, self).request(
592 method=method, path=path, headers=headers, auth=auth,
593 check_error=check_error, expected_status=expected_status, **kwargs,
/opt/conda/lib/python3.9/site-packages/openeo/rest/connection.py in request(self, method, path, headers, auth, check_error, expected_status, **kwargs)
119 expected_status = ensure_list(expected_status) if expected_status else []
120 if check_error and status >= 400 and status not in expected_status:
--> 121 self._raise_api_error(resp)
122 if expected_status and status not in expected_status:
123 raise OpenEoRestError("Got status code {s!r} for `{m} {p}` (expected {e!r})".format(
/opt/conda/lib/python3.9/site-packages/openeo/rest/connection.py in _raise_api_error(self, response)
150 else:
151 exception = OpenEoApiError(http_status_code=status_code, message=text)
--> 152 raise exception
153
154 def get(self, path, stream=False, auth: AuthBase = None, **kwargs) -> Response:
OpenEoApiError: [500] Internal: Server error: TypeError('can only concatenate str (not "NoneType") to str') (ref: r-e04e1daaec594932a005b20e45032997)
Upgrade openEO to the latest version. @zbenta We updated our documentation to reflect the latest upgrades to the software stack: https://github.com/Open-EO/openeo-geotrellis-kubernetes/commit/a8147962d49555d366b00f64f7b27ff62c5f712e
Can you try again based on those instructions, and report issues here? @tcassaert is available to follow up more quickly (than I can)!