Open-EO / openeo-geotrellis-extensions

Java/Scala extensions for GeoTrellis, for use with the OpenEO GeoPySpark backend.
Apache License 2.0

netCDF writer out of driver memory #199

Closed — jdries closed this issue 9 months ago

jdries commented 1 year ago

Writing large netCDFs still makes the driver go out of memory, which should not happen because we write in a streaming manner. Investigate whether something is still suboptimal.

```json
{
  "process_graph": {
    "loadco1": {
      "arguments": {
        "bands": ["B03", "B04", "B02"],
        "id": "SENTINEL2_L2A",
        "spatial_extent": {
          "east": 13.132874787754762,
          "north": 42.192832302555644,
          "south": 41.200957744003716,
          "west": 11.966782528409608
        },
        "temporal_extent": ["2023-08-18T00:00:00Z", null]
      },
      "description": "Load the data, including the bands:\r\n- G = B03\r\n- R = B04\r\n- B = B02",
      "process_id": "load_collection"
    },
    "reduce1": {
      "arguments": {
        "data": {"from_node": "loadco1"},
        "dimension": "bands",
        "reducer": {
          "process_graph": {
            "add1": {
              "arguments": {"x": {"from_node": "multip2"}, "y": {"from_node": "arraye2"}},
              "process_id": "add"
            },
            "add2": {
              "arguments": {"x": {"from_node": "add1"}, "y": {"from_node": "arraye3"}},
              "process_id": "add"
            },
            "arraye1": {
              "arguments": {"data": {"from_parameter": "data"}, "index": 0},
              "process_id": "array_element"
            },
            "arraye2": {
              "arguments": {"data": {"from_parameter": "data"}, "index": 1},
              "process_id": "array_element"
            },
            "arraye3": {
              "arguments": {"data": {"from_parameter": "data"}, "index": 2},
              "process_id": "array_element"
            },
            "divide1": {
              "arguments": {"x": {"from_node": "subtra2"}, "y": {"from_node": "add2"}},
              "process_id": "divide",
              "result": true
            },
            "multip1": {
              "arguments": {"x": 2, "y": {"from_node": "arraye1"}},
              "process_id": "multiply"
            },
            "multip2": {
              "arguments": {"x": 2, "y": {"from_node": "arraye1"}},
              "process_id": "multiply"
            },
            "subtra1": {
              "arguments": {"x": {"from_node": "multip1"}, "y": {"from_node": "arraye2"}},
              "process_id": "subtract"
            },
            "subtra2": {
              "arguments": {"x": {"from_node": "subtra1"}, "y": {"from_node": "arraye3"}},
              "process_id": "subtract"
            }
          }
        }
      },
      "description": "Compute the GLI (Green Leaf Index) for the bands dimension\r\nFormula: (2.0 * G - R - B) / (2.0 * G + R + B)",
      "process_id": "reduce_dimension"
    },
    "savere1": {
      "arguments": {
        "data": {"from_node": "reduce1"},
        "format": "NETCDF"
      },
      "description": "Store as NETCDF",
      "process_id": "save_result",
      "result": true
    }
  }
}
```
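
For reference, the same GLI workflow can also be expressed with the openeo Python client. A minimal sketch, assuming the Copernicus Data Space Ecosystem backend URL; the `band()` arithmetic below generates a `reduce_dimension` over the band dimension equivalent to the explicit sub-graph above:

```python
import openeo

# Backend URL is an assumption; any backend serving SENTINEL2_L2A works similarly
connection = openeo.connect("https://openeo.dataspace.copernicus.eu").authenticate_oidc()

cube = connection.load_collection(
    "SENTINEL2_L2A",
    bands=["B03", "B04", "B02"],  # G, R, B
    spatial_extent={"west": 11.9668, "south": 41.2010, "east": 13.1329, "north": 42.1928},
    temporal_extent=["2023-08-18", None],  # open-ended, like the original graph
)

# GLI = (2.0 * G - R - B) / (2.0 * G + R + B)
g, r, b = cube.band("B03"), cube.band("B04"), cube.band("B02")
gli = (2 * g - r - b) / (2 * g + r + b)

gli.execute_batch(outputfile="gli.nc", out_format="netCDF")
```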
Dave0178 commented 10 months ago

Hello, I'm still unable to run this:

```python
import glob
import time

# `connection` is an authenticated openeo connection (setup omitted here)
s2_cube = connection.load_collection(
    "SENTINEL2_L2A",
    bands=["B04", "B03", "B02", "B08", "SCL"],
    temporal_extent=("2022-01-01", "2022-12-31"),
    spatial_extent={
        "west": -0.166208030000000,
        "south": 11.825353154999900,
        "east": 0.083791970000000,
        "north": 12.075353154999900,
        "crs": "EPSG:4326",
    },
    max_cloud_cover=50,
)

# Options to increase the available memory
job_options = {"driver-memory": "16G"}

# Path patterns for the GeoTIFF and NetCDF files
output_pattern_tiff = "output/batch_job_nc_gtiff/*.tif"
output_pattern_netcdf = "output/batch_job_nc_gtiff/*.nc"

# Get the list of files matching each path pattern
tiff_files = glob.glob(output_pattern_tiff)
netcdf_files = glob.glob(output_pattern_netcdf)

# Check whether GeoTIFF files already exist
if not tiff_files:
    # Save the cube in GeoTIFF format
    s2_cube = s2_cube.save_result(format="GTiff")

    # Run the batch job for the GeoTIFF cube
    job = s2_cube.execute_batch(title="TIFF Slice of S2 data", job_options=job_options)

    # Download the GeoTIFF data into the dedicated folder
    results = job.get_results()
    results.download_files("output/batch_job_nc_gtiff")
else:
    print(f"The following files already exist: {', '.join(tiff_files)}. Not saving again.")

if not netcdf_files:
    # Save the cube in NetCDF format
    s2_cube_netcdf = s2_cube.save_result(format="netCDF")

    # Run the batch job for the NetCDF cube
    job_netcdf = s2_cube_netcdf.execute_batch(title="NetCDF Slice of S2 data")

    # Wait a while (e.g. 10 seconds) before continuing
    time.sleep(10)

    # Wait for the job to finish (the wait time depends on the job duration)
    # (note: execute_batch above already started the job and waited for it,
    # so this call is redundant)
    job_netcdf.start_and_wait()

    # Download the NetCDF data into a dedicated folder
    results_netcdf = job_netcdf.get_results()
    results_netcdf.download_files("output/batch_job_nc_gtiff")
else:
    print(f"The following files already exist: {', '.join(netcdf_files)}. Not saving again.")
```

I'm getting this message: "0:09:48 Job 'j-5721ff00c1cc4c7fa4f642b6b97d93eb': error (progress N/A) Your batch job 'j-5721ff00c1cc4c7fa4f642b6b97d93eb' failed. Error logs: [{'id': '[1697904790296, 1023937]', 'time': '2023-10-21T16:13:10.296Z', 'level': 'error', 'message': "OpenEO batch job failed: Your batch job failed because the 'driver' used too much java memory. Consider increasing driver-memory or contact the developers to investigate."}]". Is this still under investigation?
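
A side note on the snippet above: the `job_options` with the increased `driver-memory` are only passed to the GeoTIFF job, not to the netCDF one. Until the driver memory usage is fixed, passing them to the netCDF job as well may help; a minimal sketch, reusing the names from the snippet:

```python
# Also give the netCDF batch job the increased driver memory
job_netcdf = s2_cube_netcdf.execute_batch(
    title="NetCDF Slice of S2 data",
    job_options={"driver-memory": "16G"},
)
```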

EmileSonneveld commented 10 months ago

As @jdries mentioned: maybe we can automatically determine the required memory when NetCDF output is requested?

jdries commented 10 months ago

Yes, we cannot determine the memory exactly, but we know it should be increased for netCDF.

jdries commented 10 months ago

@Dave0178 yes, we hadn't planned it yet, but we are pushing it higher on the roadmap, as it's annoying to have to change the memory setting manually.

@EmileSonneveld we could try inserting a netcdfFile.flush() call, in the hope that it dumps data to disk and frees memory.

jdries commented 10 months ago

We got one step further: it no longer fails while writing the netCDF, but while uploading to object storage:


  File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 1278, in <module>
    main(sys.argv)
  File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 1013, in main
    run_driver()
  File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 984, in run_driver
    run_job(
  File "/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/utils.py", line 54, in memory_logging_wrapper
    return function(*args, **kwargs)
  File "/opt/openeo/lib64/python3.8/site-packages/openeogeotrellis/deploy/batch_job.py", line 1121, in run_job
    the_assets_metadata = result.write_assets(str(output_file))
  File "/opt/openeo/lib/python3.8/site-packages/openeo_driver/save_result.py", line 113, in write_assets
    return self.cube.write_assets(filename=directory, format=self.format, format_options=self.options)
  File "/opt/openeo/lib/python3.8/site-packages/openeogeotrellis/geopysparkdatacube.py", line 1825, in write_assets
    asset_paths = get_jvm().org.openeo.geotrellis.netcdf.NetCDFRDDWriter.writeRasters(
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.openeo.geotrellis.netcdf.NetCDFRDDWriter.writeRasters.
: java.lang.OutOfMemoryError: Java heap space
    at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
    at java.base/java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:120)
    at java.base/java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:95)
    at java.base/java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:156)
    at java.base/sun.net.www.http.PosterOutputStream.write(PosterOutputStream.java:78)
    at software.amazon.awssdk.utils.IoUtils.copy(IoUtils.java:113)
    at software.amazon.awssdk.utils.IoUtils.copy(IoUtils.java:99)
    at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.lambda$null$0(UrlConnectionHttpClient.java:208)
    at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable$$Lambda$3191/0x00000008413af040.get(Unknown Source)
    at software.amazon.awssdk.utils.FunctionalUtils.lambda$safeSupplier$4(FunctionalUtils.java:108)
    at software.amazon.awssdk.utils.FunctionalUtils$$Lambda$1623/0x0000000840d67840.get(Unknown Source)
    at software.amazon.awssdk.utils.FunctionalUtils.invokeSafely(FunctionalUtils.java:136)
    at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.lambda$call$1(UrlConnectionHttpClient.java:208)
    at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable$$Lambda$1866/0x0000000840e3e040.accept(Unknown Source)
    at java.base/java.util.Optional.ifPresent(Optional.java:183)
    at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.call(UrlConnectionHttpClient.java:207)
    at software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient$RequestCallable.call(UrlConnectionHttpClient.java:193)
    at software.amazon.awssdk.core.internal.util.MetricUtils.measureDurationUnsafe(MetricUtils.java:64)
    at software.amazon.awssdk.core.internal.http.pipeline.stages.MakeHttpRequestStage.executeHttpRequest(MakeHttpRequestStage.java:76)
    at software.amazon.awssdk.core.internal.http.pipeline.stages.MakeHttpRequestStage.execute(MakeHttpRequestStage.java:55)
    at software.amazon.awssdk.core.internal.http.pipeline.stages.MakeHttpRequestStage.execute(MakeHttpRequestStage.java:39)
    at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
    at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
    at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
    at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
    at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:73)
    at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptTimeoutTrackingStage.execute(ApiCallAttemptTimeoutTrackingStage.java:42)
    at software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:77)
    at software.amazon.awssdk.core.internal.http.pipeline.stages.TimeoutExceptionHandlingStage.execute(TimeoutExceptionHandlingStage.java:39)
    at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:50)
    at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallAttemptMetricCollectionStage.execute(ApiCallAttemptMetricCollectionStage.java:36)
    at software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:64)
jdries commented 10 months ago

There's an example of uploading a large file; this is what we need: https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/javav2/example_code/s3/src/main/java/com/example/s3/transfermanager/UploadFile.java

jdries commented 10 months ago

I committed it to a branch! It required an AWS SDK upgrade, so we'll have to see how that goes.

jdries commented 9 months ago

The netCDF driver memory issue is resolved on the staging instance, so it is ready for release. The original process graph in this issue still has another memory issue, potentially caused by the open-ended temporal extent (no end date). I created a separate issue for that: https://github.com/Open-EO/openeo-geotrellis-extensions/issues/238

jdries commented 9 months ago

Additional note: we now bump into the limits of object storage, which caps the file size we can upload. At this point we are talking about netCDF files beyond 1 GB, which are also increasingly hard to manage on the user side. For the original example, we would typically advise the user to apply cloud masking and conversion to byte to drastically reduce the file size and avoid such large extractions.
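
For illustration, such masking and byte conversion could look roughly as follows with the Python client. A sketch only: the SCL class values and the input scale range are assumptions, not prescribed values:

```python
# Continuing from a SENTINEL2_L2A cube loaded with the "SCL" band, e.g. `s2_cube` above
scl = s2_cube.band("SCL")

# Mask cloud-related SCL classes: 3 = cloud shadow, 8/9 = cloud medium/high probability
cloud_mask = (scl == 3) | (scl == 8) | (scl == 9)

# Drop cloudy pixels and rescale reflectance to a byte range to shrink the output file;
# the 0..6000 input range is an assumption about typical reflectance values
small = (
    s2_cube.filter_bands(["B04", "B03", "B02"])
    .mask(cloud_mask)
    .linear_scale_range(0, 6000, 0, 254)
)

small.execute_batch(outputfile="masked_byte.nc", out_format="netCDF")
```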