Run process graphs inside process graphs

jonathom commented 2 years ago

While trying to create a graph for an observed pixel composite (/processes#271), it came up that it is currently not possible to run a process graph inside another process graph. This occurred because we tried to pass a user-input process graph into an apply_dimension process, where it should be executed.

Take a look at the following process graph. The parameter timestep_selector should replace the max reducer, which is only used as a placeholder inside the apply_dimension process. Instead of max, different process graphs should be executable depending on the input, e.g. min or median.

{
  "process_graph": {
    "2": {
      "process_id": "reduce_dimension",
      "arguments": {
        "data": {
          "from_parameter": "data"
        },
        "reducer": {
          "from_parameter": "criteria"
        },
        "dimension": "bands"
      },
      "position": [
        0,
        0
      ]
    },
    "3": {
      "process_id": "apply_dimension",
      "arguments": {
        "data": {
          "from_node": "2"
        },
        "process": {
          "process_graph": {
            "2": {
              "process_id": "array_find",
              "arguments": {
                "value": {
                  "from_node": "3"
                },
                "data": {
                  "from_parameter": "data"
                }
              }
            },
            "3": {
              "process_id": "max",
              "arguments": {
                "data": {
                  "from_parameter": "data"
                }
              }
            },
            "6": {
              "process_id": "array_apply",
              "arguments": {
                "data": {
                  "from_parameter": "data"
                },
                "process": {
                  "process_graph": {
                    "1": {
                      "process_id": "constant",
                      "arguments": {
                        "x": null
                      },
                      "result": true
                    }
                  }
                }
              }
            },
            "7": {
              "process_id": "array_modify",
              "arguments": {
                "data": {
                  "from_node": "6"
                },
                "values": {
                  "from_node": "3"
                },
                "index": {
                  "from_node": "2"
                }
              },
              "description": "Create an array with null values",
              "result": true
            }
          }
        },
        "dimension": "t"
      },
      "position": [
        240,
        0
      ]
    },
    "4": {
      "process_id": "merge_cubes",
      "arguments": {
        "cube2": {
          "from_node": "3"
        },
        "cube1": {
          "from_parameter": "data"
        },
        "overlap_resolver": {
          "process_graph": {
            "2": {
              "process_id": "if",
              "arguments": {
                "value": {
                  "from_node": "4"
                },
                "accept": {
                  "from_parameter": "x"
                }
              },
              "result": true
            },
            "4": {
              "process_id": "neq",
              "arguments": {
                "x": {
                  "from_parameter": "y"
                },
                "y": null
              }
            }
          }
        }
      },
      "position": [
        480,
        0
      ]
    },
    "5": {
      "process_id": "reduce_dimension",
      "arguments": {
        "data": {
          "from_node": "4"
        },
        "reducer": {
          "process_graph": {
            "1": {
              "process_id": "first",
              "arguments": {
                "data": {
                  "from_parameter": "data"
                }
              },
              "result": true
            }
          }
        },
        "dimension": "t"
      },
      "result": true,
      "position": [
        720,
        0
      ]
    }
  },
  "parameters": [
    {
      "schema": {
        "type": "object",
        "subtype": "raster-cube",
        "title": "Raster data cube",
        "description": "A raster data cube, an image collection stored at the back-end. Different back-ends have different internal representations for this data structure."
      },
      "name": "data"
    },
    {
      "name": "criteria",
      "description": "A reducer to summarize the bands dimension, e.g. ``sum`` or ``mean()``.",
      "schema": {
        "type": "object",
        "subtype": "process-graph",
        "title": "User-defined process",
        "description": "An process graph that is passed as an argument and is expected to be executed by the process. Parameters passed to the process graph are specified in the `parameters` property of the corresponding schema."
      }
    },
    {
      "schema": {
        "type": "object",
        "subtype": "process-graph",
        "title": "User-defined process",
        "description": "An process graph that is passed as an argument and is expected to be executed by the process. Parameters passed to the process graph are specified in the `parameters` property of the corresponding schema.",
        "required": [
          "process_graph"
        ],
        "properties": {
          "process_graph": {
            "type": "object",
            "additionalProperties": {
              "type": "object",
              "required": [
                "process_id",
                "arguments"
              ],
              "properties": {
                "process_id": {
                  "type": "string"
                },
                "arguments": {}
              }
            }
          }
        }
      },
      "name": "timestep_selector",
      "description": "A process to select a timestep, e.g. `max`, `median`. Can't be a process that returns unobserved values, e.g. `mean`."
    }
  ],
  "id": "observed_pixel_composite",
  "description": "Create a composite over time that contains only observed combinations of band values. A single point in time is selected per pixel to fill in all band values."
}

m-mohr commented 2 years ago

We could allow from_parameter/from_node in the process_id property. Or we need a process to run another process by name. Something like call_user_func in PHP or something like var x = 'mean'; x([1,2]) in JS.

soxofaan commented 2 years ago

This is not going to be an API v1.x thing, I guess?

m-mohr commented 2 years ago

No, unfortunately, this would be 2.0.

soxofaan commented 2 years ago

A 1.0 compatible solution to implement this kind of pluggable "callback" feature, I think:

1. Store the user-defined callback as a UDP with some predefined process name, e.g. "foo".
2. Call this UDP in the reduce_dimension of the parent graph ({"process_id": "foo", "namespace": "user", ...}).

Neither very scalable nor maintainable, but it could work.

m-mohr commented 2 years ago

For the use case where you want to have a "variable" process id, I think the solution that integrates best without a breaking change would be to create a new process that allows calling another process.

So if you'd need something like this:

{
  "process_graph": {
    "load": {
      "process_id": "load_collection",
      "arguments": {
        "id": "COPERNICUS_30",
        "spatial_extent": null,
        "temporal_extent": null
      }
    },
    "reduce": {
      "process_id": "reduce_dimension",
      "arguments": {
        "reducer": {
          "process_graph": {
            "stat": {
              "process_id": { "from_parameter": "method" }, // <-- this is not possible, but useful
              "arguments": {
                "data": { "from_parameter": "data" }
              },
              "result": true
            }
          }
        },
        "dimension": "bands",
        "data": { "from_node": "load" }
      },
      "result": true
    }
  }
}

Instead, with a new process (e.g. call_process(string id, object arguments) : any) you could achieve this as follows:

{
  "process_graph": {
    "load": {
      "process_id": "load_collection",
      "arguments": {
        "id": "COPERNICUS_30",
        "spatial_extent": null,
        "temporal_extent": null
      }
    },
    "reduce": {
      "process_id": "reduce_dimension",
      "arguments": {
        "reducer": {
          "process_graph": {
            "stat": {
              "process_id": "call_process",
              "arguments": {
                "id": {
                  "from_parameter": "method"
                },
                "arguments": {
                  "data": {
                    "from_parameter": "data"
                  }
                }
              },
              "result": true
            }
          }
        },
        "dimension": "bands",
        "data": {
          "from_node": "load"
        }
      },
      "result": true
    }
  },
  "parameters": [
    {
      "schema": {
        "type": "string",
        "enum": [
          "mean",
          "max",
          "min"
        ]
      },
      "name": "method",
      "description": "Choose a method for the statistic you want to compute."
    }
  ]
}

Is that feasible to implement?

m-mohr commented 2 years ago

PR with a proposal: https://github.com/Open-EO/openeo-processes/pull/307

jdries commented 2 years ago

My current preference goes rather to allowing:

"process_id": { "from_parameter": "method" }

rather than having to do run_process. Both options require backend compatibility, but I don't see one being more breaking than the other. The run_process option has the downside of more complexity IMO.

m-mohr commented 2 years ago

Very biased, but importing or using such process graphs would basically break the Model Builder / Editor and most of the JS tooling. Implementing this feature throughout the JS ecosystem would - I assume right now - take about a week of implementation time, while a new process would be about 2-3 hours of effort on my side. Not sure how difficult it is to implement run_process on the back-end side. Just pointing it out here, not necessarily a strong vote for any of the solutions.

Open-EO / openeo-api

Run process graphs inside process graphs #413