flux-framework / flux-core

core services for the Flux resource management framework
GNU Lesser General Public License v3.0
167 stars 50 forks source link

need a way to force a Flux subinstance to rediscover resources #6418

Open grondo opened 1 day ago

grondo commented 1 day ago

Some modern GPUs have the ability to present one GPU as multiple GPUs, such as AMD compute partition mode.

In order to fully take advantage of this feature, Flux should be able to discover that a single GPU at the system level is now available as multiple (2, 3 or 4) GPUs in the allocation.

The easiest way to accomplish this would be to force the subinstance to rediscover resources with hwloc. This would discover the current state of the job topology instead of reusing the R from the parent. The cached hwloc topology fetched from the parent should also be discarded (or fetching it should be avoided.)

A simple way to force this rediscovery should be provided via the flux alloc and flux batch command line. Perhaps also via some method which could be forced by a prolog (the GPU mode would need to be set by a privileged prolog script, it would be nice if the prolog could enable this mode automatically instead of requiring the user to do it manually)

grondo commented 22 hours ago

Just as a proof of concept, this addition of a resource.rediscover config key seems to work. The question is how we could allow a job prolog that is setting GPU partition mode to enable this config key.

e.g.

$ src/cmd/flux alloc -N4 --conf=resource.rediscover=true flux kvs get resource.eventlog
{"timestamp":1731001918.6667962,"name":"resource-define","context":{"method":"dynamic-discovery"}}
diff --git a/src/modules/resource/inventory.c b/src/modules/resource/inventory.c
index 230de59f0..098cb87f2 100644
--- a/src/modules/resource/inventory.c
+++ b/src/modules/resource/inventory.c
@@ -518,7 +518,9 @@ done:
     return rc;
 }

-static int start_resource_watch (struct inventory *inv, bool no_resource_watch)
+static int start_resource_watch (struct inventory *inv,
+                                 bool no_resource_watch,
+                                 bool rediscover)
 {
     flux_t *h = inv->ctx->h;
     const char *jobid;
@@ -593,7 +595,11 @@ static int start_resource_watch (struct inventory *inv, bool no_resource_watch)
     }
     flux_future_reset (f);
     inv->R_watch_f = f;
-    if (R) { // R = NULL if no conversion possible (fall through to discovery)
+
+    /* If R == NULL (e.g. no conversion possible) or rediscover == true
+     * then fall through to dynamic discovery.
+     */
+    if (!rediscover && R) {
         if (inventory_put (inv, R, "job-info") < 0)
             goto done;
         if (flux_future_then (f,
@@ -898,7 +904,8 @@ void inventory_destroy (struct inventory *inv)

 struct inventory *inventory_create (struct resource_ctx *ctx,
                                     json_t *conf_R,
-                                    bool no_update_watch)
+                                    bool no_update_watch,
+                                    bool rediscover)
 {
     struct inventory *inv;
     json_t *R = NULL;
@@ -915,7 +922,8 @@ struct inventory *inventory_create (struct resource_ctx *ctx,
             goto error;
         if (!inv->R && get_from_kvs (inv, "resource.R") < 0)
             goto error;
-        if (!inv->R && start_resource_watch (inv, no_update_watch) < 0)
+        if (!inv->R
+            && start_resource_watch (inv, no_update_watch, rediscover) < 0)
             goto error;
     }
     else {
diff --git a/src/modules/resource/inventory.h b/src/modules/resource/inventory.h
index db226cb22..39048fe13 100644
--- a/src/modules/resource/inventory.h
+++ b/src/modules/resource/inventory.h
@@ -17,7 +17,8 @@
  */
 struct inventory *inventory_create (struct resource_ctx *ctx,
                                     json_t *R,
-                                    bool no_update_watch);
+                                    bool no_update_watch,
+                                    bool rediscover);
 void inventory_destroy (struct inventory *inv);

 /* Get resource object.
diff --git a/src/modules/resource/resource.c b/src/modules/resource/resource.c
index de70cd893..75f57b6c0 100644
--- a/src/modules/resource/resource.c
+++ b/src/modules/resource/resource.c
@@ -54,6 +54,9 @@
  * norestrict = false
  *   When generating hwloc topology XML, do not restrict to current cpumask
  *
+ * rediscover = false
+ *   Do not fetch R and topology XML from enclosing instance
+ *
  * no-update-watch = false
  *   For testing purposes, simulate missing job-info.update-watch service
  *   in parent instance by sending to an invalid service name.
@@ -65,6 +68,7 @@ static int parse_config (struct resource_ctx *ctx,
                          bool *noverifyp,
                          bool *norestrictp,
                          bool *no_update_watchp,
+                         bool *rediscoverp,
                          flux_error_t *errp)
 {
     flux_error_t error;
@@ -74,12 +78,13 @@ static int parse_config (struct resource_ctx *ctx,
     int noverify = 0;
     int norestrict = 0;
     int no_update_watch = 0;
+    int rediscover = 0;
     json_t *o = NULL;
     json_t *config = NULL;

     if (flux_conf_unpack (conf,
                           &error,
-                          "{s?{s?s s?s s?o s?s s?b s?b s?b !}}",
+                          "{s?{s?s s?s s?o s?s s?b s?b s?b s?b !}}",
                           "resource",
                             "path", &path,
                             "scheduling", &scheduling_path,
@@ -87,7 +92,8 @@ static int parse_config (struct resource_ctx *ctx,
                             "exclude", &exclude,
                             "norestrict", &norestrict,
                             "noverify", &noverify,
-                            "no-update-watch", &no_update_watch) < 0) {
+                            "no-update-watch", &no_update_watch,
+                            "rediscover", &rediscover) < 0) {
         errprintf (errp,
                    "error parsing [resource] configuration: %s",
                    error.text);
@@ -148,6 +154,8 @@ static int parse_config (struct resource_ctx *ctx,
         *norestrictp = norestrict ? true : false;
     if (no_update_watchp)
         *no_update_watchp = no_update_watch ? true : false;
+    if (rediscover)
+        *rediscoverp = rediscover ? true : false;
     if (R)
         *R = o;
     else
@@ -171,7 +179,15 @@ static void config_reload_cb (flux_t *h,

     if (flux_conf_reload_decode (msg, &conf) < 0)
         goto error;
-    if (parse_config (ctx, conf, NULL, NULL, NULL, NULL, NULL, &error) < 0) {
+    if (parse_config (ctx,
+                      conf,
+                      NULL,
+                      NULL,
+                      NULL,
+                      NULL,
+                      NULL,
+                      NULL,
+                      &error) < 0) {
         errstr = error.text;
         goto error;
     }
@@ -338,6 +354,7 @@ int mod_main (flux_t *h, int argc, char **argv)
     bool noverify = false;
     bool norestrict = false;
     bool no_update_watch = false;
+    bool rediscover = false;
     json_t *R_from_config;

     if (!(ctx = resource_ctx_create (h)))
@@ -353,6 +370,7 @@ int mod_main (flux_t *h, int argc, char **argv)
                       &noverify,
                       &norestrict,
                       &no_update_watch,
+                      &rediscover,
                       &error) < 0) {
         flux_log (h, LOG_ERR, "%s", error.text);
         goto error;
@@ -373,7 +391,8 @@ int mod_main (flux_t *h, int argc, char **argv)
      */
     if (!(ctx->inventory = inventory_create (ctx,
                                              R_from_config,
-                                             no_update_watch)))
+                                             no_update_watch,
+                                             rediscover)))
         goto error;
     /*  Done with R_from_config now, so free it.
      */
@@ -408,7 +427,7 @@ int mod_main (flux_t *h, int argc, char **argv)
     /*  topology is initialized after exclude/drain etc since this
      *  rank may attempt to drain itself due to a topology mismatch.
      */
-    if (!(ctx->topology = topo_create (ctx, noverify, norestrict)))
+    if (!(ctx->topology = topo_create (ctx, noverify, norestrict, rediscover)))
         goto error;
     if (!(ctx->monitor = monitor_create (ctx,
                                          inventory_get_size (ctx->inventory),
diff --git a/src/modules/resource/topo.c b/src/modules/resource/topo.c
index 93ca0577e..6c4efe3bc 100644
--- a/src/modules/resource/topo.c
+++ b/src/modules/resource/topo.c
@@ -275,7 +275,9 @@ void topo_destroy (struct topo *topo)
     }
 }

-static char *topo_get_local_xml (struct resource_ctx *ctx, bool no_restrict)
+static char *topo_get_local_xml (struct resource_ctx *ctx,
+                                 bool no_restrict,
+                                 bool rediscover)
 {
     flux_t *parent_h;
     flux_future_t *f = NULL;
@@ -283,7 +285,8 @@ static char *topo_get_local_xml (struct resource_ctx *ctx, bool no_restrict)
     const char *xml;

     errno = 0;
-    if (!(parent_h = resource_parent_handle_open (ctx))
+    if (rediscover
+        || !(parent_h = resource_parent_handle_open (ctx))
         || !(f = flux_rpc (parent_h,
                            "resource.topo-get",
                            NULL,
@@ -321,7 +324,8 @@ out:

 struct topo *topo_create (struct resource_ctx *ctx,
                           bool no_verify,
-                          bool no_restrict)
+                          bool no_restrict,
+                          bool rediscover)
 {
     struct topo *topo;
     json_t *R;
@@ -329,7 +333,7 @@ struct topo *topo_create (struct resource_ctx *ctx,
     if (!(topo = calloc (1, sizeof (*topo))))
         return NULL;
     topo->ctx = ctx;
-    if (!(topo->xml = topo_get_local_xml (ctx, no_restrict))) {
+    if (!(topo->xml = topo_get_local_xml (ctx, no_restrict, rediscover))) {
         flux_log (ctx->h, LOG_ERR, "error loading hwloc topology");
         goto error;
     }
diff --git a/src/modules/resource/topo.h b/src/modules/resource/topo.h
index df28e4482..ca1e297a9 100644
--- a/src/modules/resource/topo.h
+++ b/src/modules/resource/topo.h
@@ -13,7 +13,8 @@

 struct topo *topo_create (struct resource_ctx *ctx,
                           bool no_verify,
-                          bool no_restrict);
+                          bool no_restrict,
+                          bool rediscover);
 void topo_destroy (struct topo *topo);