microsoft / FLAML

A fast library for AutoML and tuning. Join our Discord: https://discord.gg/Cppx2vSPVP.
https://microsoft.github.io/FLAML/
MIT License
3.76k stars 495 forks source link

Error in flaml.tune.run #1256

Closed lizhuoq closed 7 months ago

lizhuoq commented 7 months ago

Error

Traceback (most recent call last):

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/experiment.py", line 164, in __init__

    self._run_identifier = Experiment.register_if_needed(run)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/experiment.py", line 353, in register_if_needed

    register_trainable(name, run_object)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/registry.py", line 96, in register_trainable

    _global_registry.register(TRAINABLE_CLASS, name, trainable)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/registry.py", line 180, in register

    self.flush_values()

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/registry.py", line 202, in flush_values

    _internal_kv_put(

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper

    return func(*args, **kwargs)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put

    return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/_private/gcs_utils.py", line 137, in wrapper

    return f(self, *args, **kwargs)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/_private/gcs_utils.py", line 228, in internal_kv_put

    reply = self._kv_stub.InternalKVPut(req)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/grpc/_channel.py", line 946, in __call__

    return _end_unary_response_blocking(state, call, False, None)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking

    raise _InactiveRpcError(state)

grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:

        status = StatusCode.RESOURCE_EXHAUSTED

        details = "Received message larger than max (182421709 vs. 104857600)"

        debug_error_string = "{"created":"@1699947194.101433625","description":"Error received from peer ipv4:192.168.11.28:49081","file":"src/core/lib/surface/call.cc","file_line":1074,"grpc_message":"Received message larger than max (182421709 vs. 104857600)","grpc_status":8}"

>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):

  File "../src/run_LSTM.py", line 398, in <module>

    result = flaml.tune.run(

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/flaml/tune/tune.py", line 623, in run

    analysis = tune.run(

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/tune.py", line 515, in run

    experiments[i] = Experiment(

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/experiment.py", line 167, in __init__

    raise TuneError(

ray.tune.error.TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store. 

Original exception: Traceback (most recent call last):

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/experiment.py", line 164, in __init__

    self._run_identifier = Experiment.register_if_needed(run)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/experiment.py", line 353, in register_if_needed

    register_trainable(name, run_object)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/registry.py", line 96, in register_trainable

    _global_registry.register(TRAINABLE_CLASS, name, trainable)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/registry.py", line 180, in register

    self.flush_values()

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/tune/registry.py", line 202, in flush_values

    _internal_kv_put(

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper

    return func(*args, **kwargs)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/experimental/internal_kv.py", line 88, in _internal_kv_put

    return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/_private/gcs_utils.py", line 137, in wrapper

    return f(self, *args, **kwargs)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/ray/_private/gcs_utils.py", line 228, in internal_kv_put

    reply = self._kv_stub.InternalKVPut(req)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/grpc/_channel.py", line 946, in __call__

    return _end_unary_response_blocking(state, call, False, None)

  File "/data/home/scv7343/.conda/envs/timeSeries/lib/python3.8/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking

    raise _InactiveRpcError(state)

grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:

        status = StatusCode.RESOURCE_EXHAUSTED

        details = "Received message larger than max (182421709 vs. 104857600)"

        debug_error_string = "{"created":"@1699947194.101433625","description":"Error received from peer ipv4:192.168.11.28:49081","file":"src/core/lib/surface/call.cc","file_line":1074,"grpc_message":"Received message larger than max (182421709 vs. 104857600)","grpc_status":8}"

Code

result = flaml.tune.run(
      tune.with_parameters(train),
      config=params,
      metric="loss",
      mode="min",
      low_cost_partial_config={"num_epochs": 1},
      max_resource=max_num_epoch,
      min_resource=1,
      scheduler="asha",  # Use the ASHA scheduler to perform early stopping based on intermediate reported results
      resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
      local_dir=logs_root,
      num_samples=num_samples,
      time_budget_s=time_budget_s,
      use_ray=True
  )

Version