Open stefan4444 opened 1 year ago
cc @sven1977
I am facing the same issue when trying to resume an interrupted training run using Tuner.restore
results = tuner.fit()
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/tuner.py", line 347, in fit
return self._local_tuner.fit()
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py", line 590, in fit
analysis = self._fit_resume(trainable, param_space)
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/impl/tuner_internal.py", line 738, in _fit_resume
analysis = run(**args)
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/tune.py", line 1036, in run
runner = trial_runner_cls(**runner_kwargs)
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/execution/tune_controller.py", line 149, in __init__
super().__init__(
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/execution/trial_runner.py", line 258, in __init__
self.resume(
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/execution/trial_runner.py", line 506, in resume
trials = self.restore_from_dir()
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/execution/trial_runner.py", line 444, in restore_from_dir
trial = Trial.from_json_state(trial_json_state)
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/experiment/trial.py", line 1136, in from_json_state
trial_state = json.loads(json_state, cls=TuneFunctionDecoder)
File "/scratch/sk10691/conda-envs/main/lib/python3.10/json/__init__.py", line 359, in loads
return cls(**kw).decode(s)
File "/scratch/sk10691/conda-envs/main/lib/python3.10/json/decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/scratch/sk10691/conda-envs/main/lib/python3.10/json/decoder.py", line 353, in raw_decode
obj, end = self.scan_once(s, idx)
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/utils/serialization.py", line 39, in object_hook
return self._from_cloudpickle(obj)
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/tune/utils/serialization.py", line 43, in _from_cloudpickle
return cloudpickle.loads(hex_to_binary(obj["value"]))
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/_private/serialization.py", line 105, in _actor_handle_deserializer
return ray.actor.ActorHandle._deserialization_helper(serialized_obj, outer_id)
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/actor.py", line 1292, in _deserialization_helper
return worker.core_worker.deserialize_and_register_actor_handle(
File "python/ray/_raylet.pyx", line 3503, in ray._raylet.CoreWorker.deserialize_and_register_actor_handle
File "python/ray/_raylet.pyx", line 3472, in ray._raylet.CoreWorker.make_actor_handle
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/_private/function_manager.py", line 574, in load_actor_class
actor_class = self._load_actor_class_from_gcs(
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/_private/function_manager.py", line 669, in _load_actor_class_from_gcs
class_name = ensure_str(class_name)
File "/scratch/sk10691/conda-envs/main/lib/python3.10/site-packages/ray/_private/utils.py", line 239, in ensure_str
assert isinstance(s, bytes)
Same issue here. Don't know what is wrong.
I am facing the same issue. Python 3.7.12, Ray 2.4.0.
Ray generates the following error when creating an RLPredictor using a restored checkpoint in a different Ray session from the one that created the checkpoint. This error does not occur when creating an RLPredictor using a restored checkpoint in the same Ray session that created the checkpoint. Please see repro script below.