intel-analytics / ipex-llm

Accelerate local LLM inference and finetuning (LLaMA, Mistral, ChatGLM, Qwen, Mixtral, Gemma, Phi, MiniCPM, Qwen-VL, MiniCPM-V, etc.) on Intel XPU (e.g., local PC with iGPU and NPU, discrete GPU such as Arc, Flex and Max); seamlessly integrate with llama.cpp, Ollama, HuggingFace, LangChain, LlamaIndex, vLLM, GraphRAG, DeepSpeed, Axolotl, etc
Apache License 2.0

TypeError when run AutoTSEstimator in local server. #8623

Open lalalapotter opened 1 year ago

lalalapotter commented 1 year ago

Users run the following code and get the issue below:

from bigdl.orca import init_orca_context, OrcaContext
init_orca_context(cluster_mode="local", cores=1)

from bigdl.chronos.autots.model.auto_prophet import AutoProphet

# train_data is assumed to be prepared earlier in the user's script.
auto_prophet = AutoProphet()
auto_prophet.fit(data=train_data,
                 cross_validation=False,
                 freq="1D")

print("Training completed.")

import bigdl.orca.automl.hp as hp
from bigdl.chronos.autots import AutoTSEstimator

auto_estimator = AutoTSEstimator(model='lstm',             # the model name used for training
                                 search_space='normal',    # a default hyperparameter search space
                                 past_seq_len=hp.randint(1, 10))
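
For context, the snippet above does not show how train_data and validation_data were prepared. Below is a minimal sketch of the usual Chronos flow (the DataFrame, column names, split ratios and epochs here are illustrative assumptions, not taken from this report):

import pandas as pd
import bigdl.orca.automl.hp as hp
from bigdl.chronos.data import TSDataset
from bigdl.chronos.autots import AutoTSEstimator

# A toy DataFrame standing in for the user's data: one datetime column
# and one target column.
df = pd.DataFrame({
    "datetime": pd.date_range("2020-01-01", periods=100, freq="D"),
    "value": range(100),
})

# Split into train/val/test TSDataset objects.
tsdata_train, tsdata_val, tsdata_test = TSDataset.from_pandas(
    df, dt_col="datetime", target_col="value",
    with_split=True, val_ratio=0.1, test_ratio=0.1)

auto_estimator = AutoTSEstimator(model='lstm',
                                 search_space='normal',
                                 past_seq_len=hp.randint(1, 10))

# fit() performs the hyperparameter search and returns a TSPipeline
# wrapping the best model found.
ts_pipeline = auto_estimator.fit(data=tsdata_train,
                                 validation_data=tsdata_val,
                                 epochs=1)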

Error log:

2023-07-26 11:24:36,254    WARNING callback.py:144 -- The TensorboardX logger cannot be instantiated because either TensorboardX or one of it's dependencies is not installed. Please make sure you have the latest version of TensorboardX installed: `pip install -U tensorboardx`
Tune Status
Current time:    2023-07-26 11:24:39
Running for:    00:00:03.66
Memory:    12.4/251.8 GiB
System Info
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/10 CPUs, 0/0 GPUs
Messages
Number of errored trials: 1
Trial name    # failures    error file
train_func_00791_00000    1    /tmp/autots_estimator/autots_estimator/train_func_00791_00000/error.txt
Trial Status
Trial name    status    loc
train_func_00791_00000    ERROR    10.0.2.124:1230344
(train_func pid=1230344) Global seed set to 2809885085
2023-07-26 11:24:39,912    ERROR tune_controller.py:873 -- Trial task failed for trial train_func_00791_00000
Traceback (most recent call last):
  File "/home/siawchen/.local/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/siawchen/.local/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 18, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/siawchen/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/siawchen/.local/lib/python3.10/site-packages/ray/_private/worker.py", line 2540, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TypeError): ray::ImplicitFunc.train() (pid=1230344, ip=10.0.2.124, actor_id=af090472efc8d7fb9c4e401401000000, repr=train_func)
  File "/home/siawchen/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 389, in train
    raise skipped from exception_cause(skipped)
  File "/home/siawchen/.local/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 336, in entrypoint
    return self._trainable_func(
  File "/home/siawchen/.local/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 653, in _trainable_func
    output = fn()
  File "/home/siawchen/.local/lib/python3.10/site-packages/bigdl/orca/automl/search/ray_tune/ray_tune_search_engine.py", line 339, in train_func
    trial_model = model_builder.build(config)
  File "/home/siawchen/.local/lib/python3.10/site-packages/bigdl/orca/automl/model/base_pytorch_model.py", line 412, in build
    model.build(config)
  File "/home/siawchen/.local/lib/python3.10/site-packages/bigdl/orca/automl/model/base_pytorch_model.py", line 81, in build
    self.model = self.model_creator(config)
  File "/home/siawchen/.local/lib/python3.10/site-packages/bigdl/chronos/model/VanillaLSTM_pytorch.py", line 74, in model_creator
    model = LSTMModel(input_dim=config["input_feature_num"],
  File "/home/siawchen/.local/lib/python3.10/site-packages/bigdl/chronos/model/VanillaLSTM_pytorch.py", line 33, in __init__
    lstm_list.append(nn.LSTM(input_dim, self.hidden_dim[layer],
  File "/home/siawchen/.local/lib/python3.10/site-packages/torch/nn/modules/rnn.py", line 678, in __init__
    super(LSTM, self).__init__('LSTM', *args, **kwargs)
  File "/home/siawchen/.local/lib/python3.10/site-packages/torch/nn/modules/rnn.py", line 94, in __init__
    w_ih = Parameter(torch.empty((gate_size, layer_input_size), **factory_kwargs))
TypeError: empty(): argument 'size' must be tuple of SymInts, but found element of type NoneType at pos 2
---------------------------------------------------------------------------
TuneError                                 Traceback (most recent call last)
Cell In[20], line 1
----> 1 tsppl = autoest.fit(data=train_data,
      2                     validation_data=validation_data)

File ~/.local/lib/python3.10/site-packages/bigdl/chronos/autots/autotsestimator.py:274, in AutoTSEstimator.fit(self, data, epochs, batch_size, validation_data, metric_threshold, n_sampling, search_alg, search_alg_params, scheduler, scheduler_params)
    258     self.model.fit(
    259         data=train_d,
    260         epochs=epochs,
   (...)
    270         scheduler_params=scheduler_params,
    271     )
    273 if not is_third_party_model:
--> 274     self.model.fit(
    275         data=train_d,
    276         epochs=epochs,
    277         batch_size=batch_size,
    278         validation_data=val_d,
    279         metric_threshold=metric_threshold,
    280         n_sampling=n_sampling,
    281         search_alg=search_alg,
    282         search_alg_params=search_alg_params,
    283         scheduler=scheduler,
    284         scheduler_params=scheduler_params
    285     )
    287 if self.backend == "torch":
    288     from bigdl.chronos.autots.tspipeline import TSPipeline

File ~/.local/lib/python3.10/site-packages/bigdl/chronos/autots/model/base_automodel.py:88, in BaseAutomodel.fit(self, data, epochs, batch_size, validation_data, metric_threshold, n_sampling, search_alg, search_alg_params, scheduler, scheduler_params)
     85 self.search_space["batch_size"] = batch_size
     86 n_sampling = recalculate_n_sampling(self.search_space,
     87                                     n_sampling) if n_sampling != -1 else -1
---> 88 self.auto_est.fit(
     89     data=data,
     90     epochs=epochs,
     91     validation_data=validation_data,
     92     metric=self.metric,
     93     metric_mode=self.metric_mode,
     94     metric_threshold=metric_threshold,
     95     n_sampling=n_sampling,
     96     search_space=self.search_space,
     97     search_alg=search_alg,
     98     search_alg_params=search_alg_params,
     99     scheduler=scheduler,
    100     scheduler_params=scheduler_params,
    101 )
    102 self.best_model = self.auto_est._get_best_automl_model()
    103 self.best_config = self.auto_est.get_best_config()

File ~/.local/lib/python3.10/site-packages/bigdl/orca/automl/auto_estimator.py:217, in AutoEstimator.fit(self, data, epochs, validation_data, metric, metric_mode, metric_threshold, n_sampling, search_space, search_alg, search_alg_params, scheduler, scheduler_params, feature_cols, label_cols)
    197 feature_cols, label_cols = AutoEstimator._check_spark_dataframe_input(data,
    198                                                                       validation_data,
    199                                                                       feature_cols,
    200                                                                       label_cols)
    202 self.searcher.compile(data=data,
    203                       model_builder=self.model_builder,
    204                       epochs=epochs,
   (...)
    215                       feature_cols=feature_cols,
    216                       label_cols=label_cols)
--> 217 self.searcher.run()
    218 self._fitted = True

File ~/.local/lib/python3.10/site-packages/bigdl/orca/automl/search/ray_tune/ray_tune_search_engine.py:185, in RayTuneSearchEngine.run(self)
    183 metric = self.metric_name if not self._scheduler else None
    184 mode = self.mode if not self._scheduler else None
--> 185 analysis = tune.run(
    186     self.train_func,
    187     local_dir=self.logs_dir,
    188     metric=metric,
    189     mode=mode,
    190     name=self.name,
    191     stop=self.stopper,
    192     config=self.search_space,
    193     search_alg=self._search_alg,
    194     num_samples=self.num_samples,
    195     trial_dirname_creator=trial_dirname_creator,
    196     callbacks=[CustomProgressCallback()],
    197     scheduler=self._scheduler,
    198     resources_per_trial=self.resources_per_trial,
    199     verbose=3,
    200     reuse_actors=True
    201 )
    202 self.trials = analysis.trials
    204 # Visualization code for ray (leaderboard)

File ~/.local/lib/python3.10/site-packages/ray/tune/tune.py:1105, in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, storage_path, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, checkpoint_keep_all_ranks, checkpoint_upload_from_workers, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, chdir_to_trial_dir, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, raise_on_failed_trial, callbacks, max_concurrent_trials, trial_executor, local_dir, _experiment_checkpoint_dir, _remote, _remote_string_queue, _entrypoint)
   1103 if incomplete_trials:
   1104     if raise_on_failed_trial and not experiment_interrupted_event.is_set():
-> 1105         raise TuneError("Trials did not complete", incomplete_trials)
   1106     else:
   1107         logger.error("Trials did not complete: %s", incomplete_trials)

TuneError: ('Trials did not complete', [train_func_00791_00000])
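
For reference, the TypeError at the bottom of the Ray traceback comes from nn.LSTM being constructed with None as its input size, which suggests config["input_feature_num"] (or a related dimension) resolved to None inside VanillaLSTM_pytorch. A minimal sketch that hits the same failure path on the PyTorch build shown above (the exact message wording may vary across versions):

import torch.nn as nn

# Passing None as input_size reaches torch.empty((gate_size, None)), which
# raises the same "argument 'size' must be tuple of SymInts ... NoneType" error.
nn.LSTM(None, 32)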
plusbang commented 1 year ago

As described in the documentation, we currently support Chronos on Python 3.7 ~ 3.9, but according to the logs the environment is Python 3.10. Could they try running on Python 3.9 and see whether the issue still exists?
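
A quick way to catch an unsupported interpreter before Spark/Ray start is to assert the version in the driver script; a small sketch (the 3.7 ~ 3.9 range is the one mentioned above):

import sys

# Chronos currently supports Python 3.7 ~ 3.9, so fail fast if the driver
# interpreter falls outside that range.
assert (3, 7) <= sys.version_info[:2] <= (3, 9), (
    "Unsupported Python %d.%d for bigdl-chronos; please use Python 3.7-3.9"
    % sys.version_info[:2])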

Madhustat commented 1 year ago

Hi, thank you for the update. I have downgraded and am now running on my PRC server, but I am seeing the error below:


2023-07-26 22:39:59 WARN ClientServerConnection:200 - Error occurred while waiting for a command.
py4j.Py4JException: Unknown command received: null
    at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:187)
    at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
    at java.base/java.lang.Thread.run(Thread.java:829)
2023-07-26 22:40:00,985 ERROR services.py:1207 -- Failed to start the dashboard , return code 1
2023-07-26 22:40:00,987 ERROR services.py:1232 -- Error should be written to 'dashboard.log' or 'dashboard.err'. We are printing the last 20 lines for you. See 'https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure' to find where the log file is.
2023-07-26 22:40:00,989 ERROR services.py:1242 -- Couldn't read dashboard.log file. Error: [Errno 2] No such file or directory: '/tmp/ray/session_2023-07-26_22-39-59_815545_1359886/logs/dashboard.log'. It means the dashboard is broken even before it initializes the logger (mostly dependency issues). Reading the dashboard.err file which contains stdout/stderr.

Exception Traceback (most recent call last) [... skipping hidden 1 frame]

Cell In[10], line 2 1 from bigdl.chronos.autots.model.auto_prophet import AutoProphet ----> 2 auto_prophet = AutoProphet() 3 auto_prophet.fit(data=train_data, 4 cross_validation=True, 5 freq="1D")

File ~/.local/lib/python3.9/site-packages/bigdl/chronos/autots/model/auto_prophet.py:112, in AutoProphet.init(self, changepoint_prior_scale, seasonality_prior_scale, holidays_prior_scale, seasonality_mode, changepoint_range, metric, metric_mode, logs_dir, cpus_per_trial, name, remote_dir, load_dir, **prophet_config) 111 model_builder = ProphetBuilder() --> 112 self.auto_est = AutoEstimator(model_builder=model_builder, 113 logs_dir=logs_dir, 114 resources_per_trial={"cpu": cpus_per_trial}, 115 remote_dir=remote_dir, 116 name=name) 117 except ImportError:

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/auto_estimator.py:53, in AutoEstimator.init(self, model_builder, logs_dir, resources_per_trial, remote_dir, name) 52 self.model_builder = model_builder ---> 53 self.searcher = SearchEngineFactory.create_engine( 54 backend="ray", 55 logs_dir=logs_dir, 56 resources_per_trial=resources_per_trial, 57 remote_dir=remote_dir, 58 name=name) 59 self._fitted = False

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/search/init.py:25, in SearchEngineFactory.create_engine(backend, *args, *kwargs) 24 from bigdl.orca.automl.search.ray_tune import RayTuneSearchEngine ---> 25 return RayTuneSearchEngine(args, **kwargs)

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/search/ray_tune/ray_tune_search_engine.py:53, in RayTuneSearchEngine.init(self, logs_dir, resources_per_trial, name, remote_dir) 52 self.name = name ---> 53 self.remote_dir = remote_dir or RayTuneSearchEngine.get_default_remote_dir(name) 54 self.logs_dir = os.path.abspath(os.path.expanduser(logs_dir))

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/search/ray_tune/ray_tune_search_engine.py:60, in RayTuneSearchEngine.get_default_remote_dir(name) 59 from bigdl.orca.automl.search.utils import process ---> 60 ray_ctx = OrcaRayContext.get() 61 if ray_ctx.is_local:

File ~/.local/lib/python3.9/site-packages/bigdl/orca/ray/raycontext.py:103, in OrcaRayContext.get(cls, initialize) 102 if initialize and not ray_ctx.initialized: --> 103 ray_ctx.init() 104 return ray_ctx

File ~/.local/lib/python3.9/site-packages/bigdl/orca/ray/raycontext.py:77, in OrcaRayContext.init(self, driver_cores) 76 else: ---> 77 results = self._ray_on_spark_context.init(driver_cores=driver_cores) 78 self.num_ray_nodes = self._ray_on_spark_context.num_ray_nodes # type: ignore

File ~/.local/lib/python3.9/site-packages/bigdl/orca/ray/ray_on_spark_context.py:601, in RayOnSparkContext.init(self, driver_cores) 600 init_params.update(kwargs) --> 601 self._address_info = ray.init(**init_params) 602 else:

File ~/.local/lib/python3.9/site-packages/ray/_private/client_mode_hook.py:103, in client_mode_hook..wrapper(*args, kwargs) 102 return getattr(ray, func.name)(*args, *kwargs) --> 103 return func(args, kwargs)

File ~/.local/lib/python3.9/site-packages/ray/_private/worker.py:1514, in init(address, num_cpus, num_gpus, resources, labels, object_store_memory, local_mode, ignore_reinit_error, include_dashboard, dashboard_host, dashboard_port, job_config, configure_logging, logging_level, logging_format, log_to_driver, namespace, runtime_env, storage, **kwargs) 1510 # Start the Ray processes. We set shutdown_at_exit=False because we 1511 # shutdown the node in the ray.shutdown call that happens in the atexit 1512 # handler. We still spawn a reaper process in case the atexit handler 1513 # isn't called. -> 1514 _global_node = ray._private.node.Node( 1515 head=True, 1516 shutdown_at_exit=False, 1517 spawn_reaper=True, 1518 ray_params=ray_params, 1519 ) 1520 else: 1521 # In this case, we are connecting to an existing cluster.

File ~/.local/lib/python3.9/site-packages/ray/_private/node.py:287, in Node.init(self, ray_params, head, shutdown_at_exit, spawn_reaper, connect_only, default_worker) 286 if head: --> 287 self.start_head_processes() 289 if not connect_only:

File ~/.local/lib/python3.9/site-packages/ray/_private/node.py:1181, in Node.start_head_processes(self) 1179 raise_on_api_server_failure = True -> 1181 self.start_api_server( 1182 include_dashboard=include_dashboard, 1183 raise_on_failure=raise_on_api_server_failure, 1184 )

File ~/.local/lib/python3.9/site-packages/ray/_private/node.py:931, in Node.start_api_server(self, include_dashboard, raise_onfailure) 928 , stderr_file = self.get_log_file_handles( 929 "dashboard", unique=True, create_out=False 930 ) --> 931 self._webui_url, process_info = ray._private.services.start_api_server( 932 include_dashboard, 933 raise_on_failure, 934 self._ray_params.dashboard_host, 935 self.gcs_address, 936 self._node_ip_address, 937 self._temp_dir, 938 self._logs_dir, 939 self._session_dir, 940 port=self._ray_params.dashboard_port, 941 dashboard_grpc_port=self._ray_params.dashboard_grpc_port, 942 fate_share=self.kernel_fate_share, 943 max_bytes=self.max_bytes, 944 backup_count=self.backup_count, 945 redirect_logging=self.should_redirect_logs(), 946 stdout_file=stderr_file, 947 stderr_file=stderr_file, 948 ) 949 assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes

File ~/.local/lib/python3.9/site-packages/ray/_private/services.py:1274, in start_api_server(include_dashboard, raise_on_failure, host, gcs_address, node_ip_address, temp_dir, logdir, session_dir, port, dashboard_grpc_port, fate_share, max_bytes, backup_count, redirect_logging, stdout_file, stderr_file) 1273 if raise_on_failure: -> 1274 raise e from e 1275 else:

File ~/.local/lib/python3.9/site-packages/ray/_private/services.py:1262, in start_api_server(include_dashboard, raise_on_failure, host, gcs_address, node_ip_address, temp_dir, logdir, session_dir, port, dashboard_grpc_port, fate_share, max_bytes, backup_count, redirect_logging, stdout_file, stderr_file) 1261 last_log_str = "\n" + "\n".join(reversed(lines[-lines_to_read:])) -> 1262 raise Exception(last_log_str) 1263 else: 1264 # Is it reachable?

Exception: The last 20 lines of /tmp/ray/session_2023-07-26_22-39-59_815545_1359886/logs/dashboard.err (it contains the error message from the dashboard):
  import site = 1
  sys._base_executable = '/usr/bin/python3'
  sys.base_prefix = '/usr'
  sys.base_exec_prefix = '/usr'
  sys.platlibdir = 'lib'
  sys.executable = '/usr/bin/python3'
  sys.prefix = '/usr'
  sys.exec_prefix = '/usr'
  sys.path = [
    '/home/siawchen/Downloads/spark-3.2.2-bin-hadoop3.2/python/lib/pyspark.zip',
    '/home/siawchen/Downloads/spark-3.2.2-bin-hadoop3.2/python/lib/py4j-0.10.9.5-src.zip',
    '/usr/lib/python39.zip',
    '/usr/lib/python3.9',
    '/usr/lib/python3.9/lib-dynload',
  ]
  Fatal Python error: init_fs_encoding: failed to get the Python codec of the filesystem encoding
  Python runtime state: core initialized
  ModuleNotFoundError: No module named 'encodings'

Current thread 0x00007f424a5a3740 (most recent call first):

The above exception was the direct cause of the following exception:

Exception Traceback (most recent call last) [... skipping hidden 1 frame]

Cell In[10], line 2 1 from bigdl.chronos.autots.model.auto_prophet import AutoProphet ----> 2 auto_prophet = AutoProphet() 3 auto_prophet.fit(data=train_data, 4 cross_validation=True, 5 freq="1D")

File ~/.local/lib/python3.9/site-packages/bigdl/chronos/autots/model/auto_prophet.py:112, in AutoProphet.init(self, changepoint_prior_scale, seasonality_prior_scale, holidays_prior_scale, seasonality_mode, changepoint_range, metric, metric_mode, logs_dir, cpus_per_trial, name, remote_dir, load_dir, **prophet_config) 111 model_builder = ProphetBuilder() --> 112 self.auto_est = AutoEstimator(model_builder=model_builder, 113 logs_dir=logs_dir, 114 resources_per_trial={"cpu": cpus_per_trial}, 115 remote_dir=remote_dir, 116 name=name) 117 except ImportError:

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/auto_estimator.py:53, in AutoEstimator.init(self, model_builder, logs_dir, resources_per_trial, remote_dir, name) 52 self.model_builder = model_builder ---> 53 self.searcher = SearchEngineFactory.create_engine( 54 backend="ray", 55 logs_dir=logs_dir, 56 resources_per_trial=resources_per_trial, 57 remote_dir=remote_dir, 58 name=name) 59 self._fitted = False

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/search/init.py:25, in SearchEngineFactory.create_engine(backend, *args, *kwargs) 24 from bigdl.orca.automl.search.ray_tune import RayTuneSearchEngine ---> 25 return RayTuneSearchEngine(args, **kwargs)

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/search/ray_tune/ray_tune_search_engine.py:53, in RayTuneSearchEngine.init(self, logs_dir, resources_per_trial, name, remote_dir) 52 self.name = name ---> 53 self.remote_dir = remote_dir or RayTuneSearchEngine.get_default_remote_dir(name) 54 self.logs_dir = os.path.abspath(os.path.expanduser(logs_dir))

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/search/ray_tune/ray_tune_search_engine.py:60, in RayTuneSearchEngine.get_default_remote_dir(name) 59 from bigdl.orca.automl.search.utils import process ---> 60 ray_ctx = OrcaRayContext.get() 61 if ray_ctx.is_local:

File ~/.local/lib/python3.9/site-packages/bigdl/orca/ray/raycontext.py:103, in OrcaRayContext.get(cls, initialize) 102 if initialize and not ray_ctx.initialized: --> 103 ray_ctx.init() 104 return ray_ctx

File ~/.local/lib/python3.9/site-packages/bigdl/orca/ray/raycontext.py:77, in OrcaRayContext.init(self, driver_cores) 76 else: ---> 77 results = self._ray_on_spark_context.init(driver_cores=driver_cores) 78 self.num_ray_nodes = self._ray_on_spark_context.num_ray_nodes # type: ignore

File ~/.local/lib/python3.9/site-packages/bigdl/orca/ray/ray_on_spark_context.py:601, in RayOnSparkContext.init(self, driver_cores) 600 init_params.update(kwargs) --> 601 self._address_info = ray.init(**init_params) 602 else:

File ~/.local/lib/python3.9/site-packages/ray/_private/client_mode_hook.py:103, in client_mode_hook..wrapper(*args, kwargs) 102 return getattr(ray, func.name)(*args, *kwargs) --> 103 return func(args, kwargs)

File ~/.local/lib/python3.9/site-packages/ray/_private/worker.py:1514, in init(address, num_cpus, num_gpus, resources, labels, object_store_memory, local_mode, ignore_reinit_error, include_dashboard, dashboard_host, dashboard_port, job_config, configure_logging, logging_level, logging_format, log_to_driver, namespace, runtime_env, storage, **kwargs) 1510 # Start the Ray processes. We set shutdown_at_exit=False because we 1511 # shutdown the node in the ray.shutdown call that happens in the atexit 1512 # handler. We still spawn a reaper process in case the atexit handler 1513 # isn't called. -> 1514 _global_node = ray._private.node.Node( 1515 head=True, 1516 shutdown_at_exit=False, 1517 spawn_reaper=True, 1518 ray_params=ray_params, 1519 ) 1520 else: 1521 # In this case, we are connecting to an existing cluster.

File ~/.local/lib/python3.9/site-packages/ray/_private/node.py:287, in Node.init(self, ray_params, head, shutdown_at_exit, spawn_reaper, connect_only, default_worker) 286 if head: --> 287 self.start_head_processes() 289 if not connect_only:

File ~/.local/lib/python3.9/site-packages/ray/_private/node.py:1181, in Node.start_head_processes(self) 1179 raise_on_api_server_failure = True -> 1181 self.start_api_server( 1182 include_dashboard=include_dashboard, 1183 raise_on_failure=raise_on_api_server_failure, 1184 )

File ~/.local/lib/python3.9/site-packages/ray/_private/node.py:931, in Node.start_api_server(self, include_dashboard, raise_onfailure) 928 , stderr_file = self.get_log_file_handles( 929 "dashboard", unique=True, create_out=False 930 ) --> 931 self._webui_url, process_info = ray._private.services.start_api_server( 932 include_dashboard, 933 raise_on_failure, 934 self._ray_params.dashboard_host, 935 self.gcs_address, 936 self._node_ip_address, 937 self._temp_dir, 938 self._logs_dir, 939 self._session_dir, 940 port=self._ray_params.dashboard_port, 941 dashboard_grpc_port=self._ray_params.dashboard_grpc_port, 942 fate_share=self.kernel_fate_share, 943 max_bytes=self.max_bytes, 944 backup_count=self.backup_count, 945 redirect_logging=self.should_redirect_logs(), 946 stdout_file=stderr_file, 947 stderr_file=stderr_file, 948 ) 949 assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes

File ~/.local/lib/python3.9/site-packages/ray/_private/services.py:1274, in start_api_server(include_dashboard, raise_on_failure, host, gcs_address, node_ip_address, temp_dir, logdir, session_dir, port, dashboard_grpc_port, fate_share, max_bytes, backup_count, redirect_logging, stdout_file, stderr_file) 1273 if raise_on_failure: -> 1274 raise e from e 1275 else:

File ~/.local/lib/python3.9/site-packages/ray/_private/services.py:1262, in start_api_server(include_dashboard, raise_on_failure, host, gcs_address, node_ip_address, temp_dir, logdir, session_dir, port, dashboard_grpc_port, fate_share, max_bytes, backup_count, redirect_logging, stdout_file, stderr_file) 1261 last_log_str = "\n" + "\n".join(reversed(lines[-lines_to_read:])) -> 1262 raise Exception(last_log_str) 1263 else: 1264 # Is it reachable?

Exception: The last 20 lines of /tmp/ray/session_2023-07-26_22-39-59_815545_1359886/logs/dashboard.err (it contains the error message from the dashboard):
  import site = 1
  sys._base_executable = '/usr/bin/python3'
  sys.base_prefix = '/usr'
  sys.base_exec_prefix = '/usr'
  sys.platlibdir = 'lib'
  sys.executable = '/usr/bin/python3'
  sys.prefix = '/usr'
  sys.exec_prefix = '/usr'
  sys.path = [
    '/home/siawchen/Downloads/spark-3.2.2-bin-hadoop3.2/python/lib/pyspark.zip',
    '/home/siawchen/Downloads/spark-3.2.2-bin-hadoop3.2/python/lib/py4j-0.10.9.5-src.zip',
    '/usr/lib/python39.zip',
    '/usr/lib/python3.9',
    '/usr/lib/python3.9/lib-dynload',
  ]
  Fatal Python error: init_fs_encoding: failed to get the Python codec of the filesystem encoding
  Python runtime state: core initialized
  ModuleNotFoundError: No module named 'encodings'

Current thread 0x00007f424a5a3740 (most recent call first):

The above exception was the direct cause of the following exception:

Exception Traceback (most recent call last) Cell In[10], line 2 1 from bigdl.chronos.autots.model.auto_prophet import AutoProphet ----> 2 auto_prophet = AutoProphet() 3 auto_prophet.fit(data=train_data, 4 cross_validation=True, 5 freq="1D") 6 print("Training completed.")

File ~/.local/lib/python3.9/site-packages/bigdl/chronos/autots/model/auto_prophet.py:112, in AutoProphet.init(self, changepoint_prior_scale, seasonality_prior_scale, holidays_prior_scale, seasonality_mode, changepoint_range, metric, metric_mode, logs_dir, cpus_per_trial, name, remote_dir, load_dir, **prophet_config) 110 self.metric_mode = metric_mode 111 model_builder = ProphetBuilder() --> 112 self.auto_est = AutoEstimator(model_builder=model_builder, 113 logs_dir=logs_dir, 114 resources_per_trial={"cpu": cpus_per_trial}, 115 remote_dir=remote_dir, 116 name=name) 117 except ImportError: 118 warnings.warn("You need to install bigdl-orca[automl] to use fit function.")

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/auto_estimator.py:53, in AutoEstimator.init(self, model_builder, logs_dir, resources_per_trial, remote_dir, name) 46 def init(self, 47 model_builder: "ModelBuilder", 48 logs_dir: str="/tmp/auto_estimator_logs", 49 resources_per_trial: Optional[Dict[str, int]]=None, 50 remote_dir: Optional[str]=None, 51 name: Optional[str]=None) -> None: 52 self.model_builder = model_builder ---> 53 self.searcher = SearchEngineFactory.create_engine( 54 backend="ray", 55 logs_dir=logs_dir, 56 resources_per_trial=resources_per_trial, 57 remote_dir=remote_dir, 58 name=name) 59 self._fitted = False 60 self.best_trial = None

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/search/init.py:25, in SearchEngineFactory.create_engine(backend, *args, *kwargs) 23 if backend == "ray": 24 from bigdl.orca.automl.search.ray_tune import RayTuneSearchEngine ---> 25 return RayTuneSearchEngine(args, **kwargs)

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/search/ray_tune/ray_tune_search_engine.py:53, in RayTuneSearchEngine.init(self, logs_dir, resources_per_trial, name, remote_dir) 51 self.trials = None 52 self.name = name ---> 53 self.remote_dir = remote_dir or RayTuneSearchEngine.get_default_remote_dir(name) 54 self.logs_dir = os.path.abspath(os.path.expanduser(logs_dir))

File ~/.local/lib/python3.9/site-packages/bigdl/orca/automl/search/ray_tune/ray_tune_search_engine.py:60, in RayTuneSearchEngine.get_default_remote_dir(name) 58 from bigdl.orca.ray import OrcaRayContext 59 from bigdl.orca.automl.search.utils import process ---> 60 ray_ctx = OrcaRayContext.get() 61 if ray_ctx.is_local: 62 return None

File ~/.local/lib/python3.9/site-packages/bigdl/orca/ray/raycontext.py:103, in OrcaRayContext.get(cls, initialize) 101 ray_ctx = OrcaRayContext._active_ray_context 102 if initialize and not ray_ctx.initialized: --> 103 ray_ctx.init() 104 return ray_ctx 105 else:

File ~/.local/lib/python3.9/site-packages/bigdl/orca/ray/raycontext.py:77, in OrcaRayContext.init(self, driver_cores) 75 results = ray.init(**self.ray_args) 76 else: ---> 77 results = self._ray_on_spark_context.init(driver_cores=driver_cores) 78 self.num_ray_nodes = self._ray_on_spark_context.num_ray_nodes # type: ignore 79 self.ray_node_cpu_cores = self._ray_on_spark_context.ray_node_cpu_cores # type: ignore

File ~/.local/lib/python3.9/site-packages/bigdl/orca/ray/ray_on_spark_context.py:601, in RayOnSparkContext.init(self, driver_cores) 599 init_params["_redis_password"] = self.redis_password 600 init_params.update(kwargs) --> 601 self._address_info = ray.init(**init_params) 602 else: 603 self.cluster_ips = self._gather_cluster_ips()

File ~/.local/lib/python3.9/site-packages/ray/_private/client_mode_hook.py:103, in client_mode_hook..wrapper(*args, kwargs) 101 if func.name != "init" or is_client_mode_enabled_by_default: 102 return getattr(ray, func.name)(*args, *kwargs) --> 103 return func(args, kwargs)

File ~/.local/lib/python3.9/site-packages/ray/_private/worker.py:1514, in init(address, num_cpus, num_gpus, resources, labels, object_store_memory, local_mode, ignore_reinit_error, include_dashboard, dashboard_host, dashboard_port, job_config, configure_logging, logging_level, logging_format, log_to_driver, namespace, runtime_env, storage, **kwargs) 1480 ray_params = ray._private.parameter.RayParams( 1481 node_ip_address=node_ip_address, 1482 raylet_ip_address=raylet_ip_address, (...) 1508 node_name=_node_name, 1509 ) 1510 # Start the Ray processes. We set shutdown_at_exit=False because we 1511 # shutdown the node in the ray.shutdown call that happens in the atexit 1512 # handler. We still spawn a reaper process in case the atexit handler 1513 # isn't called. -> 1514 _global_node = ray._private.node.Node( 1515 head=True, 1516 shutdown_at_exit=False, 1517 spawn_reaper=True, 1518 ray_params=ray_params, 1519 ) 1520 else: 1521 # In this case, we are connecting to an existing cluster. 1522 if num_cpus is not None or num_gpus is not None:

File ~/.local/lib/python3.9/site-packages/ray/_private/node.py:287, in Node.init(self, ray_params, head, shutdown_at_exit, spawn_reaper, connect_only, default_worker) 285 # Start processes. 286 if head: --> 287 self.start_head_processes() 289 if not connect_only: 290 self.start_ray_processes()

File ~/.local/lib/python3.9/site-packages/ray/_private/node.py:1181, in Node.start_head_processes(self) 1178 include_dashboard = True 1179 raise_on_api_server_failure = True -> 1181 self.start_api_server( 1182 include_dashboard=include_dashboard, 1183 raise_on_failure=raise_on_api_server_failure, 1184 )

File ~/.local/lib/python3.9/site-packages/ray/_private/node.py:931, in Node.start_api_server(self, include_dashboard, raise_onfailure) 926 # Only redirect logs to .err. .err file is only useful when the 927 # component has an unexpected output to stdout/stderr. 928 , stderr_file = self.get_log_file_handles( 929 "dashboard", unique=True, create_out=False 930 ) --> 931 self._webui_url, process_info = ray._private.services.start_api_server( 932 include_dashboard, 933 raise_on_failure, 934 self._ray_params.dashboard_host, 935 self.gcs_address, 936 self._node_ip_address, 937 self._temp_dir, 938 self._logs_dir, 939 self._session_dir, 940 port=self._ray_params.dashboard_port, 941 dashboard_grpc_port=self._ray_params.dashboard_grpc_port, 942 fate_share=self.kernel_fate_share, 943 max_bytes=self.max_bytes, 944 backup_count=self.backup_count, 945 redirect_logging=self.should_redirect_logs(), 946 stdout_file=stderr_file, 947 stderr_file=stderr_file, 948 ) 949 assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes 950 if process_info is not None:

File ~/.local/lib/python3.9/site-packages/ray/_private/services.py:1274, in start_api_server(include_dashboard, raise_on_failure, host, gcs_address, node_ip_address, temp_dir, logdir, session_dir, port, dashboard_grpc_port, fate_share, max_bytes, backup_count, redirect_logging, stdout_file, stderr_file) 1272 except Exception as e: 1273 if raise_on_failure: -> 1274 raise e from e 1275 else: 1276 logger.error(e)

File ~/.local/lib/python3.9/site-packages/ray/_private/services.py:1262, in start_api_server(include_dashboard, raise_on_failure, host, gcs_address, node_ip_address, temp_dir, logdir, session_dir, port, dashboard_grpc_port, fate_share, max_bytes, backup_count, redirect_logging, stdout_file, stderr_file) 1255 raise Exception( 1256 f"Failed to read dashboard.err file: {e}. " 1257 "It is unexpected. Please report an issue to " 1258 "Ray github. " 1259 "https://github.com/ray-project/ray/issues" 1260 ) 1261 last_log_str = "\n" + "\n".join(reversed(lines[-lines_to_read:])) -> 1262 raise Exception(last_log_str) 1263 else: 1264 # Is it reachable? 1265 raise Exception("Failed to start a dashboard.")

Exception: The last 20 lines of /tmp/ray/session_2023-07-26_22-39-59_815545_1359886/logs/dashboard.err (it contains the error message from the dashboard):
  import site = 1
  sys._base_executable = '/usr/bin/python3'
  sys.base_prefix = '/usr'
  sys.base_exec_prefix = '/usr'
  sys.platlibdir = 'lib'
  sys.executable = '/usr/bin/python3'
  sys.prefix = '/usr'
  sys.exec_prefix = '/usr'
  sys.path = [
    '/home/siawchen/Downloads/spark-3.2.2-bin-hadoop3.2/python/lib/pyspark.zip',
    '/home/siawchen/Downloads/spark-3.2.2-bin-hadoop3.2/python/lib/py4j-0.10.9.5-src.zip',
    '/usr/lib/python39.zip',
    '/usr/lib/python3.9',
    '/usr/lib/python3.9/lib-dynload',
  ]
  Fatal Python error: init_fs_encoding: failed to get the Python codec of the filesystem encoding
  Python runtime state: core initialized
  ModuleNotFoundError: No module named 'encodings'

Current thread 0x00007f424a5a3740 (most recent call first):

lalalapotter commented 1 year ago

Hi @Madhustat,

The error, "ModuleNotFoundError: No module named 'encodings'", occurs for multiple reasons, mainly because of python environment, please check:

After checking the above items, if the issue still exists, please let us know and we will help you solve it ASAP.
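
As a starting point (a hedged sketch, not an exhaustive checklist), printing the interpreter and the Spark-related Python settings on the server helps confirm that everything points to the same Python 3.9 environment:

import os
import sys

# A mismatch between the driver interpreter and the Python that Spark/Ray
# workers pick up (e.g. /usr/bin/python3 with a broken PYTHONHOME or
# PYTHONPATH) is a common cause of "No module named 'encodings'".
print("sys.executable        :", sys.executable)
print("sys.version           :", sys.version.split()[0])
print("PYSPARK_PYTHON        :", os.environ.get("PYSPARK_PYTHON"))
print("PYSPARK_DRIVER_PYTHON :", os.environ.get("PYSPARK_DRIVER_PYTHON"))
print("PYTHONHOME            :", os.environ.get("PYTHONHOME"))
print("PYTHONPATH            :", os.environ.get("PYTHONPATH"))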