I am trying to get the ax_multiobjective_nas_tutorial.ipynb tutorial running on my local machine. I got as far as the experiment-running part without any problems, but when I start running the experiment, all the trials fail. I didn't change anything in the original notebook. This is the output:
I tried running it on Google Colab but got the same error.
File ~/anaconda3/envs/tpot/lib/python3.10/site-packages/ax/service/scheduler.py:999, in Scheduler.run_all_trials(self, timeout_hours, idle_callback)
992 if self.options.total_trials is None:
993 # NOTE: Capping on number of trials will likely be needed as fallback
994 # for most stopping criteria, so we ensure num_trials is specified.
995 raise ValueError( # pragma: no cover
996 "Please either specify num_trials in SchedulerOptions input "
997 "to the Scheduler or use run_n_trials instead of run_all_trials."
998 )
--> 999 for _ in self.run_trials_and_yield_results(
1000 max_trials=not_none(self.options.total_trials),
1001 timeout_hours=timeout_hours,
1002 idle_callback=idle_callback,
1003 ):
1004 pass
1005 return self.summarize_final_result()
File ~/anaconda3/envs/tpot/lib/python3.10/site-packages/ax/service/scheduler.py:854, in Scheduler.run_trials_and_yield_results(self, max_trials, ignore_global_stopping_strategy, timeout_hours, idle_callback)
849 n_remaining_to_run = max_trials
850 while (
851 not self.should_consider_optimization_complete()[0]
852 and n_remaining_to_run > 0
853 ):
--> 854 if self.should_abort_optimization():
855 yield self._abort_optimization(num_preexisting_trials=n_existing)
856 return
File ~/anaconda3/envs/tpot/lib/python3.10/site-packages/ax/service/scheduler.py:712, in Scheduler.should_abort_optimization(self)
707 """Checks whether this scheduler has reached some intertuption / abort
708 criterion, such as an overall optimization timeout, tolerated failure rate, etc.
709 """
710 # if failure rate is exceeded, raise an exception.
711 # this check should precede others to ensure it is not skipped.
--> 712 self.error_if_failure_rate_exceeded()
714 # if optimization is timed out, return True, else return False
715 timed_out = (
716 self._timeout_hours is not None
717 and self._latest_optimization_start_timestamp is not None
(...)
720 >= not_none(self._timeout_hours) * 60 * 60 * 1000
721 )
File ~/anaconda3/envs/tpot/lib/python3.10/site-packages/ax/service/scheduler.py:779, in Scheduler.error_if_failure_rate_exceeded(self, force_check)
771 if self._num_trials_bad_due_to_err > num_bad_in_scheduler / 2:
772 self.logger.warn(
773 "MetricFetchE INFO: Sweep aborted due to an exceeded error rate, "
774 "which was primarily caused by failure to fetch metrics. Please "
775 "check if anything could cause your metrics to be flakey or "
776 "broken."
777 )
--> 779 raise self._get_failure_rate_exceeded_error(
780 num_bad_in_scheduler=num_bad_in_scheduler,
781 num_ran_in_scheduler=num_ran_in_scheduler,
782 )
FailureRateExceededError: Failure rate exceeds the tolerated trial failure rate of 0.5 (at least 8 out of first 8 trials failed). Checks are triggered both at the end of a optimization and if at least 5 trials have failed.
What do you think might be the problem here? Thank you.
https://pytorch.org/tutorials/intermediate/ax_multiobjective_nas_tutorial.html
Hi,
I am trying to get the ax_multiobjective_nas_tutorial.ipynb tutorial running on my local machine. I got as far as the experiment-running part without any problems, but when I start running the experiment, all the trials fail. I didn't change anything in the original notebook. This is the output:
I tried running it on Google Colab but got the same error.
Full log:
FailureRateExceededError Traceback (most recent call last) Cell In[11], line 1 ----> 1 scheduler.run_all_trials()
File ~/anaconda3/envs/tpot/lib/python3.10/site-packages/ax/service/scheduler.py:999, in Scheduler.run_all_trials(self, timeout_hours, idle_callback) 992 if self.options.total_trials is None: 993 # NOTE: Capping on number of trials will likely be needed as fallback 994 # for most stopping criteria, so we ensure `num_trials` is specified. 995 raise ValueError( # pragma: no cover 996 "Please either specify `num_trials` in `SchedulerOptions` input " 997 "to the `Scheduler` or use `run_n_trials` instead of `run_all_trials`." 998 ) --> 999 for _ in self.run_trials_and_yield_results( 1000 max_trials=not_none(self.options.total_trials), 1001 timeout_hours=timeout_hours, 1002 idle_callback=idle_callback, 1003 ): 1004 pass 1005 return self.summarize_final_result()
File ~/anaconda3/envs/tpot/lib/python3.10/site-packages/ax/service/scheduler.py:854, in Scheduler.run_trials_and_yield_results(self, max_trials, ignore_global_stopping_strategy, timeout_hours, idle_callback) 849 n_remaining_to_run = max_trials 850 while ( 851 not self.should_consider_optimization_complete()[0] 852 and n_remaining_to_run > 0 853 ): --> 854 if self.should_abort_optimization(): 855 yield self._abort_optimization(num_preexisting_trials=n_existing) 856 return
File ~/anaconda3/envs/tpot/lib/python3.10/site-packages/ax/service/scheduler.py:712, in Scheduler.should_abort_optimization(self) 707 """Checks whether this scheduler has reached some intertuption / abort 708 criterion, such as an overall optimization timeout, tolerated failure rate, etc. 709 """ 710 # if failure rate is exceeded, raise an exception. 711 # this check should precede others to ensure it is not skipped. --> 712 self.error_if_failure_rate_exceeded() 714 # if optimization is timed out, return True, else return False 715 timed_out = ( 716 self._timeout_hours is not None 717 and self._latest_optimization_start_timestamp is not None (...) 720 >= not_none(self._timeout_hours) * 60 * 60 * 1000 721 )
File ~/anaconda3/envs/tpot/lib/python3.10/site-packages/ax/service/scheduler.py:779, in Scheduler.error_if_failure_rate_exceeded(self, force_check) 771 if self._num_trials_bad_due_to_err > num_bad_in_scheduler / 2: 772 self.logger.warn( 773 "MetricFetchE INFO: Sweep aborted due to an exceeded error rate, " 774 "which was primarily caused by failure to fetch metrics. Please " 775 "check if anything could cause your metrics to be flakey or " 776 "broken." 777 ) --> 779 raise self._get_failure_rate_exceeded_error( 780 num_bad_in_scheduler=num_bad_in_scheduler, 781 num_ran_in_scheduler=num_ran_in_scheduler, 782 )
FailureRateExceededError: Failure rate exceeds the tolerated trial failure rate of 0.5 (at least 8 out of first 8 trials failed). Checks are triggered both at the end of a optimization and if at least 5 trials have failed.
What do you think might be the problem here? Thank you.
Best, Emre