Breakend / experiment-impact-tracker

MIT License
273 stars 31 forks source link

Hard exit when using ray #64

Open Breakend opened 3 years ago

Breakend commented 3 years ago

When using Ray, there is a hard exit: we always see a stack trace printed because sys.exit is called on the worker nodes. Is there a way to exit more gracefully in these situations?

(pid=42763) experiment_impact_tracker.compute_tracker.ImpactTracker - ERROR - Encountered exception within power monitor thread!
(pid=42763) ERROR:Encountered exception within power monitor thread!
(pid=42763) INFO:Done - Logging final info.
(pid=42763) /u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/site-packages/experiment_impact_tracker/data_utils
.py:30: FutureWarning: pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead
(pid=42763)   return json_normalize(json_array, max_level=max_level), json_array
(pid=42763) experiment_impact_tracker.compute_tracker.ImpactTracker - ERROR -   File
"/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/site-packages/experiment_impact_tracker/compute_tracker.py",
line 161, in launch_power_monitor
(pid=42763)     _sample_and_log_power(log_dir, initial_info, logger=logger)
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 112, in _sample_and_log_power
(pid=42763)     log_dir=log_dir,
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3
.7/site-packages/experiment_impact_tracker/gpu/nvidia.py", line 127, in get_nvidia_gpu_power
(pid=42763)     out_str = sp.communicate()
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/subprocess.py", line 964, in communicate
(pid=42763)     stdout, stderr = self._communicate(input, endtime, timeout)
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/subprocess.py", line 1715, in _communicate
(pid=42763)     ready = selector.select(timeout)
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/selectors.py", line 415, in select
(pid=42763)     fd_event_list = self._selector.poll(timeout)
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/site-packages/ray/worker.py", line 392, in

sigterm_handler
(pid=42763)     sys.exit(1)
(pid=42763)
(pid=42763) ERROR:  File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 161, in launch_power_monitor
(pid=42763)     _sample_and_log_power(log_dir, initial_info, logger=logger)
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 112, in _sample_and_log_power
(pid=42763)     log_dir=log_dir,
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3
.7/site-packages/experiment_impact_tracker/gpu/nvidia.py", line 127, in get_nvidia_gpu_power
(pid=42763)     out_str = sp.communicate()
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/subprocess.py", line 964, in communicate
(pid=42763)     stdout, stderr = self._communicate(input, endtime, timeout)
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/subprocess.py", line 1715, in _communicate
(pid=42763)     ready = selector.select(timeout)
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/selectors.py", line 415, in select
(pid=42763)     fd_event_list = self._selector.poll(timeout)
(pid=42763)   File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/site-packages/ray/worker.py", line 392, in

sigterm_handler
(pid=42763)     sys.exit(1)
(pid=29659) experiment_impact_tracker.compute_tracker.ImpactTracker - ERROR - Encountered exception within power monitor thread!
(pid=29659) ERROR:Encountered exception within power monitor thread!
INFO:time to complete: 0:01:39.574842
(pid=29659) experiment_impact_tracker.compute_tracker.ImpactTracker - ERROR -   File
"/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3.7/site-packages/experiment_impact_tracker/compute_tracker.py",
line

161, in launch_power_monitor
(pid=29659)     _sample_and_log_power(log_dir, initial_info, logger=logger)
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 93, in _sample_and_log_power
(pid=29659)     required_headers = _get_compatible_data_headers(get_current_region_info_cached()[0])
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 182, in _get_compatible_data_headers
(pid=29659)     if not compatability_fn(region=region):
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/cpu/common.py", line 32, in is_cpu_freq_compatible
(pid=29659)     test = [x._asdict() for x in psutil.cpu_freq(percpu=True)]
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/__init__.py", line 1859, in cpu_freq
(pid=29659)     ret = _psplatform.cpu_freq()
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/_pslinux.py", line 742, in cpu_freq
(pid=29659)     curr = cat(pjoin(path, "scaling_cur_freq"), fallback=None)
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/_pslinux.py", line 293, in cat
(pid=29659)     return f.read().strip()
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3.7/site-packages/ray/worker.py", line 392, in
sigterm_handler
(pid=29659)     sys.exit(1)
(pid=29659)
(pid=29659) ERROR:  File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 161, in launch_power_monitor
(pid=29659)     _sample_and_log_power(log_dir, initial_info, logger=logger)
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 93, in _sample_and_log_power
(pid=29659)     required_headers = _get_compatible_data_headers(get_current_region_info_cached()[0])
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 182, in _get_compatible_data_headers
(pid=29659)     if not compatability_fn(region=region):
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/cpu/common.py", line 32, in is_cpu_freq_compatible
(pid=29659)     test = [x._asdict() for x in psutil.cpu_freq(percpu=True)]
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/__init__.py", line 1859, in cpu_freq
(pid=29659)     ret = _psplatform.cpu_freq()
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/_pslinux.py", line 742, in cpu_freq
(pid=29659)     curr = cat(pjoin(path, "scaling_cur_freq"), fallback=None)
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/_pslinux.py", line 293, in cat
(pid=29659)     return f.read().strip()
(pid=29659)   File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3.7/site-packages/ray/worker.py", line 392, in
sigterm_handler
(pid=29659)     sys.exit(1)

Code:

 # Wrap TestClass with ray.remote(num_cpus=1, num_gpus=num_gpus) and create the remote instance.
 remote_class = ray.remote(num_cpus=1, num_gpus=num_gpus)(
                TestClass
            ).remote()
            # Call TestClass.run on the remote instance with keyword arguments only.
            # NOTE(review): `output` is whatever `.remote()` returns -- presumably a Ray
            # object reference that the caller still has to resolve; confirm at the call site.
            output = remote_class.run.remote(
                model_path=model_path,
                dataset_path=data_path,
                train_batch_size=train_batch_size,
                run_stats=run_stats,
            )

class TestClass(object):
    """Runs one tracked training pass and reports its energy/carbon metrics."""

    # NOTE(review): the first parameter is named ``cls`` but ``run`` is not
    # decorated with ``@classmethod``; it receives the instance. The name is
    # kept unchanged so the signature stays exactly as callers know it.
    def run(cls, model_path: str, dataset_path, train_batch_size, run_stats):
        """
        Computes energy metrics for one training epoch.

        The model directory is copied into ``temp_model`` under the
        experiment's logging directory, loaded from that copy and trained
        inside an ``ImpactTracker`` context. The metrics are then read back
        through ``DataInterface``. The temporary copy is removed before
        returning, whether or not training succeeded.

        Args:
            model_path: Directory of the saved model; copied with
                ``shutil.copytree``.
            dataset_path: Forwarded to ``model.train`` as ``dataset``.
            train_batch_size: Accepted for interface compatibility; not used
                in this method.
            run_stats: Nested mapping; only
                ``run_stats["hyperopt_results"]["experiment_id"]`` is read,
                to name the logging directory.

        Returns:
            dict with the keys ``"kg_carbon"``, ``"total_power"``, ``"PUE"``
            and ``"duration_of_train_step"``, taken from the corresponding
            ``DataInterface`` attributes.

        Raises:
            KeyError: If ``run_stats`` lacks the expected keys.
            OSError: If the model directory cannot be copied (for example the
                ``temp_model`` destination already exists) or removed.
        """
        # First copy model_path to temp directory
        logging_path = os.path.join(
            ENERGY_LOGGING_DIR, run_stats["hyperopt_results"]["experiment_id"]
        )
        tempdir = os.path.join(logging_path, "temp_model")
        shutil.copytree(model_path, tempdir)

        # Everything after the copy runs under try/finally so the temporary
        # model directory is removed even when loading or training raises.
        # Otherwise a failed run leaves ``temp_model`` behind and the next
        # run for the same experiment id fails inside ``copytree``.
        try:
            model = AnonModel.load(tempdir)
            with ImpactTracker(logging_path):
                # The three-way unpacking is kept on purpose: it still
                # enforces that ``train`` returns exactly three values.
                # NOTE(review): ``training_set_metadat`` looks like a typo
                # for ``training_set_metadata`` -- confirm against the
                # model's ``train`` signature before renaming.
                _, _, _ = model.train(
                    dataset=dataset_path,
                    training_set_metadat=os.path.join(
                        tempdir, "training_set_metadata.json"
                    ),
                )
            # Read the metrics before the temporary directory is deleted,
            # matching the original statement order.
            data_interface = DataInterface([logging_path])
            carbon_output = {
                "kg_carbon": data_interface.kg_carbon,
                "total_power": data_interface.total_power,
                "PUE": data_interface.PUE,
                "duration_of_train_step": data_interface.exp_len_hours,
            }
        finally:
            shutil.rmtree(tempdir)
        return carbon_output