[Open] stas00 opened this issue 3 years ago
Using PP with ZeRO stage 1 (z1) results in copious logs being dumped. Please consider the scale of 64, 256, or 512 GPUs.

https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/engine.py#L2035-L2037

I don't know why _copy_recovery_script gets called for each GPU despite the condition in the linked lines: it works correctly without PP, but under PP the condition seems to never be true.

The next logger line gets called for each GPU as well:

https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/state_dict_factory.py#L165

and then again here:

https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/state_dict_factory.py#L85

These logs make it very difficult to follow the progress of the training, since each dump is hundreds of lines long. They should be debug-level messages rather than printed by default, as they contribute nothing other than 1 line x n_gpus of log output.

To make the logs usable I had to silence these calls, and there were some in pipe.py as well. Overall, is there any chance that these could be made silent by default or made configurable by the user? This is the diff I applied:
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 5791b3f..539cb6a 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -2033,7 +2033,7 @@ class DeepSpeedEngine(Module):
script = "zero_to_fp32.py"
src = os.path.join(base_dir, "utils", script)
dst = os.path.join(save_path, script)
- logger.info(f"creating recovery script {dst}")
+ #logger.info(f"creating recovery script {dst}")
copyfile(src, dst)
# make executable
os.chmod(dst, os.stat(dst).st_mode | stat.S_IEXEC)
@@ -2046,7 +2046,7 @@ class DeepSpeedEngine(Module):
ds_version=version)
torch.save(zero_sd, zero_checkpoint_name)
self._copy_recovery_script(save_path)
- logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name))
+ #logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name))
def _zero3_consolidated_fp16_state_dict(self):
"""
diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py
index af4df2a..977ce59 100644
--- a/deepspeed/runtime/pipe/module.py
+++ b/deepspeed/runtime/pipe/module.py
@@ -577,10 +577,10 @@ class PipelineModule(nn.Module):
layer.load_state_dict(checkpoint)
- if self._grid.data_parallel_id == 0:
- logger.info(
- f'RANK={self.global_rank} Loaded layer={idx+self._local_start} file={load_path}'
- )
+ # if self._grid.data_parallel_id == 0:
+ # logger.info(
+ # f'RANK={self.global_rank} Loaded layer={idx+self._local_start} file={load_path}'
+ # )
self._synchronize_tied_weights()
diff --git a/deepspeed/runtime/state_dict_factory.py b/deepspeed/runtime/state_dict_factory.py
index 033b672..f5562e1 100755
--- a/deepspeed/runtime/state_dict_factory.py
+++ b/deepspeed/runtime/state_dict_factory.py
@@ -78,7 +78,7 @@ class SDLoaderBase(ABC):
merge_count = 1
if num_ckpt == mp_world_size:
assert os.path.exists(load_path)
- logger.info(f'rank: {mp_rank} loading checkpoint: {load_path}')
+ #logger.info(f'rank: {mp_rank} loading checkpoint: {load_path}')
sd = torch.load(load_path, map_location=lambda storage, loc: storage)
if quantize:
@@ -158,7 +158,7 @@ class SDLoaderBase(ABC):
return sd
def check_ckpt_list(self):
- logger.info(f'checkpoint file list: {self.ckpt_list}')
+ #logger.info(f'checkpoint file list: {self.ckpt_list}')
assert len(self.ckpt_list) > 0
sd = torch.load(self.ckpt_list[0], map_location=lambda storage, loc: storage)
Now it's again possible to monitor the progress of the training.
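A less invasive alternative to patching the installed files would be to silence DeepSpeed's INFO messages on all ranks except rank 0 from the user side. This is just a sketch, assuming DeepSpeed logs through the standard `logging.Logger` named "DeepSpeed" and that the launcher exports the `RANK` environment variable:

```python
import logging
import os

# Sketch of a user-side workaround (assumptions noted above): drop INFO-level
# DeepSpeed messages on every rank except global rank 0, so that each message
# is printed once instead of once per GPU.
if int(os.environ.get("RANK", "0")) != 0:
    logging.getLogger("DeepSpeed").setLevel(logging.WARNING)
```

This doesn't remove the lines that rank 0 itself prints, but it should cut the volume from n_gpus copies of each message down to one.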
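What I have in mind on the DeepSpeed side is something along these lines (a purely hypothetical sketch, not existing DeepSpeed API): emit these messages at DEBUG level and only from a single rank, so they no longer scale with the number of GPUs.

```python
import logging

import torch.distributed as dist

logger = logging.getLogger("DeepSpeed")


def log_once(msg: str, rank: int = 0, level: int = logging.DEBUG) -> None:
    """Hypothetical helper: emit `msg` from a single rank and at DEBUG level by
    default, so per-checkpoint chatter stays out of the normal training logs."""
    if not dist.is_initialized() or dist.get_rank() == rank:
        logger.log(level, msg)


# e.g. instead of:       logger.info(f'rank: {mp_rank} loading checkpoint: {load_path}')
# the loader could call: log_once(f'rank: {mp_rank} loading checkpoint: {load_path}')
```

The exact mechanism (DEBUG level, a rank filter, or a config flag) matters less than having these be quiet by default.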
Thank you!
@tjruwase