Training image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.6.0-gpu-py3
Instance type: ml.p3.2xlarge
checkpoint_local_path = "/state"
==================
Traceback (most recent call last):
File "main.py", line 543, in <module>
main()
File "main.py", line 181, in main
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
while not context.join():
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 119, in join
raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap
fn(i, *args)
File "/opt/ml/code/main.py", line 349, in main_worker
train(train_loader, model, criterion, optimizer, epoch, args)
File "/opt/ml/code/main.py", line 390, in train
for i, (images, target) in enumerate(train_loader):
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 363, in __next__
data = self._next_data()
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 989, in _next_data
return self._process_data(data)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1014, in _process_data
data.reraise()
File "/opt/conda/lib/python3.6/site-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
FileNotFoundError: Caught FileNotFoundError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 185, in _worker_loop
data = fetcher.fetch(index)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/conda/lib/python3.6/site-packages/torchvision/datasets/folder.py", line 139, in __getitem__
sample = self.transform(sample)
File "/opt/conda/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 61, in __call__
img = t(img)
File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 724, in _call_impl
result = hook(self, input)
File "/opt/conda/lib/python3.6/site-packages/smdebug/pytorch/hook.py", line 133, in forward_pre_hook
self._increment_step()
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 517, in _increment_step
self._write_state()
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 529, in _write_state
if self.state_store.is_checkpoint_updated():
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/state_store.py", line 99, in is_checkpoint_updated
timestamps = [os.path.getmtime(file) for file in checkpoint_files]
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/state_store.py", line 99, in <listcomp>
timestamps = [os.path.getmtime(file) for file in checkpoint_files]
File "/opt/conda/lib/python3.6/genericpath.py", line 55, in getmtime
return os.stat(filename).st_mtime
FileNotFoundError: [Errno 2] No such file or directory: '/state/metadata.json.sagemaker-uploaded'
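It seems like files get deleted between the time the list of files is created, `checkpoint_files = self._get_checkpoint_files_in_dir(self._checkpoint_dir)` (https://github.com/awslabs/sagemaker-debugger/blob/master/smdebug/core/state_store.py#L92), and the time their modification times are read, `timestamps = [os.path.getmtime(file) for file in checkpoint_files]` (https://github.com/awslabs/sagemaker-debugger/blob/master/smdebug/core/state_store.py#L99). In other words, this looks like a race: a file that was present when the directory was listed ('/state/metadata.json.sagemaker-uploaded') no longer exists by the time `os.path.getmtime` is called on it.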
==================
Training image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.6.0-gpu-py3
Instance type: ml.p3.2xlarge
checkpoint_local_path = "/state"
==================
Traceback (most recent call last):
File "main.py", line 543, in <module>
main()
File "main.py", line 181, in main
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
while not context.join():
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 119, in join
raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap
fn(i, *args)
File "/opt/ml/code/main.py", line 349, in main_worker
train(train_loader, model, criterion, optimizer, epoch, args)
File "/opt/ml/code/main.py", line 390, in train
for i, (images, target) in enumerate(train_loader):
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 363, in __next__
data = self._next_data()
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 989, in _next_data
return self._process_data(data)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1014, in _process_data
data.reraise()
File "/opt/conda/lib/python3.6/site-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
FileNotFoundError: Caught FileNotFoundError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 185, in _worker_loop
data = fetcher.fetch(index)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/opt/conda/lib/python3.6/site-packages/torchvision/datasets/folder.py", line 139, in __getitem__
sample = self.transform(sample)
File "/opt/conda/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 61, in __call__
img = t(img)
File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 724, in _call_impl
result = hook(self, input)
File "/opt/conda/lib/python3.6/site-packages/smdebug/pytorch/hook.py", line 133, in forward_pre_hook
self._increment_step()
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 517, in _increment_step
self._write_state()
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 529, in _write_state
if self.state_store.is_checkpoint_updated():
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/state_store.py", line 99, in is_checkpoint_updated
timestamps = [os.path.getmtime(file) for file in checkpoint_files]
File "/opt/conda/lib/python3.6/site-packages/smdebug/core/state_store.py", line 99, in <listcomp>
timestamps = [os.path.getmtime(file) for file in checkpoint_files]
File "/opt/conda/lib/python3.6/genericpath.py", line 55, in getmtime
return os.stat(filename).st_mtime
FileNotFoundError: [Errno 2] No such file or directory: '/state/metadata.json.sagemaker-uploaded'