Closed wjgaas closed 3 years ago
It seems to be a deadlock.
You can modify the vedacore/loopers/epoch_based_looper.py file as shown below and give it a try.
import time
from .base_looper import BaseLooper
class EpochBasedLooper(BaseLooper):
    """Looper that walks through every configured mode once per epoch.

    For each mode it fires the ``before_<mode>_epoch`` hook, iterates that
    mode's dataloader via :meth:`epoch_loop`, and fires ``after_<mode>_epoch``.
    """

    def __init__(self, modes, dataloaders, engines, hook_pool, logger,
                 workdir):
        """Pass all constructor arguments positionally to ``BaseLooper``."""
        super().__init__(modes, dataloaders, engines, hook_pool, logger,
                         workdir)

    def epoch_loop(self, mode):
        """Run one full pass over the dataloader registered for ``mode``.

        Args:
            mode: Key used to look up the dataloader and the engine; also
                interpolated into the ``before_<mode>_iter`` and
                ``after_<mode>_iter`` hook names.
        """
        self.mode = mode
        loader = self.dataloaders[mode]
        run_batch = self.engines[mode]

        # Prevent possible deadlock during epoch transition
        time.sleep(2)

        # Counting from 1 so the loop variable is the number of batches
        # already handled, which is what ``_inner_iter`` stores.
        for done, batch in enumerate(loader, start=1):
            self.hook_pool.fire(f'before_{mode}_iter', self)
            self.cur_results[mode] = run_batch(batch)
            # The global iteration counter only moves in train mode.
            if mode == BaseLooper.TRAIN:
                self._iter += 1
            self._inner_iter = done
            self.hook_pool.fire(f'after_{mode}_iter', self)

    def start(self, max_epochs):
        """Fire ``before_run`` and loop over the modes until ``max_epochs``.

        Args:
            max_epochs: The outer loop continues while ``self.epoch`` is
                below this value.
        """
        self.hook_pool.fire('before_run', self)
        while self.epoch < max_epochs:
            for configured_mode in self.modes:
                mode = configured_mode.lower()
                self.hook_pool.fire(f'before_{mode}_epoch', self)
                self.epoch_loop(mode)
                # Only a train pass advances the epoch counter.
                if mode == BaseLooper.TRAIN:
                    self._epoch += 1
                self.hook_pool.fire(f'after_{mode}_epoch', self)

            # A validation-only run stops after a single pass; presumably
            # the epoch counter would otherwise never reach ``max_epochs``.
            validation_only = (len(self.modes) == 1
                               and self.modes[0] == EpochBasedLooper.VAL)
            if validation_only:
                break
You can modify the vedacore/loopers/epoch_based_looper.py file as shown below and give it a try.
import time

from .base_looper import BaseLooper


class EpochBasedLooper(BaseLooper):
    """Looper that runs every configured mode once per epoch."""

    def __init__(self, modes, dataloaders, engines, hook_pool, logger,
                 workdir):
        # Nothing extra is set up here; all arguments go to BaseLooper.
        super().__init__(modes, dataloaders, engines, hook_pool, logger,
                         workdir)

    def epoch_loop(self, mode):
        """Iterate once over the dataloader registered for ``mode``."""
        self.mode = mode
        dataloader = self.dataloaders[mode]
        engine = self.engines[mode]
        time.sleep(2)  # Prevent possible deadlock during epoch transition
        for idx, data in enumerate(dataloader):
            self.hook_pool.fire(f'before_{mode}_iter', self)
            self.cur_results[mode] = engine(data)
            # The global iteration counter only advances in train mode.
            if mode == BaseLooper.TRAIN:
                self._iter += 1
            # Stored 1-based: number of batches handled in this pass.
            self._inner_iter = idx + 1
            self.hook_pool.fire(f'after_{mode}_iter', self)

    def start(self, max_epochs):
        """Fire ``before_run`` and loop over modes until ``max_epochs``."""
        self.hook_pool.fire('before_run', self)
        while self.epoch < max_epochs:
            for mode in self.modes:
                mode = mode.lower()
                self.hook_pool.fire(f'before_{mode}_epoch', self)
                self.epoch_loop(mode)
                # Only a train pass advances the epoch counter.
                if mode == BaseLooper.TRAIN:
                    self._epoch += 1
                self.hook_pool.fire(f'after_{mode}_epoch', self)
            # A validation-only run stops after a single pass.
            if len(self.modes) == 1 and self.modes[0] == EpochBasedLooper.VAL:
                break
This did not work for me; it still pauses at the same iteration after adding time.sleep(2). It is OK when there are fewer than 5w (50,000) images, but when I have more images, the deadlock occurs.
You can modify the vedacore/loopers/epoch_based_looper.py file as shown below and give it a try.
import time

from .base_looper import BaseLooper


class EpochBasedLooper(BaseLooper):
    """Looper that runs every configured mode once per epoch."""

    def __init__(self, modes, dataloaders, engines, hook_pool, logger,
                 workdir):
        # Nothing extra is set up here; all arguments go to BaseLooper.
        super().__init__(modes, dataloaders, engines, hook_pool, logger,
                         workdir)

    def epoch_loop(self, mode):
        """Iterate once over the dataloader registered for ``mode``."""
        self.mode = mode
        dataloader = self.dataloaders[mode]
        engine = self.engines[mode]
        time.sleep(2)  # Prevent possible deadlock during epoch transition
        for idx, data in enumerate(dataloader):
            self.hook_pool.fire(f'before_{mode}_iter', self)
            self.cur_results[mode] = engine(data)
            # The global iteration counter only advances in train mode.
            if mode == BaseLooper.TRAIN:
                self._iter += 1
            # Stored 1-based: number of batches handled in this pass.
            self._inner_iter = idx + 1
            self.hook_pool.fire(f'after_{mode}_iter', self)

    def start(self, max_epochs):
        """Fire ``before_run`` and loop over modes until ``max_epochs``."""
        self.hook_pool.fire('before_run', self)
        while self.epoch < max_epochs:
            for mode in self.modes:
                mode = mode.lower()
                self.hook_pool.fire(f'before_{mode}_epoch', self)
                self.epoch_loop(mode)
                # Only a train pass advances the epoch counter.
                if mode == BaseLooper.TRAIN:
                    self._epoch += 1
                self.hook_pool.fire(f'after_{mode}_epoch', self)
            # A validation-only run stops after a single pass.
            if len(self.modes) == 1 and self.modes[0] == EpochBasedLooper.VAL:
                break
This did not work for me; it still pauses at the same iteration after adding time.sleep(2). It is OK when there are fewer than 5w (50,000) images, but when I have more images, the deadlock occurs.
That seems strange; I can train RetinaNet on COCO, which contains approximately 12w (120,000) images.
Hi @wjgaas, Can you show me your config?
Hi @wjgaas, Can you show me your config?
I did not change any other configuration in TinaFace except the training image list.
Hi @wjgaas, Can you show me your config?
I did not change any other configuration in TinaFace except the training image list.
I guess it may be stuck in the 'RandomSquareCrop' transform if some image has no gt bbox. If that is indeed the case, you can override the '_filter_imgs' function in your dataset, as is done in 'coco.py'.
2020-12-16 20:11:46,235 - vedadet - INFO - Epoch [1][1400/4732] lr: 0.00375, loss_cls: 0.2021, loss_bbox: 0.5325, loss_iou: 0.5654, loss: 1.2999
2020-12-16 20:12:33,962 - vedadet - INFO - Epoch [1][1500/4732] lr: 0.00375, loss_cls: 0.2406, loss_bbox: 0.6577, loss_iou: 0.6214, loss: 1.5197
File "tools/trainval.py", line 65, in <module>
main()
File "tools/trainval.py", line 61, in main
trainval(cfg, distributed, logger)
File "/mnt/data2/code/vedadet2/vedadet/assembler/trainval.py", line 78, in trainval
looper.start(cfg.max_epochs)
File "/mnt/data2/code/vedadet2/vedacore/loopers/epoch_based_looper.py", line 29, in start
self.epoch_loop(mode)
File "/mnt/data2/code/vedadet2/vedacore/loopers/epoch_based_looper.py", line 15, in epoch_loop
for idx, data in enumerate(dataloader):
File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 363, in __next__
data = self._next_data()
File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 974, in _next_data
idx, data = self._get_data()
File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 941, in _get_data
success, data = self._try_get_data()
File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 779, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/multiprocessing/queues.py", line 104, in get
if not self._poll(timeout):
File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/multiprocessing/connection.py", line 257, in poll
return self._poll(timeout)
File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/multiprocessing/connection.py", line 414, in _poll
r = wait([self], timeout)
File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/multiprocessing/connection.py", line 921, in wait
ready = selector.select(timeout)
File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/selectors.py", line 415, in select
fd_event_list = self._selector.poll(timeout)