Media-Smart / vedadet

A single stage object detection toolbox based on PyTorch
Apache License 2.0
498 stars 128 forks source link

Training pauses while waiting for data when the training dataset grows to one hundred thousand images? #20

Closed wjgaas closed 3 years ago

wjgaas commented 3 years ago

2020-12-16 20:11:46,235 - vedadet - INFO - Epoch [1][1400/4732] lr: 0.00375, loss_cls: 0.2021, loss_bbox: 0.5325, loss_iou: 0.5654, loss: 1.2999 2020-12-16 20:12:33,962 - vedadet - INFO - Epoch [1][1500/4732] lr: 0.00375, loss_cls: 0.2406, loss_bbox: 0.6577, loss_iou: 0.6214, loss: 1.5197

File "tools/trainval.py", line 65, in main() File "tools/trainval.py", line 61, in main trainval(cfg, distributed, logger) File "/mnt/data2/code/vedadet2/vedadet/assembler/trainval.py", line 78, in trainval looper.start(cfg.max_epochs) File "/mnt/data2/code/vedadet2/vedacore/loopers/epoch_based_looper.py", line 29, in start self.epoch_loop(mode) File "/mnt/data2/code/vedadet2/vedacore/loopers/epoch_based_looper.py", line 15, in epoch_loop for idx, data in enumerate(dataloader): File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 363, in next data = self._next_data() File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 974, in _next_data idx, data = self._get_data() File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 941, in _get_data success, data = self._try_get_data() File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 779, in _try_get_data data = self._data_queue.get(timeout=timeout) File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/multiprocessing/queues.py", line 104, in get if not self._poll(timeout): File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/multiprocessing/connection.py", line 257, in poll return self._poll(timeout) File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/multiprocessing/connection.py", line 414, in _poll r = wait([self], timeout) File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/multiprocessing/connection.py", line 921, in wait ready = selector.select(timeout) File "/home/environment/anaconda2/envs/vedadet2/lib/python3.7/selectors.py", line 415, in select fd_event_list = self._selector.poll(timeout)

wjgaas commented 3 years ago

It seems to be a deadlock.

mileistone commented 3 years ago

You can modify the vedacore/loopers/epoch_based_looper.py file like the following, and have a try.

import time
from .base_looper import BaseLooper

class EpochBasedLooper(BaseLooper):
    """Looper that drives training/validation one full epoch at a time.

    For each epoch it iterates every configured mode's dataloader, firing
    the hook pool's ``before_*``/``after_*`` events around each iteration
    and each epoch.
    """

    # NOTE: the original redundant __init__ (which only forwarded identical
    # arguments to super()) has been removed; the inherited constructor with
    # the same signature is used instead.

    def epoch_loop(self, mode):
        """Run one full pass over the dataloader for *mode*.

        Fires ``before_{mode}_iter`` / ``after_{mode}_iter`` hooks around
        every batch and stores the engine output in ``self.cur_results``.
        """
        self.mode = mode
        dataloader = self.dataloaders[mode]
        engine = self.engines[mode]
        # Give DataLoader worker processes a moment to settle between
        # epochs; suggested as a mitigation for a deadlock observed at
        # epoch boundaries.
        time.sleep(2)
        for idx, data in enumerate(dataloader):
            self.hook_pool.fire(f'before_{mode}_iter', self)
            self.cur_results[mode] = engine(data)
            if mode == BaseLooper.TRAIN:
                self._iter += 1
            self._inner_iter = idx + 1
            self.hook_pool.fire(f'after_{mode}_iter', self)

    def start(self, max_epochs):
        """Run the main loop until ``max_epochs`` training epochs complete.

        If the looper is configured for validation only, the epoch counter
        is never advanced, so a single pass is performed and the loop exits
        via the explicit ``break``.
        """
        self.hook_pool.fire('before_run', self)
        while self.epoch < max_epochs:
            for mode in self.modes:
                mode = mode.lower()
                self.hook_pool.fire(f'before_{mode}_epoch', self)
                self.epoch_loop(mode)
                if mode == BaseLooper.TRAIN:
                    self._epoch += 1
                self.hook_pool.fire(f'after_{mode}_epoch', self)
            # Use BaseLooper.VAL for consistency with BaseLooper.TRAIN above
            # (same inherited attribute, consistent access style).
            if len(self.modes) == 1 and self.modes[0] == BaseLooper.VAL:
                break
wjgaas commented 3 years ago

You can modify the vedacore/loopers/epoch_based_looper.py file like the following, and have a try.

import time
from .base_looper import BaseLooper

class EpochBasedLooper(BaseLooper):
    """Epoch-driven looper: repeats per-mode epochs until ``max_epochs``."""

    def __init__(self, modes, dataloaders, engines, hook_pool, logger,
                 workdir):
        super().__init__(modes, dataloaders, engines, hook_pool, logger,
                         workdir)

    def epoch_loop(self, mode):
        """Sweep once over ``mode``'s dataloader, firing per-iter hooks."""
        self.mode = mode
        loader = self.dataloaders[mode]
        engine = self.engines[mode]
        time.sleep(2)  # Prevent possible deadlock during epoch transition
        seen = 0
        for data in loader:
            self.hook_pool.fire(f'before_{mode}_iter', self)
            self.cur_results[mode] = engine(data)
            if mode == BaseLooper.TRAIN:
                self._iter += 1
            seen += 1
            self._inner_iter = seen
            self.hook_pool.fire(f'after_{mode}_iter', self)

    def start(self, max_epochs):
        """Drive epochs until ``max_epochs``; a val-only setup runs once."""
        self.hook_pool.fire('before_run', self)
        while self.epoch < max_epochs:
            for raw_mode in self.modes:
                mode = raw_mode.lower()
                self.hook_pool.fire(f'before_{mode}_epoch', self)
                self.epoch_loop(mode)
                if mode == BaseLooper.TRAIN:
                    self._epoch += 1
                self.hook_pool.fire(f'after_{mode}_epoch', self)
            # Validation-only configurations never advance the epoch
            # counter, so bail out after the single pass.
            if len(self.modes) == 1 and self.modes[0] == EpochBasedLooper.VAL:
                break

This did not work for me; it still pauses at the same iteration after adding time.sleep(2). It is fine when there are fewer than 50,000 images, but with more images the deadlock occurs.

mileistone commented 3 years ago

You can modify the vedacore/loopers/epoch_based_looper.py file like the following, and have a try.

import time
from .base_looper import BaseLooper

class EpochBasedLooper(BaseLooper):
    """Runs training/validation epoch by epoch, firing lifecycle hooks."""

    def __init__(self, modes, dataloaders, engines, hook_pool, logger,
                 workdir):
        super().__init__(modes, dataloaders, engines, hook_pool, logger,
                         workdir)

    def epoch_loop(self, mode):
        """Consume every batch of ``mode``'s dataloader exactly once."""
        self.mode = mode
        engine = self.engines[mode]
        time.sleep(2)  # Prevent possible deadlock during epoch transition
        for batch_idx, batch in enumerate(self.dataloaders[mode]):
            self.hook_pool.fire(f'before_{mode}_iter', self)
            self.cur_results[mode] = engine(batch)
            if mode == BaseLooper.TRAIN:
                self._iter += 1
            self._inner_iter = batch_idx + 1
            self.hook_pool.fire(f'after_{mode}_iter', self)

    def start(self, max_epochs):
        """Loop over epochs until ``max_epochs``; run once when val-only."""
        self.hook_pool.fire('before_run', self)
        while self.epoch < max_epochs:
            for mode in self.modes:
                mode = mode.lower()
                self.hook_pool.fire(f'before_{mode}_epoch', self)
                self.epoch_loop(mode)
                if mode == BaseLooper.TRAIN:
                    self._epoch += 1
                self.hook_pool.fire(f'after_{mode}_epoch', self)
            # A single VAL mode never increments the epoch, so stop here.
            if len(self.modes) == 1 and self.modes[0] == EpochBasedLooper.VAL:
                break

This did not work for me; it still pauses at the same iteration after adding time.sleep(2). It is fine when there are fewer than 50,000 images, but with more images the deadlock occurs.

It seems strange; I can train RetinaNet on COCO, which contains approximately 120,000 images.

mike112223 commented 3 years ago

Hi @wjgaas, Can you show me your config?

wjgaas commented 3 years ago

Hi @wjgaas, Can you show me your config?

I did not change any other configuration in the tinaface config except the training image list.

mike112223 commented 3 years ago

Hi @wjgaas, Can you show me your config?

I did not change any other configuration in the tinaface config except the training image list.

I guess it may be stuck in the 'RandomSquareCrop' transform if some image has no ground-truth bbox. If that is indeed the case, you can override the '_filter_imgs' function in your dataset, as is done in 'coco.py'.