Open kshatadit opened 4 years ago
`pid` and `camid` are supposed to contain int rather than str; see https://kaiyangzhou.github.io/deep-person-reid/user_guide.html#use-your-own-dataset
You are right! I must have missed that while following the documentation. I changed all the strings to integers. However, after running the engine.run() block I still got the following error:
##### Evaluating new_dataset (source) #####
Extracting features from query set ...
---------------------------------------------------------------------------
Empty Traceback (most recent call last)
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\site-packages\torch\utils\data\dataloader.py in _try_get_batch(self, timeout)
510 try:
--> 511 data = self.data_queue.get(timeout=timeout)
512 return (True, data)
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\queue.py in get(self, block, timeout)
177 if remaining <= 0.0:
--> 178 raise Empty
179 self.not_empty.wait(remaining)
Empty:
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-11-c191e60e38d6> in <module>
4 eval_freq=10,
5 print_freq=10,
----> 6 test_only=True
7 )
~\Capstone\pytorchreid\deep-person-reid\torchreid\engine\engine.py in run(self, save_dir, max_epoch, start_epoch, print_freq, fixbase_epoch, open_layers, start_eval, eval_freq, test_only, dist_metric, normalize_feature, visrank, visrank_topk, use_metric_cuhk03, ranks, rerank)
178 use_metric_cuhk03=use_metric_cuhk03,
179 ranks=ranks,
--> 180 rerank=rerank
181 )
182 return
~\Capstone\pytorchreid\deep-person-reid\torchreid\engine\engine.py in test(self, epoch, dist_metric, normalize_feature, visrank, visrank_topk, save_dir, use_metric_cuhk03, ranks, rerank)
342 use_metric_cuhk03=use_metric_cuhk03,
343 ranks=ranks,
--> 344 rerank=rerank
345 )
346
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\site-packages\torch\autograd\grad_mode.py in decorate_no_grad(*args, **kwargs)
41 def decorate_no_grad(*args, **kwargs):
42 with self:
---> 43 return func(*args, **kwargs)
44 return decorate_no_grad
45
~\Capstone\pytorchreid\deep-person-reid\torchreid\engine\engine.py in _evaluate(self, epoch, dataset_name, query_loader, gallery_loader, dist_metric, normalize_feature, visrank, visrank_topk, save_dir, use_metric_cuhk03, ranks, rerank)
384
385 print('Extracting features from query set ...')
--> 386 qf, q_pids, q_camids = _feature_extraction(query_loader)
387 print('Done, obtained {}-by-{} matrix'.format(qf.size(0), qf.size(1)))
388
~\Capstone\pytorchreid\deep-person-reid\torchreid\engine\engine.py in _feature_extraction(data_loader)
367 def _feature_extraction(data_loader):
368 f_, pids_, camids_ = [], [], []
--> 369 for batch_idx, data in enumerate(data_loader):
370 imgs, pids, camids = self.parse_data_for_eval(data)
371 if self.use_gpu:
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
574 while True:
575 assert (not self.shutdown and self.batches_outstanding > 0)
--> 576 idx, batch = self._get_batch()
577 self.batches_outstanding -= 1
578 if idx != self.rcvd_idx:
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\site-packages\torch\utils\data\dataloader.py in _get_batch(self)
541 elif self.pin_memory:
542 while self.pin_memory_thread.is_alive():
--> 543 success, data = self._try_get_batch()
544 if success:
545 return data
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\site-packages\torch\utils\data\dataloader.py in _try_get_batch(self, timeout)
517 if not all(w.is_alive() for w in self.workers):
518 pids_str = ', '.join(str(w.pid) for w in self.workers if not w.is_alive())
--> 519 raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str))
520 if isinstance(e, queue.Empty):
521 return (False, None)
RuntimeError: DataLoader worker (pid(s) 3720, 7960, 14348, 1180) exited unexpectedly
This seems to be a data loader problem.
What do you see when you run this?
for batch in datamanager.train_loader:
imgs = batch[0]
pids = batch[1]
camids = batch[2]
print(imgs.shape, pids.shape, camids.shape)
print(len(batch))
break
When I do
for batch in datamanager.train_loader:
imgs = batch[0]
pids = batch[1]
camids = batch[2]
print(imgs.shape, pids.shape, camids.shape)
print(len(batch))
break
I get this
BrokenPipeError Traceback (most recent call last)
<ipython-input-5-35bdfd5f639f> in <module>
----> 1 for batch in datamanager.train_loader:
2 imgs = batch[0]
3 pids = batch[1]
4 camids = batch[2]
5 print(imgs.shape, pids.shape, camids.shape)
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\site-packages\torch\utils\data\dataloader.py in __iter__(self)
191
192 def __iter__(self):
--> 193 return _DataLoaderIter(self)
194
195 def __len__(self):
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
467 # before it starts, and __del__ tries to join but will get:
468 # AssertionError: can only join a started process.
--> 469 w.start()
470 self.index_queues.append(index_queue)
471 self.workers.append(w)
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\multiprocessing\process.py in start(self)
110 'daemonic processes are not allowed to have children'
111 _cleanup()
--> 112 self._popen = self._Popen(self)
113 self._sentinel = self._popen.sentinel
114 # Avoid a refcycle if the target function holds an indirect
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\multiprocessing\context.py in _Popen(process_obj)
221 @staticmethod
222 def _Popen(process_obj):
--> 223 return _default_context.get_context().Process._Popen(process_obj)
224
225 class DefaultContext(BaseContext):
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\multiprocessing\context.py in _Popen(process_obj)
320 def _Popen(process_obj):
321 from .popen_spawn_win32 import Popen
--> 322 return Popen(process_obj)
323
324 class SpawnContext(BaseContext):
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
87 try:
88 reduction.dump(prep_data, to_child)
---> 89 reduction.dump(process_obj, to_child)
90 finally:
91 set_spawning_popen(None)
c:\users\aditya kshatriya\appdata\local\conda\conda\envs\torchreid\lib\multiprocessing\reduction.py in dump(obj, file, protocol)
58 def dump(obj, file, protocol=None):
59 '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60 ForkingPickler(file, protocol).dump(obj)
61
62 #
BrokenPipeError: [Errno 32] Broken pipe
The same code works properly for the market1501 dataset though
Hmm, then it's clear that there is something wrong with your dataset.
It's hard for me to debug in this situation.
I'd suggest you check carefully whether you did something wrong somewhere, and have a look at https://github.com/KaiyangZhou/deep-person-reid/blob/master/torchreid/data/datasets/dataset.py#L12
Dear Kaiyang, thank you so much for sharing your repo; I appreciate all the effort you've put into it.
I have come across an issue that I am struggling to understand.
I have created a small dataset, meant only for testing purposes: 74 images in train with 5 camera ids, 10 images in gallery with 4 camera ids, and 2 images in query with 2 camera ids. All of these cover just 2 pids.
The process I followed was:
import sys
import os
import os.path as osp
from torchreid.data import ImageDataset
class NewDataset(ImageDataset):
    dataset_dir = 'newimages'
    def __init__(self, root='', **kwargs):
        self.root = osp.abspath(osp.expanduser(root))
        self.dataset_dir = osp.join(self.root, self.dataset_dir)
giving path to train dataset
torchreid.data.register_image_dataset('new_dataset', NewDataset)
datamanager = torchreid.data.ImageDataManager( root='deep-person-reid', sources='new_dataset' )
Loaded NewDataset
subset | # ids | # images | # cameras
train | 2 | 74 | 5 query | 2 | 2 | 2 gallery | 2 | 10 | 4
=> Loading test (target) dataset => Loaded NewDataset
subset | # ids | # images | # cameras
train | 2 | 74 | 5 query | 2 | 2 | 2 gallery | 2 | 10 | 4
**** Summary **** source : ['new_dataset1']
source datasets : 1
source ids : 2
source images : 74
source cameras : 5
target : ['new_dataset1']
model = torchreid.models.build_model( name='resnet50', num_classes=datamanager.num_train_pids, loss='softmax', pretrained=True )
model = model.cuda()
optimizer = torchreid.optim.build_optimizer( model, optim='adam', lr=0.0003 )
scheduler = torchreid.optim.build_lr_scheduler( optimizer, lr_scheduler='single_step', stepsize=20 )
engine = torchreid.engine.ImageSoftmaxEngine( datamanager, model, optimizer=optimizer, scheduler=scheduler, label_smooth=True )
engine.run( save_dir='log/resnet50', max_epoch=60, eval_freq=10, print_freq=10, test_only=False )
Start training Traceback (most recent call last): Traceback (most recent call last): Traceback (most recent call last): File "", line 1, in
Traceback (most recent call last):
File "", line 1, in
File "", line 1, in
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\multiprocessing\spawn.py", line 105, in spawn_main
File "", line 1, in
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\multiprocessing\spawn.py", line 105, in spawn_main
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\multiprocessing\spawn.py", line 105, in spawn_main
exitcode = _main(fd)
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\multiprocessing\spawn.py", line 105, in spawn_main
exitcode = _main(fd)
exitcode = _main(fd)
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\multiprocessing\spawn.py", line 115, in _main
exitcode = _main(fd)
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\multiprocessing\spawn.py", line 115, in _main
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\multiprocessing\spawn.py", line 115, in _main
self = reduction.pickle.load(from_parent)
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\multiprocessing\spawn.py", line 115, in _main
self = reduction.pickle.load(from_parent)
self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'NewDataset' on <module 'main' (built-in)>
self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'NewDataset' on <module 'main' (built-in)>
AttributeError: Can't get attribute 'NewDataset' on <module 'main' (built-in)>
AttributeError: Can't get attribute 'NewDataset' on <module 'main' (built-in)>
Traceback (most recent call last):
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\site-packages\torch\utils\data\dataloader.py", line 511, in _try_get_batch
data = self.data_queue.get(timeout=timeout)
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\queue.py", line 178, in get
raise Empty
_queue.Empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "", line 6, in
File "C:\Users\Aditya Kshatriya\Capstone\pytorchreid\deep-person-reid\torchreid\engine\engine.py", line 196, in run
open_layers=open_layers
File "C:\Users\Aditya Kshatriya\Capstone\pytorchreid\deep-person-reid\torchreid\engine\engine.py", line 250, in train
for self.batch_idx, data in enumerate(self.train_loader):
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\site-packages\torch\utils\data\dataloader.py", line 576, in next
idx, batch = self._get_batch()
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\site-packages\torch\utils\data\dataloader.py", line 543, in _get_batch
success, data = self._try_get_batch()
File "C:\Users\Aditya Kshatriya\AppData\Local\conda\conda\envs\torchreid\lib\site-packages\torch\utils\data\dataloader.py", line 519, in _try_get_batch
raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str))
RuntimeError: DataLoader worker (pid(s) 5920, 2788, 22620, 10548) exited unexpectedly