Closed muzishen closed 4 years ago
Which dataset are you trying to train on ? Did you change anything in the configuration file ? Could you paste your config file here ?
Thank you for your reply. I did not change anything in the configuration except the pretrained setting and the paths. The config file is as follows:
model = dict( type='CascadeRCNN', num_stages=3, pretrained=None, backbone=dict( type='HRNet', extra=dict( stage1=dict( num_modules=1, num_branches=1, block='BOTTLENECK', num_blocks=(4, ), num_channels=(64, )), stage2=dict( num_modules=1, num_branches=2, block='BASIC', num_blocks=(4, 4), num_channels=(32, 64)), stage3=dict( num_modules=4, num_branches=3, block='BASIC', num_blocks=(4, 4, 4), num_channels=(32, 64, 128)), stage4=dict( num_modules=3, num_branches=4, block='BASIC', num_blocks=(4, 4, 4, 4), num_channels=(32, 64, 128, 256)) ),
norm_eval=False,
),
neck=dict(
type='HRFPN',
in_channels=[32, 64, 128, 256],
out_channels=256),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_scales=[8],
anchor_ratios=[0.5, 1.0, 2.0],
anchor_strides=[4, 8, 16, 32, 64],
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(
type='RoIAlign',
out_size=7,
sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=[
dict(
type='SharedFCBBoxHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=2,
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2],
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss',
beta=1.0,
loss_weight=1.0)),
dict(
type='SharedFCBBoxHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=2,
target_means=[0., 0., 0., 0.],
target_stds=[0.05, 0.05, 0.1, 0.1],
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss',
beta=1.0,
loss_weight=1.0)),
dict(
type='SharedFCBBoxHead',
num_fcs=2,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=2,
target_means=[0., 0., 0., 0.],
target_stds=[0.033, 0.033, 0.067, 0.067],
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss',
beta=1.0,
loss_weight=1.0)),
],
mask_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
mask_head=dict(
type='FCNMaskHead',
num_convs=4,
in_channels=256,
conv_out_channels=256,
num_classes=2,
loss_mask=dict(
type='CrossEntropyLoss', use_mask=True, loss_weight=1))
)
train_cfg = dict(
    rpn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.7,
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            ignore_iof_thr=0.7),
        sampler=dict(
            type='RandomSampler',
            num=256,
            pos_fraction=0.5,
            neg_pos_ub=-1,
            add_gt_as_proposals=False),
        allowed_border=0,
        pos_weight=-1,
        debug=False),
    rpn_proposal=dict(
        nms_across_levels=False,
        nms_pre=2000,
        nms_post=2000,
        max_num=2000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=[
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                ignore_iof_thr=0.7),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False),
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.6,
                neg_iou_thr=0.6,
                min_pos_iou=0.6,
                ignore_iof_thr=0.7),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False),
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.7,
                min_pos_iou=0.7,
                ignore_iof_thr=0.7),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)
    ],
    stage_loss_weights=[1, 0.5, 0.25])
test_cfg = dict(
    rpn=dict(
        nms_across_levels=False,
        nms_pre=1000,
        nms_post=1000,
        max_num=1000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=dict(
        score_thr=0.001,
        nms=dict(type='soft_nms', iou_thr=0.5),
        max_per_img=100,
        mask_thr_binary=0.5),
    keep_all_stages=False)
dataset_type = 'CocoDataset'
data_root = 'datasets/CrowdHuman/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=1,
workers_per_gpu=5,
train=dict(
type=dataset_type,
# data_root + 'val.json'],
# img_prefix=[data_root + 'Images',
# data_root + 'Images_val'],
ann_file=data_root + 'train.json',
img_prefix=data_root + 'Images',
img_scale=[(1216, 608),(2048, 1024)],
multiscale_mode='range',
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=True,
with_crowd=True,
with_label=True,
extra_aug=dict(
photo_metric_distortion=dict(brightness_delta=180, contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5), hue_delta=18),
random_crop=dict(min_ious=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9), min_crop_size=0.1),
),
),
test=dict(
type=dataset_type,
ann_file=data_root + 'val.json',
img_prefix=data_root + 'Images_val',
img_scale=(2048, 1024),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_label=False,
test_mode=True))
mean_teacher = True
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2), mean_teacher=dict(alpha=0.999))
lr_config = dict(
    policy='cosine',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    step=[110, 160])
checkpoint_config = dict(interval=1)
log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'),
])
total_epochs = 240
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/cityperson_cascade_rcnn_hrnetv2p_w32'
load_from = None
resume_from = None
workflow = [('train', 1)]
Config seems fine and I cannot reproduce this issue locally. Now can you please paste your PyTorch etc. versions ?
My env as follows:
_libgcc_mutex=0.1=main _pytorch_select=0.2=gpu_0 blas=1.0=mkl ca-certificates=2020.1.1=0 certifi=2020.4.5.1=py37_0 cffi=1.14.0=py37h2e261b9_0 cudatoolkit=10.0.130=0 cudnn=7.6.5=cuda10.0_0 cycler=0.10.0=py37_0 cython=0.29.17=py37he6710b0_0 freetype=2.9.1=h8a8886c_1 intel-openmp=2020.0=166 jpeg=9b=h024ee3a_2 libedit=3.1.20181209=hc058e9b_0 libffi=3.2.1=hd88cf55_4 libgcc-ng=9.1.0=hdf63c60_0 libgfortran-ng=7.3.0=hdf63c60_0 libpng=1.6.37=hbc83047_0 libstdcxx-ng=9.1.0=hdf63c60_0 libtiff=4.1.0=h2733197_0 mkl=2020.0=166 mkl-service=2.3.0=py37he904b0f_0 mkl_fft=1.0.15=py37ha843d7b_0 mkl_random=1.1.0=py37hd6b4f25_0 ncurses=6.2=he6710b0_1 ninja=1.9.0=py37hfd86e86_0 numpy=1.18.1=py37h4f9e942_0 numpy-base=1.18.1=py37hde5b4d6_1 olefile=0.46=py_0 openssl=1.0.2u=h7b6447c_0 pillow=7.1.2=py37hb39fc2d_0 pip=20.0.2=py37_1 pycparser=2.20=py_0 pyparsing=2.4.7=py_0 python=3.7.0=h6e4f718_3 pytorch=1.2.0=cuda100py37h938c94c_0 readline=7.0=h7b6447c_5 setuptools=46.1.3=py37_0 six=1.14.0=py37_0 sqlite=3.31.1=h62c20be_1 tk=8.6.8=hbc83047_0 torchvision=0.4.0=cuda100py37hecfc37a_0 wheel=0.34.2=py37_0 xz=5.2.5=h7b6447c_0 zlib=1.2.11=h7b6447c_3 zstd=1.3.7=h0b5b093_0
Can you run the demo.py (using any of the pre-trained model) ?
I can run demo.py (using the CrowdHuman pre-trained model). I only want to train on CrowdHuman from scratch.
I cannot reproduce the error. I have just cloned the repo and tried training on CrowdHuman from scratch. It is running fine on my system (500 iterations so far). So, it can be either an issue in the installation (did it compile normally after running `python setup.py develop`?) or there is some incompatibility issue between the libs etc.
One more thing as a caveat: in case you are running the mmdetection framework as well for some other projects, make sure you have a separate environment for Pedestron. Since mmdetection has been upgraded to a newer version (mmdet and mmcv), at least in my case I faced some issues.
Thank you, there was no error during compilation. I have created an independent conda environment, and I do not run the mmdetection framework for any other project. Can you provide a full environment file? I also have a question: num_classes is set to 2 in the config file. Why is it not set to 3 (person, mask and background)?
# packages in environment at envs/Pedestron:
#
# Name Version Build Channel
_libgcc_mutex 0.1 main
addict 2.2.1 pypi_0 pypi
attrdict 2.0.1 pypi_0 pypi
blas 1.0 mkl
bzip2 1.0.6 h9a117a8_4
ca-certificates 2019.11.27 0
caffe2-cuda8.0-cudnn7 0.8.dev py36_2018.05.14 caffe2
cairo 1.14.12 h7636065_2
certifi 2019.11.28 py36_0
cffi 1.11.5 py36h9745a5d_0
chardet 3.0.4 pypi_0 pypi
cudatoolkit 9.0 h13b8566_0
cycler 0.10.0 py36h93f1223_0
cython 0.29.13 pypi_0 pypi
dbus 1.13.2 h714fa37_1
decorator 4.4.1 pypi_0 pypi
expat 2.2.5 he0dffb1_0
ffmpeg 3.4 h7264315_0
fontconfig 2.12.6 h49f89f6_0
freeglut 2.8.1 0 https://repo.continuum.io/pkgs/free
freetype 2.8 hab7d2ae_1
future 0.16.0 py36_1 https://repo.continuum.io/pkgs/free
gflags 2.2.1 hf484d3e_0
glib 2.56.1 h000015b_0
glog 0.3.5 hf484d3e_1
graphite2 1.3.11 hf63cedd_1
gst-plugins-base 1.14.0 hbbd80ab_1
gstreamer 1.14.0 hb453b48_1
h5py 2.8.0 py36hca9c191_0
harfbuzz 1.7.6 h5f0a787_1
hdf5 1.8.18 h6792536_1
icu 58.2 h9c2bf20_1
idna 2.8 pypi_0 pypi
imageio 2.6.1 pypi_0 pypi
intel-openmp 2018.0.0 8
jasper 2.0.14 h07fcdf6_0
jpeg 9b h024ee3a_2
kiwisolver 1.0.1 py36h764f252_0
libedit 3.1 heed3624_0
libffi 3.2.1 hd88cf55_4
libgcc-ng 7.2.0 hdf63c60_3
libgfortran-ng 7.2.0 hdf63c60_3
libglu 9.0.0 h0c0bdc1_1
libopus 1.2.1 hb9ed12e_0
libpng 1.6.34 hb9fc6fc_0
libprotobuf 3.5.2 h6f1eeef_0
libstdcxx-ng 7.2.0 hdf63c60_3
libtiff 4.0.9 h28f6b97_0
libvpx 1.6.1 h888fd40_0
libxcb 1.13 h1bed415_1
libxml2 2.9.8 hf84eae3_0
matplotlib 2.2.2 py36h0e671d2_1
mkl 2018.0.2 1
mkl_fft 1.0.1 py36h3010b51_0
mkl_random 1.0.1 py36h629b387_0
mmcv 0.2.14 pypi_0 pypi
mmdet 0.6.0+47c8e78 dev_0 <develop>
ncurses 6.0 h9df7e31_2
networkx 2.4 pypi_0 pypi
ninja 1.8.2 py36h6bb024c_1
numpy 1.14.3 py36hcd700cb_1
numpy-base 1.14.3 py36h9be14a7_1
olefile 0.45.1 py36_0
opencv 3.3.1 py36h9248ab4_2
opencv-python 4.1.1.26 pypi_0 pypi
openssl 1.0.2p h14c3975_0
pandas 0.23.4 py36h04863e7_0
patsy 0.5.1 pypi_0 pypi
pcre 8.42 h439df22_0
pillow 5.1.0 py36h3deb7b8_0
pip 19.2.3 pypi_0 pypi
pixman 0.34.0 hceecf20_3
protobuf 3.5.2 py36_0 conda-forge
psutil 5.4.6 py36h14c3975_0 anaconda
pycocotools 2.0 pypi_0 pypi
pycparser 2.18 py36hf9f622e_1
pyparsing 2.2.0 py36hee85983_1
pyqt 5.9.2 py36h751905a_0
python 3.6.5 hc3d631a_2
python-dateutil 2.7.2 py36_0
pytorch 1.1.0 py3.6_cuda9.0.176_cudnn7.5.1_0 pytorch
pytz 2018.4 py36_0
pywavelets 1.1.1 pypi_0 pypi
pyyaml 3.12 py36hafb9ca4_1
qt 5.9.5 h7e424d6_0
readline 7.0 ha6073c6_4
requests 2.22.0 pypi_0 pypi
scikit-image 0.16.2 pypi_0 pypi
scikit-learn 0.19.1 py36h7aa7ec6_0
scipy 1.1.0 py36hfc37229_0
seaborn 0.9.0 pyh91ea838_1
setuptools 39.1.0 py36_0
sip 4.19.8 py36hf484d3e_0
six 1.11.0 py36h372c433_1
sqlite 3.23.1 he433501_0
statsmodels 0.10.1 pypi_0 pypi
tbb 2018.0.4 h6bb024c_1 anaconda
tbb4py 2018.0.4 py36h6bb024c_1 anaconda
tensorboardx 1.4 pypi_0 pypi
tk 8.6.7 hc745277_3
torchvision 0.3.0 py36_cu9.0.176_1 pytorch
tornado 5.0.2 py36_0
tqdm 4.23.0 py36_0
urllib3 1.25.7 pypi_0 pypi
wheel 0.31.0 py36_0
xz 5.2.3 h5e939de_4
yaml 0.1.7 had09818_2
zlib 1.2.11 ha838bed_2
Lots of unwanted packages as well, ignore them.
Thank you, there was no error during compilation. I have created an independent conda environment, and I do not run the mmdetection framework for any other project. Can you provide a full environment file? I also have a question: num_classes is set to 2 in the config file. Why is it not set to 3 (person, mask and background)?
The reason is that Pedestron generates the masks on the fly (see #13). Therefore, we are not really interested in the masks as a separate class. You can change it if you want.
Thank you, I have copied your environment and training now runs successfully. How can I load your pre-trained model and continue training it on the CrowdHuman dataset? Also, can you tell me what mAP it achieves on the CrowdHuman dataset?
In the config file, you have `load_from = None`. Change `None` to the model you want to load from, for example, `load_from = 'my_path/epoch_1.pth'`. I think the published model has an AP around ~84.
When I tried to load your pre-trained model (epoch_19.pth.stu), it reported the following error:
OSError: epoch_19.pth is not a checkpoint file
Yes, you actually need a teacher model not a student one (not .stu but .pth). Can you try simply making a copy of epoch_19.pth.stu and name it epoch_19.pth and pass this .pth model. See if it works.
You are right! Thank you for your patient reply!
that's ok
@hasanirtiza
what is the difference between teacher model and student model?
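In the two models the weights are saved slightly differently. The teacher model comes from a method that smooths (averages) the weights during training, which turns out to be helpful. You can read more about it in this NIPS paper (presumably the Mean Teacher paper, "Mean teachers are better role models" by Tarvainen and Valpola, NIPS 2017, given the `mean_teacher` option in the config).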
What is the cause of this problem? How did you solve it? @muzishen
According to the author's reply: "Yes, you actually need a teacher model not a student one (not .stu but .pth). Can you try simply making a copy of epoch_19.pth.stu and name it epoch_19.pth and pass this .pth model. See if it works."
Sorry, I don't think I made myself clear. I mean this problem: "RuntimeError: Expected object of scalar type Byte but got scalar type Bool for argument #2 'other'" @muzishen
@yuanvq Actually this is an error that I have also reproduced locally. Most probably it is down to the environment (mmdet, mmcv torch etc), make sure versions are as suggested by the repo. Another issue is that, if you are also running mmdetection (newer versions) or any other repo that uses mmdetection and you do not have separate environments, you might also encounter this problem.
By the way did you manage to solve it ?
that's ok
@hasanirtiza I solved the problem with this modification. I think it could be a version problem. Thanks for your reply.
  File "/media/xx/xx/Pedestron/mmdet/models/anchor_heads/rpn_head.py", line 51, in loss
    gt_bboxes_ignore=gt_bboxes_ignore)
  File "/media/xx/xx/Pedestron/mmdet/core/fp16/decorators.py", line 127, in new_func
    return old_func(*args, **kwargs)
  File "/media/xx/xx/Pedestron/mmdet/models/anchor_heads/anchor_head.py", line 179, in loss
    sampling=self.sampling)
  File "/media/xx/xx/Pedestron/mmdet/core/anchor/anchor_target.py", line 63, in anchor_target
    unmap_outputs=unmap_outputs)
  File "/media/xx/xx/Pedestron/mmdet/core/utils/misc.py", line 24, in multi_apply
    return tuple(map(list, zip(*map_results)))
  File "/media/xx/xx/Pedestron/mmdet/core/anchor/anchor_target.py", line 108, in anchor_target_single
    cfg.allowed_border)
  File "/media/xx/xx/Pedestron/mmdet/core/anchor/anchor_target.py", line 176, in anchor_inside_flags
    (flat_anchors[:, 2] < img_w + allowed_border) & \
RuntimeError: Expected object of scalar type Byte but got scalar type Bool for argument #2 'other'