@burness where is the `train_ssd.py` coming from? Could you provide complete code to test with? What do you mean by your own VOCDetection dataset? Did you modify the dataset for your use case?
@vishaalkapoor could you please review this?
Yes, I converted my own data to VOC format. Here is my `train_ssd.py`:

```python
"""Train SSD"""
import argparse
import os
import logging
import time
import numpy as np
import mxnet as mx
from mxnet import nd
from mxnet import gluon
from mxnet import autograd
import gluoncv as gcv
from gluoncv import data as gdata
from gluoncv import utils as gutils
from gluoncv.model_zoo import get_model
from gluoncv.data.batchify import Tuple, Stack, Pad
from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform
from gluoncv.data.transforms.presets.ssd import SSDDefaultValTransform
from gluoncv.utils.metrics.voc_detection import VOC07MApMetric
from gluoncv.utils.metrics.coco_detection import COCODetectionMetric
from gluoncv.utils.metrics.accuracy import Accuracy
from own_voc_detection import VOCDetection

def parse_args():
    parser = argparse.ArgumentParser(description='Train SSD networks.')
    parser.add_argument('--network', type=str, default='vgg16_atrous',
                        help="Base network name which serves as feature extraction base.")
    parser.add_argument('--data-shape', type=int, default=300,
                        help="Input data shape, use 300, 512.")
    parser.add_argument('--batch-size', type=int, default=32,
                        help='Training mini-batch size')
    parser.add_argument('--dataset', type=str, default='voc',
                        help='Training dataset. Now support voc.')
    parser.add_argument('--num-workers', '-j', dest='num_workers', type=int,
                        default=4, help='Number of data workers, you can use a larger '
                        'number to accelerate data loading if your CPU and GPUs are powerful.')
    parser.add_argument('--gpus', type=str, default='0',
                        help='Training with GPUs, you can specify 1,3 for example.')
    parser.add_argument('--epochs', type=int, default=240,
                        help='Training epochs.')
    parser.add_argument('--resume', type=str, default='',
                        help='Resume from previously saved parameters if not None. '
                        'For example, you can resume from ./ssd_xxx_0123.params')
    parser.add_argument('--start-epoch', type=int, default=0,
                        help='Starting epoch for resuming, default is 0 for new training. '
                        'You can specify it to 100 for example to start from epoch 100.')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='Learning rate, default is 0.001')
    parser.add_argument('--lr-decay', type=float, default=0.1,
                        help='Decay rate of learning rate. Default is 0.1.')
    parser.add_argument('--lr-decay-epoch', type=str, default='160,200',
                        help='Epochs at which learning rate decays. Default is 160,200.')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='SGD momentum, default is 0.9')
    parser.add_argument('--wd', type=float, default=0.0005,
                        help='Weight decay, default is 5e-4')
    parser.add_argument('--log-interval', type=int, default=100,
                        help='Logging mini-batch interval. Default is 100.')
    parser.add_argument('--save-prefix', type=str, default='',
                        help='Saving parameter prefix')
    parser.add_argument('--save-interval', type=int, default=10,
                        help='Saving parameters epoch interval, best model will always be saved.')
    parser.add_argument('--val-interval', type=int, default=1,
                        help='Epoch interval for validation, increasing the number will reduce the '
                        'training time if validation is slow.')
    parser.add_argument('--seed', type=int, default=233,
                        help='Random seed to be fixed.')
    args = parser.parse_args()
    return args

def get_dataset(dataset, args):
    if dataset.lower() == 'voc':
        train_dataset = VOCDetection(
            splits=[(2007, 'trainval')])
        val_dataset = gdata.VOCDetection(
            splits=[(2007, 'test')])
        val_metric = VOC07MApMetric(iou_thresh=0.5, class_names=val_dataset.classes)
    elif dataset.lower() == 'coco':
        train_dataset = gdata.COCODetection(splits='instances_train2017')
        val_dataset = gdata.COCODetection(splits='instances_val2017', skip_empty=False)
        val_metric = COCODetectionMetric(
            val_dataset, args.save_prefix + '_eval', cleanup=True,
            data_shape=(args.data_shape, args.data_shape))
        # coco validation is slow, consider increasing the validation interval
        if args.val_interval == 1:
            args.val_interval = 10
    else:
        raise NotImplementedError('Dataset: {} not implemented.'.format(dataset))
    return train_dataset, val_dataset, val_metric

def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size, num_workers):
    """Get dataloader."""
    width, height = data_shape, data_shape
    # use fake data to generate fixed anchors for target generation
    with autograd.train_mode():
        _, _, anchors = net(mx.nd.zeros((1, 3, height, width)))
    batchify_fn = Tuple(Stack(), Stack(), Stack())  # stack image, cls_targets, box_targets
    train_loader = gluon.data.DataLoader(
        train_dataset.transform(SSDDefaultTrainTransform(width, height, anchors)),
        batch_size, True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)
    val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
    val_loader = gluon.data.DataLoader(
        val_dataset.transform(SSDDefaultValTransform(width, height)),
        batch_size, False, batchify_fn=val_batchify_fn, last_batch='keep', num_workers=num_workers)
    return train_loader, val_loader

def save_params(net, best_map, current_map, epoch, save_interval, prefix):
    current_map = float(current_map)
    if current_map > best_map[0]:
        best_map[0] = current_map
        net.save_params('{:s}_best.params'.format(prefix))
        with open(prefix + '_best_map.log', 'a') as f:
            f.write('\n{:04d}:\t{:.4f}'.format(epoch, current_map))
    if save_interval and epoch % save_interval == 0:
        net.save_params('{:s}_{:04d}_{:.4f}.params'.format(prefix, epoch, current_map))

def validate(net, val_data, ctx, eval_metric):
    """Test on validation dataset."""
    eval_metric.reset()
    # set nms threshold and topk constraint
    net.set_nms(nms_thresh=0.45, nms_topk=400)
    net.hybridize()
    for batch in val_data:
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        det_bboxes = []
        det_ids = []
        det_scores = []
        gt_bboxes = []
        gt_ids = []
        gt_difficults = []
        for x, y in zip(data, label):
            # get prediction results
            ids, scores, bboxes = net(x)
            det_ids.append(ids)
            det_scores.append(scores)
            # clip to image size
            det_bboxes.append(bboxes.clip(0, batch[0].shape[2]))
            # split ground truths
            gt_ids.append(y.slice_axis(axis=-1, begin=4, end=5))
            gt_bboxes.append(y.slice_axis(axis=-1, begin=0, end=4))
            gt_difficults.append(y.slice_axis(axis=-1, begin=5, end=6) if y.shape[-1] > 5 else None)
        # update metric
        eval_metric.update(det_bboxes, det_ids, det_scores, gt_bboxes, gt_ids, gt_difficults)
    return eval_metric.get()

def train(net, train_data, val_data, eval_metric, args):
    """Training pipeline"""
    net.collect_params().reset_ctx(ctx)
    trainer = gluon.Trainer(
        net.collect_params(), 'sgd',
        {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum})
    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)
            ce_metric.update(0, [l * batch_size for l in cls_loss])
            smoothl1_metric.update(0, [l * batch_size for l in box_loss])
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = ce_metric.get()
                name2, loss2 = smoothl1_metric.get()
                logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
                    epoch, i, batch_size / (time.time() - btic), name1, loss1, name2, loss2))
            btic = time.time()
        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()
        logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format(
            epoch, (time.time() - tic), name1, loss1, name2, loss2))
        if not (epoch + 1) % args.val_interval:
            # consider reducing the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)

if __name__ == '__main__':
    args = parse_args()
    # fix seed for mxnet, numpy and python builtin random generator.
    gutils.random.seed(args.seed)
    # training contexts
    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',') if i.strip()]
    ctx = ctx if ctx else [mx.cpu()]
    # network
    net_name = '_'.join(('ssd', str(args.data_shape), args.network, args.dataset))
    args.save_prefix += net_name
    net = get_model(net_name, pretrained_base=True)
    if args.resume.strip():
        net.load_params(args.resume.strip())
    else:
        for param in net.collect_params().values():
            if param._data is not None:
                continue
            param.initialize()
    # training data
    train_dataset, val_dataset, eval_metric = get_dataset(args.dataset, args)
    train_data, val_data = get_dataloader(
        net, train_dataset, val_dataset, args.data_shape, args.batch_size, args.num_workers)
    # training
    train(net, train_data, val_data, eval_metric, args)
```
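Before suspecting the loader, it can help to confirm the dataset itself is readable end to end. A minimal sketch, reusing `train_dataset` from the script above (the shape checks are illustrative):

```python
# Iterate the raw dataset in the main process, with no DataLoader and
# no worker processes. A hang or exception here points at the data
# itself rather than at the multiprocessing machinery.
for idx in range(len(train_dataset)):
    img, label = train_dataset[idx]
    # VOC-style labels are (N, >=5): xmin, ymin, xmax, ymax, cls_id, ...
    assert img.ndim == 3 and label.shape[-1] >= 5
```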
Hi Burness,
Apologies for the delay; there was a mix-up with my handle and another Vishaal K was tagged :)
I'm able to repro the situation you're seeing when building from source with the 1.2 branch.
Here are my instructions for reproducibility:
Observed apparent hanging after a failure to read an image file. When I Ctrl-C, I see the same stack trace:
```
Traceback (most recent call last):
  File "train_ssd.py", line 253, in <module>
    train(net, train_data, val_data, eval_metric, args)
  File "train_ssd.py", line 185, in train
    for i, batch in enumerate(train_data):
  File "/work/mxnet/python/mxnet/gluon/data/dataloader.py", line 222, in next
    return self.__next__()
  File "/work/mxnet/python/mxnet/gluon/data/dataloader.py", line 218, in __next__
    idx, batch = self._data_queue.get()
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 117, in get
    res = self._recv()
  File "/work/mxnet/python/mxnet/gluon/data/dataloader.py", line 87, in recv
    buf = self.recv_bytes()
```
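When the process appears stuck like this, one low-effort way to see where every thread is blocked, without killing the run, is Python 3's faulthandler module. A sketch; the signal and timeout here are arbitrary choices:

```python
import faulthandler
import signal

# Dump every thread's traceback when the process receives SIGUSR1,
# e.g. run `kill -USR1 <pid>` from another shell while training hangs.
faulthandler.register(signal.SIGUSR1)

# Or dump all stacks automatically after 10 minutes of apparent
# inactivity, so silent hangs become visible in the logs.
faulthandler.dump_traceback_later(timeout=600, repeat=True)
```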
I'll have to step through the code to understand things better; I'll update today. Thanks for your patience.
Vishaal
Hi @burness,
I noticed you converted the jpg files in the VOC dataset to png files, as the line below was modified from .jpg to .png:

```python
self._image_path = os.path.join('{}', 'JPEGImages', '{}.png')
```
I'm unable to repro the exact situation without knowing the dimensions of the png file (what convert command did you use to convert the jpg to png?).
Note: Previously, I was seeing the run hang because of a different issue: the jpg files weren't found because the wrong extension was used. This was causing an exception in __getitem__, and after all the threads stopped, the program would hang.
Thanks! Vishaal
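If wrong extensions are a suspect, one can verify every path the dataset will try to load before training starts. A sketch that leans on VOCDetection internals (`_items` and `_image_path` are private attributes and may differ across gluoncv versions):

```python
import os

# List every image file referenced by the VOC index files that does
# not exist on disk, which rules out the wrong-extension failure mode.
missing = [train_dataset._image_path.format(*img_id)
           for img_id in train_dataset._items
           if not os.path.exists(train_dataset._image_path.format(*img_id))]
print('{} missing image files'.format(len(missing)))
```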
Hi @vishaalkapoor, thanks for your help. I have not converted the jpeg data to png; I just formatted my own training data as VOC so I could reuse your API. I have tested the data reading, and it may not be the issue. As you can see in pic 1, I printed the image shape and content. I have checked the code, and I think the queue and multithreading may be causing the hang.
@vishaalkapoor And the training data was annotated with LabelImg.
Hi @burness, Are you able to repro the issue with a minimal example and post a list of instructions that I could follow including pointers to files to download? Perhaps an example involving one or two images? It will be difficult for me to repro otherwise. Many thanks! Vishaal
Hi @burness, could you try decreasing "--num-workers" and running the experiments again?
@vishaalkapoor, I am sorry, I have had no time to do it recently! I will do it as quickly as possible. @xcgoner Thanks, I will try it without multiple workers, but it will not be efficient.
@burness Understood. My concern is that using multiple workers in the dataloader can sometimes exhaust CPU resources and make the training get stuck. You can try setting this option to 1 or even 0 (0 triggers a single-threaded dataloader in a different mode) and check whether it still gets stuck, so that we can identify exactly what the problem is. By the way, I noticed a typo in your snapshot: it should be "num-workers" instead of "num-worker".
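For reference, switching to the single-process loader only changes the `num_workers` argument. A minimal sketch, reusing `train_dataset`, `batchify_fn`, `width`, `height`, and `anchors` from the script above:

```python
from mxnet import gluon

# num_workers=0 loads and transforms batches in the main process:
# no worker processes and no shared-memory queue, which isolates
# whether the hang originates in multiprocessing.
debug_loader = gluon.data.DataLoader(
    train_dataset.transform(SSDDefaultTrainTransform(width, height, anchors)),
    batch_size, shuffle=True, batchify_fn=batchify_fn,
    last_batch='rollover', num_workers=0)
```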
I met the same issue; any updates? Thanks.
@tuoyuxiang would you mind giving exact repro steps? I was having difficulty reproducing burness's experiment. What code are you running, or better yet, what is a minimal snippet of the code, and what is a pointer to the dataset, with steps for any transformations applied to it?
Thank you!
@tuoyuxiang maybe you could post the reproduction code; I have no time to reproduce it now. I may be able to do it in two or three weeks. Thanks!
See https://github.com/apache/incubator-mxnet/issues/11872
Your shared memory is full, so the program is stuck waiting for more data to train on.
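On Linux, the worker queue is backed by shared memory under /dev/shm, so a quick capacity check can confirm this diagnosis. A sketch using only the standard library (assumes a Unix host):

```python
import os

# The multiprocessing DataLoader passes batches through shared memory;
# if /dev/shm is tiny or full, workers can block indefinitely.
st = os.statvfs('/dev/shm')
free_mib = st.f_bavail * st.f_frsize / float(2 ** 20)
print('/dev/shm free: {:.1f} MiB'.format(free_mib))
```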
@burness I am closing this issue as it's related to a more general issue rather than SSD training specifically. If you cannot fix the problem following #11872, feel free to ping me.
Thanks for your help. I will try to fix this issue later.
Description
I ran `train_ssd.py` with my own VOC-format dataset, and the run hangs during data reading.
Steps to reproduce
I wrote my own VOCDetection (see the sketch after this section) and replaced the VOCDetection used in `train_ssd.py` with it.
What have you tried to solve it?
I stepped into `dataloader.py`, and it seems `self._data_buffer` is an empty dict; when `idx, batch = self._data_queue.get()` runs, the call blocks and never returns.
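The author's actual VOCDetection code was not captured in this thread. For orientation only, a hypothetical minimal subclass, with made-up class names and dataset root, would look roughly like this:

```python
from gluoncv.data import VOCDetection


class OwnVOCDetection(VOCDetection):
    """VOC-format detection dataset with a custom label set (illustrative)."""
    CLASSES = ('class_a', 'class_b')  # hypothetical class names

    def __init__(self, root='path/to/VOCdevkit',  # hypothetical dataset root
                 splits=((2007, 'trainval'),), **kwargs):
        super(OwnVOCDetection, self).__init__(root=root, splits=splits, **kwargs)
```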