tornadomeet / mxnet-face

Using mxnet for face-related algorithm.
Apache License 2.0
544 stars 206 forks source link

Using Moon_Output to train vgg-face but can't work at all #16

Closed ghost closed 8 years ago

ghost commented 8 years ago

I want to implement the original Moon using the vgg-face net and your moon-loss function. It doesn't work: no error is reported, it just prints that training is done. I made the following changes based on your code: (1) I chose the cropped/aligned images from the CelebA database. (2) When generating the .rec file, I used resize=178 color=1 to resize and read RGB images. (3) I changed the weights of your loss function all to 1.0, to follow the original Moon. (4) I chose vgg-face and simply changed the output layer to the new Moon output.

I used the new Moon output to train your LCNN and it works well. But when I use it with my vgg-face net, there is no error — it just reports that training is done: [12:45:47] src/io/iter_image_recordio.cc:68: Loaded ImageList from /media/mt02/data1/dataset/CelebA/celeba_train.lst 162770 Image records [12:45:47] src/io/iter_image_recordio.cc:211: ImageRecordIOParser: ./celeba_train.rec, use 3 threads for decoding.. [12:45:47] src/io/iter_image_recordio.cc:68: Loaded ImageList from /media/mt02/data1/dataset/CelebA/celeba_val.lst 19867 Image records [12:45:47] src/io/iter_image_recordio.cc:211: ImageRecordIOParser: ./celeba_val.rec, use 3 threads for decoding.. INFO:root:Start training with [cpu(0)] Start training with [cpu(0)] trining done!

Attached is the training code I used:

`def norm_stat(d): return mx.nd.norm(d)/np.sqrt(d.size) mon = mx.mon.Monitor(10, norm_stat)

def get_symbol(num_classes = 40):

define alexnet

data = mx.symbol.Variable(name="data")
# group 1
conv1_1 = mx.symbol.Convolution(data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1")
relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1")
conv1_2 = mx.symbol.Convolution(data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2")
relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2")
pool1 = mx.symbol.Pooling(data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool1")
# group 2
conv2_1 = mx.symbol.Convolution(data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1")
relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1")
conv2_2 = mx.symbol.Convolution(data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2")
relu2_2 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_2")
pool2 = mx.symbol.Pooling(data=relu2_1, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool2")
# group 3
conv3_1 = mx.symbol.Convolution(data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1")
relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1")
conv3_2 = mx.symbol.Convolution(data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2")
relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2")
conv3_3 = mx.symbol.Convolution(data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3")
relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3")
pool3 = mx.symbol.Pooling(data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool3")
# group 4
conv4_1 = mx.symbol.Convolution(data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1")
relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1")
conv4_2 = mx.symbol.Convolution(data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2")
relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2")
conv4_3 = mx.symbol.Convolution(data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3")
relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3")
pool4 = mx.symbol.Pooling(data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool4")
# group 5
conv5_1 = mx.symbol.Convolution(data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1")
relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1")
conv5_2 = mx.symbol.Convolution(data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2")
relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2")
conv5_3 = mx.symbol.Convolution(data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3")
relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3")
pool5 = mx.symbol.Pooling(data=relu5_3, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool5")
# group 6
f1c6 = mx.symbol.FullyConnected(data=pool5, num_hidden=4096, name="f1c6",attr={'lr_mult':'0.1'})
relu6 = mx.symbol.Activation(data=f1c6, act_type="relu", name="relu6")
drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6")
# group 7
f1c7 = mx.symbol.FullyConnected(data=drop6, num_hidden=4096, name="f1c7",attr={'lr_mult':'0.1'})
relu7 = mx.symbol.Activation(data=f1c7, act_type="relu", name="relu7")
drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7")
# output
f1c8 = mx.symbol.FullyConnected(data=drop7, num_hidden=num_classes, name="f1c8",attr={'lr_mult':'0.1'})
moon = mx.symbol.MoonOutput(data=f1c8, src_dist_path='./src_dict.txt', name='Moon')
return moon

def main():

logging

if 'log_file' in args and args.log_file is not None:
    log_file = args.log_file
    log_dir = args.log_dir
    log_file_full_name = os.path.join(log_dir, log_file)
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    logger = logging.getLogger()
    handler = logging.FileHandler(log_file_full_name)
    logger.addHandler(handler)
    handler = logging.StreamHandler()
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    logger.info('start with arguments %s', args)
else:
    logging.basicConfig(level=logging.DEBUG, format=head)
    logging.info('start with arguments %s', args)
# symbol = lightened_moon(num_classes=40, use_fuse=False)
symbol = get_symbol(num_classes=40)
devs = mx.cpu() if args.gpus is None else [mx.gpu(int(i)) for i in args.gpus.split(',')]
epoch_size = args.num_examples / args.batch_size
checkpoint = mx.callback.do_checkpoint(args.model_save_prefix)
kv = mx.kvstore.create(args.kv_store)
arg_params = None
aux_params = None
if args.retrain:
    _, arg_params, aux_params = mx.model.load_checkpoint(args.model_load_prefix, args.model_load_epoch)
train = mx.io.ImageRecordIter(
    path_imglist = args.list_dir + 'celeba_train.lst',
    path_imgrec = args.data_dir + "celeba_train.rec",
    label_width = 40,
    data_name   = 'data',
    label_name  = 'Moon_label',
    data_shape  = (3, 178, 178),
    scale       = 1./255,
    batch_size  = args.batch_size,
    rand_crop   = True,
    rand_mirror = True,
    num_parts   = kv.num_workers,
    part_index  = kv.rank)
val = mx.io.ImageRecordIter(
    path_imglist = args.list_dir + 'celeba_val.lst',
    path_imgrec = args.data_dir + "celeba_val.rec",
    label_width = 40,
    data_name   = 'data',
    label_name  = 'Moon_label',
    batch_size  = args.batch_size,
    data_shape  = (3, 178, 178),
    scale       = 1./255,
    rand_crop   = False,
    rand_mirror = False,
    num_parts   = kv.num_workers,
    part_index  = kv.rank)
model = mx.model.FeedForward(
    ctx                = devs,
    symbol             = symbol,
    arg_params         = arg_params,
    aux_params         = aux_params,
    num_epoch          = 100,
    begin_epoch        = args.model_load_epoch,
    learning_rate      = args.lr,
    momentum           = 0.9,
    wd                 = 0.00001,
    lr_scheduler       = mx.lr_scheduler.FactorScheduler(step=4*max(int(epoch_size * 1), 1), factor=0.8, stop_factor_lr=1e-5),
    initializer        = mx.init.Xavier(factor_type="in", magnitude=2.34)
    )
model.fit(
    X                  = train,
    eval_data          = val,
    eval_metric        = ['multi_binary_acc'],
    kvstore            = kv,
    batch_end_callback = mx.callback.Speedometer(args.batch_size, 10),
    epoch_end_callback = checkpoint)
    # monitor            = mon)`