Hi Qikai,
Thank you for your kind support. My training does not converge either, and the loss does not decrease. I tried to prune both kinds of layers at once. Here are my prototxt files; please take a look.
Solver.prototxt:
test_iter: 1000
test_interval: 3000
base_lr: 0.003
lr_policy: "step"
gamma: 0.2
stepsize: 15000
display: 10
max_iter: 500000
momentum: 0.9
weight_decay: 0.0005
snapshot: 5000
snapshot_prefix: "/media/Demon/Hai/Dynamic-Network-Surgery/models/alexnet/alexnet_sparse"
solver_mode: GPU
train_val.prototxt:
name: "AlexNet"
layer {
  name: "data" type: "ImageData" top: "data" top: "label"
  include { phase: TRAIN }
  transform_param { mirror: true crop_size: 227 mean_value: 104 mean_value: 117 mean_value: 123 }
  image_data_param { source: "/media/Demon/Hai/Dynamic-Network-Surgery/examples/image_net_train_out.txt" new_height: 256 new_width: 256 batch_size: 256 shuffle: true }
}
layer {
  name: "data" type: "ImageData" top: "data" top: "label"
  include { phase: TEST }
  transform_param { mirror: false crop_size: 227 mean_value: 104 mean_value: 117 mean_value: 123 }
  image_data_param { source: "/media/Demon/Hai/Dynamic-Network-Surgery/examples/image_net_val_out.txt" new_height: 256 new_width: 256 batch_size: 50 shuffle: false }
}
layer {
  name: "conv1" type: "CConvolution" bottom: "data" top: "conv1"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param { num_output: 96 kernel_size: 11 stride: 4 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } }
  cconvolution_param { gamma: 0.0001 power: 1 c_rate: 18 iter_stop: 15000 weight_mask_filler { type: "constant" value: 1 } bias_mask_filler { type: "constant" value: 1 } }
}
layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" }
layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0001 beta: 0.75 } }
layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } }
layer {
  name: "conv2" type: "CConvolution" bottom: "pool1" top: "conv2"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param { num_output: 256 pad: 2 kernel_size: 5 group: 2 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0.1 } }
  cconvolution_param { gamma: 0.0001 power: 1 c_rate: 18 iter_stop: 15000 weight_mask_filler { type: "constant" value: 1 } bias_mask_filler { type: "constant" value: 1 } }
}
layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" }
layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0001 beta: 0.75 } }
layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } }
layer {
  name: "conv3" type: "CConvolution" bottom: "pool2" top: "conv3"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param { num_output: 384 pad: 1 kernel_size: 3 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } }
  cconvolution_param { gamma: 0.0001 power: 1 c_rate: 18 iter_stop: 15000 weight_mask_filler { type: "constant" value: 1 } bias_mask_filler { type: "constant" value: 1 } }
}
layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" }
layer {
  name: "conv4" type: "CConvolution" bottom: "conv3" top: "conv4"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param { num_output: 384 pad: 1 kernel_size: 3 group: 2 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0.1 } }
  cconvolution_param { gamma: 0.0001 power: 1 c_rate: 18 iter_stop: 15000 weight_mask_filler { type: "constant" value: 1 } bias_mask_filler { type: "constant" value: 1 } }
}
layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" }
layer {
  name: "conv5" type: "CConvolution" bottom: "conv4" top: "conv5"
  param { lr_mult: 1 decay_mult: 1 }
  param { lr_mult: 2 decay_mult: 0 }
  convolution_param { num_output: 256 pad: 1 kernel_size: 3 group: 2 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0.1 } }
  cconvolution_param { gamma: 0.0001 power: 1 c_rate: 18 iter_stop: 15000 weight_mask_filler { type: "constant" value: 1 } bias_mask_filler { type: "constant" value: 1 } }
}
layer {
name: "relu5"
type: "ReLU"
bottom: "conv5"
top: "conv5"
}
layer {
name: "pool5"
type: "Pooling"
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "fc6"
type: "CInnerProduct"
bottom: "pool5"
top: "fc6"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
cinner_product_param {
gamma: 0.0001
power: 1
c_rate: 18
iter_stop: 15000
weight_mask_filler {
type: "constant"
value: 1
}
bias_mask_filler {
type: "constant"
value: 1
}
}
}
layer {
name: "relu6"
type: "ReLU"
bottom: "fc6"
top: "fc6"
}
layer {
name: "drop6"
type: "Dropout"
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc7"
type: "CInnerProduct"
bottom: "fc6"
top: "fc7"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
cinner_product_param {
gamma: 0.0001
power: 1
c_rate: 18
iter_stop: 15000
weight_mask_filler {
type: "constant"
value: 1
}
bias_mask_filler {
type: "constant"
value: 1
}
}
}
layer {
name: "relu7"
type: "ReLU"
bottom: "fc7"
top: "fc7"
}
layer {
name: "drop7"
type: "Dropout"
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc8"
type: "CInnerProduct"
bottom: "fc7"
top: "fc8"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1000
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
cinner_product_param {
gamma: 0.0001
power: 1
c_rate: 18
iter_stop: 15000
weight_mask_filler {
type: "constant"
value: 1
}
bias_mask_filler {
type: "constant"
value: 1
}
}
}
layer {
name: "accuracy@1"
type: "Accuracy"
bottom: "fc8"
bottom: "label"
top: "accuracy@1"
include {
phase: TEST
}
accuracy_param {
top_k: 1
}
}
layer {
name: "accuracy@5"
type: "Accuracy"
bottom: "fc8"
bottom: "label"
top: "accuracy@5"
include {
phase: TEST
}
accuracy_param {
top_k: 5
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8"
bottom: "label"
top: "loss"
}
Hi @HaiPhan1991, I tried adjusting the hyperparameters and it worked to some degree. So I thought things were OK and I deleted some of my comments, but later it turned out training was still not converging. Here is what I tried (sketched below):

- Set c_rate to 1 (or 2); 18 is too large, I think.
- Set base_lr to a smaller value, like 1e-4 to 1e-7, when pruning.
- iter_stop should probably be equal to max_iter, I guess.
- I also trained both the conv layers and the ip layers at the same time, which might be a problem; the author trained the two kinds of layers separately.
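Concretely, the adjustments above look like this in the files (just a sketch; the exact numbers are examples picked from the ranges I mentioned, not tuned values):

# in solver.prototxt:
base_lr: 1e-5        # much smaller than 0.003 while pruning a pre-trained model
max_iter: 500000

# and in every cconvolution_param / cinner_product_param:
c_rate: 1            # instead of 18
iter_stop: 500000    # equal to max_iter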
Please let me know if you find a way to train the model. Thanks!
Right. Eighteen is definitely too large as a c_rate. You probably need to use different c_rates for different layers (I would say 0 to 5 would work in most cases) to obtain a good trade-off between model accuracy and compression rate. Also, to deal with the learning problem you've encountered, it is better to use a smaller c_rate in the first layer than in the later layers (a negative value is even possible), to avoid over-pruning and all-zero outputs of the network.
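For illustration only (the numbers below are made up, not tuned), such a per-layer assignment might look like:

layer {
  name: "conv1" type: "CConvolution"
  # bottom/top/param/convolution_param as in your original prototxt
  # small c_rate: prune the first layer gently
  cconvolution_param { gamma: 0.0001 power: 1 c_rate: 0 iter_stop: 15000 weight_mask_filler { type: "constant" value: 1 } bias_mask_filler { type: "constant" value: 1 } }
}
layer {
  name: "fc6" type: "CInnerProduct"
  # bottom/top/param/inner_product_param as in your original prototxt
  # larger c_rate deeper in the network, where there is more redundancy
  cinner_product_param { gamma: 0.0001 power: 1 c_rate: 3 iter_stop: 15000 weight_mask_filler { type: "constant" value: 1 } bias_mask_filler { type: "constant" value: 1 } }
}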
Hi, I tried pruning the first layer of AlexNet (c_rate = 2) with base_lr = 1e-7, and it seems to converge now. Thank you guys for the helpful advice. By the way, how can I post-process the model to remove the masks in W*T and reduce the storage size? Do you provide code to do that?
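To make the question concrete, here is the kind of pycaffe script I have in mind (just a rough sketch, not code from this repo; I am assuming each C-layer stores four parameter blobs in the order weights, bias, weight_mask, bias_mask -- please correct me if the layer code orders them differently -- and that I write a "dense" prototxt that is identical except CConvolution/CInnerProduct are replaced by plain Convolution/InnerProduct; all file names below are placeholders):

import caffe

caffe.set_mode_cpu()

# Sparse net: the pruned prototxt and weights from DNS training.
# Dense net: same architecture with standard layers and no mask blobs.
sparse = caffe.Net('train_val_sparse.prototxt', 'alexnet_sparse.caffemodel', caffe.TEST)
dense = caffe.Net('train_val_dense.prototxt', caffe.TEST)

for name, blobs in sparse.params.items():
    if len(blobs) == 4:  # pruned layer: fold the masks into the parameters
        dense.params[name][0].data[...] = blobs[0].data * blobs[2].data  # W * T_w
        dense.params[name][1].data[...] = blobs[1].data * blobs[3].data  # b * T_b
    else:                # ordinary layer: copy the blobs through unchanged
        for i, blob in enumerate(blobs):
            dense.params[name][i].data[...] = blob.data

dense.save('alexnet_pruned.caffemodel')

If my blob-order assumption is right, this would also explain the size increase I asked about below: the masks are full float blobs with the same shape as the weights, so dropping them roughly halves the file, and any further savings would need an actual sparse storage format.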
Hi, thanks for your great work. I tried pruning AlexNet trained on ImageNet (learning rate = 0.03). I used a compression rate of c_rate = 18 for both the convolution and fully connected layers and kept the other parameters the same as in the examples from your project. I see the size of the sparse model roughly doubles, from 233 MB to 438 MB. Could you help me understand why this happens? Thank you so much.